#include "buffer_manager.h"
#include "config.h"
#include "runtime.h"
#include "threading.h"

/* NOTE(review): the names of the system headers below were lost when this
 * file was flattened; they are reconstructed from what the code uses
 * (assert, bool, fixed-width integers, malloc/calloc/free). Confirm. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Bookkeeping for one pool of equally sized blocks. */
typedef struct rt_buffer_region_s {
    void *memory;       /* Backing storage the blocks are carved from */
    int16_t *refcounts; /* One per block */
    uint32_t *bitmap;   /* One bit per block; a set bit marks the block as in use */
    size_t block_count; /* Number of usable blocks in this region */
    rt_mutex *guard;    /* Locked around every access to bitmap/refcounts */
} rt_buffer_region;

/* Count leading zeroes.
 * Note that the return value of __builtin_clz(0) is undefined. */
#ifdef _MSC_VER
/* NOTE(review): header name reconstructed (see above); confirm. */
#include <intrin.h>
#define lzcnt32(x)  __lzcnt((x))
#define popcnt32(x) __popcnt((x))

static __forceinline uint32_t tzcnt32(uint32_t x) {
    unsigned long i = 0;
    _BitScanForward(&i, x);
    return (uint32_t)i;
}

static inline bool IsLZCNTSupported(void) {
#define Type 0x80000001
    int info[4];
    __cpuid(info, Type);
    return (info[2] & (1 << 5)) != 0;
#undef Type
}

#elif defined(__GNUC__)
#define lzcnt32(x)         __builtin_clz((x))
#define tzcnt32(x)         __builtin_ctz((x))
#define popcnt32(x)        __builtin_popcount((x))
#define IsLZCNTSupported() true
#endif

/* NOTE(Kevin): Keep these sorted! */
static size_t _block_sizes[] = {RT_KB(512), RT_MB(1), RT_MB(4), RT_MB(8)};
#define NUM_BLOCK_SIZES (sizeof(_block_sizes) / sizeof(_block_sizes[0]))

/* Returned by the run-search helpers when no suitable run exists. */
#define BLOCK_NOT_FOUND SIZE_MAX

/* One region per entry of _block_sizes, at the same index. */
static rt_buffer_region _regions[NUM_BLOCK_SIZES];

RT_CVAR_SZ(rt_BufferManagerMemory,
           "Total number of bytes allocated for the buffer manager. Default: 1GB",
           RT_GB(1));

/* Releases everything a region owns and resets it to the empty state, so that
 * releasing the same region twice is harmless. */
static void ReleaseRegion(rt_buffer_region *region) {
    if (region->guard)
        rtDestroyMutex(region->guard);
    free(region->memory);
    free(region->bitmap);
    free(region->refcounts);

    region->guard       = NULL;
    region->memory      = NULL;
    region->bitmap      = NULL;
    region->refcounts   = NULL;
    region->block_count = 0;
}

/* Number of blocks of the given size needed to hold `size` bytes.
 * Written without `size + block_size - 1` so it cannot wrap for huge sizes. */
static size_t BlocksForSize(size_t size, size_t block_size) {
    return size / block_size + ((size % block_size) != 0 ? 1 : 0);
}

/* Splits the configured memory evenly between the block sizes and sets up one
 * region (mutex, backing memory, bitmap, refcounts) per block size.
 *
 * Returns RT_SUCCESS, RT_BUFFER_MGR_MUTEX_CREATION_FAILED or
 * RT_BUFFER_MGR_OUT_OF_MEMORY. On failure every region, including those that
 * were already set up, is released again. */
rt_result InitBufferManager(void) {
    /* The casts keep the arguments in line with the %u conversions. */
    if ((rt_BufferManagerMemory.sz % NUM_BLOCK_SIZES) != 0)
        rtLog("BUFFERMGR",
              "Configured memory amount is not dividable by number of block "
              "sizes: %u MB/%u",
              (unsigned int)(rt_BufferManagerMemory.sz / (1024 * 1024)),
              (unsigned int)NUM_BLOCK_SIZES);

    rt_result result    = RT_SUCCESS;
    size_t mem_per_size = rt_BufferManagerMemory.sz / NUM_BLOCK_SIZES;

    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        rt_buffer_region *region = &_regions[i];

        if ((mem_per_size % _block_sizes[i]) != 0)
            rtLog("BUFFERMGR",
                  "Memory per block size is not dividable by block size: %u "
                  "MB/%u KB",
                  (unsigned int)(mem_per_size / (1024 * 1024)),
                  (unsigned int)(_block_sizes[i] / 1024));

        size_t block_count  = mem_per_size / _block_sizes[i];
        size_t dword_count  = (block_count + 31) / 32;
        region->block_count = block_count;

        region->guard = rtCreateMutex();
        if (!region->guard) {
            rtReportError("BUFFERMGR", "Failed to create guard mutex %u", i);
            result = RT_BUFFER_MGR_MUTEX_CREATION_FAILED;
            break;
        }

        /* Allocate in sequence; stop at the first failure. */
        region->memory = malloc(mem_per_size);
        region->bitmap =
            region->memory ? calloc(dword_count, sizeof *region->bitmap) : NULL;
        region->refcounts =
            region->bitmap ? calloc(block_count, sizeof *region->refcounts) : NULL;
        if (!region->refcounts) {
            rtReportError("BUFFERMGR", "Failed to allocate memory.");
            result = RT_BUFFER_MGR_OUT_OF_MEMORY;
            break;
        }

        /* The last bitmap word may cover more bits than there are blocks.
         * Mark the surplus bits as used so they are never handed out. */
        size_t tail_bits = block_count % 32;
        if (tail_bits != 0)
            region->bitmap[dword_count - 1] = ~((UINT32_C(1) << tail_bits) - 1);
    }

    if (result != RT_SUCCESS) {
        for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i)
            ReleaseRegion(&_regions[i]);
    }
    return result;
}

/* Releases all regions. */
void ShutdownBufferManager(void) {
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i)
        ReleaseRegion(&_regions[i]);
}

/* Finds and marks a run of `required_blocks` (1..31) free blocks.
 *
 * For each bitmap word, in order: an empty word is used from bit 0; otherwise
 * a run inside the word is tried (free high bits, then free low bits, then a
 * scan of the bits in between); finally a run made of the free high bits of
 * this word plus the free low bits of the next word is tried.
 *
 * The caller must hold region->guard.
 * Returns the index of the first block, or BLOCK_NOT_FOUND. */
static size_t ClaimSmallRun(rt_buffer_region *region, size_t required_blocks) {
    size_t dword_count   = (region->block_count + 31) / 32;
    uint32_t in_use_mask = (uint32_t)((1ull << required_blocks) - 1);
    size_t max_occupancy = 32 - required_blocks;

    for (size_t i = 0; i < dword_count; ++i) {
        uint32_t bits = region->bitmap[i];

        if (bits == 0) {
            /* All free */
            region->bitmap[i] = in_use_mask;
            return i * 32;
        }

        /* From here on bits != 0, so lzcnt32/tzcnt32 are well defined. */
        size_t high_blocks = (size_t)lzcnt32(bits);

        if ((size_t)popcnt32(bits) <= max_occupancy) {
            /* Enough free bits in this word; look for a contiguous run. */
            size_t low_blocks = (size_t)tzcnt32(bits);

            if (high_blocks >= required_blocks) {
                /* High blocks are free. first_free is in 1..31 here. */
                size_t first_free = 32 - high_blocks;
                region->bitmap[i] |= (in_use_mask << first_free);
                return i * 32 + first_free;
            }
            if (low_blocks >= required_blocks) {
                /* Low blocks are free */
                region->bitmap[i] |= in_use_mask;
                return i * 32;
            }
            /* Check if we can find a large enough range of free blocks.
             * Start after the first set bit. The position 32 - required_blocks
             * is the "high blocks" case that was already rejected above. */
            for (size_t j = low_blocks + 1; j < 32 - required_blocks; ++j) {
                if ((bits & (in_use_mask << j)) == 0) {
                    region->bitmap[i] |= (in_use_mask << j);
                    return i * 32 + j;
                }
            }
        }

        if (i + 1 < dword_count) {
            /* Check if we can use high blocks from this dword and low blocks
             * from the next one. high_blocks < required_blocks holds here:
             * either the word has fewer than required_blocks free bits, or the
             * "high blocks" check above failed. */
            uint32_t next_bits     = region->bitmap[i + 1];
            size_t next_low_blocks = (next_bits != 0) ? (size_t)tzcnt32(next_bits) : 32;

            if (high_blocks + next_low_blocks >= required_blocks) {
                size_t low_needed = required_blocks - high_blocks; /* 1..31 */
                /* Skip the shift when there are no free high bits; shifting a
                 * 32 bit value by 32 would be undefined. */
                if (high_blocks > 0)
                    region->bitmap[i] |= ~UINT32_C(0) << (32 - high_blocks);
                region->bitmap[i + 1] |= (UINT32_C(1) << low_needed) - 1;
                return i * 32 + (32 - high_blocks);
            }
        }
    }
    return BLOCK_NOT_FOUND;
}

/* Finds and marks the first run of `required_blocks` consecutive free blocks
 * using a linear scan over the bitmap. Used for runs of 32 blocks or more.
 *
 * The caller must hold region->guard.
 * Returns the index of the first block, or BLOCK_NOT_FOUND. */
static size_t ClaimLargeRun(rt_buffer_region *region, size_t required_blocks) {
    size_t run_start  = 0;
    size_t run_length = 0;

    for (size_t block = 0; block < region->block_count; ++block) {
        uint32_t bits = region->bitmap[block / 32];

        if ((block % 32) == 0 && bits == UINT32_MAX) {
            /* Whole word is in use: skip it (the loop adds the final +1). */
            run_length = 0;
            block += 31;
            continue;
        }
        if ((bits & (UINT32_C(1) << (block % 32))) != 0) {
            run_length = 0;
            continue;
        }

        if (run_length == 0)
            run_start = block;
        if (++run_length == required_blocks) {
            for (size_t b = run_start; b <= block; ++b)
                region->bitmap[b / 32] |= UINT32_C(1) << (b % 32);
            return run_start;
        }
    }
    return BLOCK_NOT_FOUND;
}

/* Allocates a buffer of at least `size` bytes from one of the regions.
 *
 * The block size is chosen as the largest one that does not exceed `size`
 * and reduces the number of blocks needed; the smallest block size is the
 * fallback. The blocks of the returned buffer are contiguous and start with
 * a reference count of 1.
 *
 * Returns NULL if `size` is 0 or if the chosen region has no sufficiently
 * long run of free blocks. */
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
    assert(IsLZCNTSupported());

    if (size == 0)
        return NULL;

    /* Determine the best block size to use */
    size_t required_blocks = BlocksForSize(size, _block_sizes[0]);
    size_t best_fit        = 0;
    for (size_t i = 1; i < NUM_BLOCK_SIZES; ++i) {
        size_t block_count = BlocksForSize(size, _block_sizes[i]);
        if (block_count < required_blocks && size >= _block_sizes[i]) {
            required_blocks = block_count;
            best_fit        = i;
        }
    }

    void *result             = NULL;
    rt_buffer_region *region = &_regions[best_fit];

    rtLockMutex(region->guard);
    /* Runs shorter than 32 blocks can use the word-at-a-time search. */
    size_t first_block = (required_blocks < 32)
                             ? ClaimSmallRun(region, required_blocks)
                             : ClaimLargeRun(region, required_blocks);
    if (first_block != BLOCK_NOT_FOUND) {
        for (size_t j = 0; j < required_blocks; ++j)
            region->refcounts[first_block + j] = 1;
        result = (char *)region->memory + first_block * _block_sizes[best_fit];
    }
    rtUnlockMutex(region->guard);

    return result;
}

/* Looks up the region that fully contains [begin, begin + size).
 *
 * On success stores the index of the block containing `begin` and the number
 * of blocks that `size` bytes round up to, and returns the region.
 * Returns NULL if `begin` is NULL or the range lies in no region. */
static rt_buffer_region *FindRegionForRange(const void *begin,
                                            size_t size,
                                            size_t *first_block,
                                            size_t *block_count) {
    if (!begin)
        return NULL;

    uintptr_t begin_addr = (uintptr_t)begin;
    for (size_t i = 0; i < NUM_BLOCK_SIZES; ++i) {
        uintptr_t region_addr = (uintptr_t)_regions[i].memory;
        size_t region_size    = _block_sizes[i] * _regions[i].block_count;

        if (begin_addr < region_addr)
            continue;
        /* Same test as begin + size <= region end, but cannot wrap around. */
        size_t offset = (size_t)(begin_addr - region_addr);
        if (offset > region_size || size > region_size - offset)
            continue;

        *first_block = offset / _block_sizes[i];
        *block_count = BlocksForSize(size, _block_sizes[i]);
        return &_regions[i];
    }
    return NULL;
}

/* Drops one reference from every block of the buffer and marks the blocks
 * whose count reaches zero as free. A NULL `begin` is ignored; a range that
 * lies in no region is logged. */
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
    if (!begin)
        return;

    size_t first_block       = 0;
    size_t block_count       = 0;
    rt_buffer_region *region = FindRegionForRange(begin, size, &first_block, &block_count);
    if (!region) {
        rtLog("BUFFERMGR", "Tried to release an invalid buffer");
        return;
    }

    rtLockMutex(region->guard);
    for (size_t j = 0; j < block_count; ++j) {
        size_t block = first_block + j;
        if (--region->refcounts[block] == 0)
            region->bitmap[block / 32] &= ~(UINT32_C(1) << (block % 32));
    }
    rtUnlockMutex(region->guard);
}

/* Adds one reference to every block of the buffer. A range that lies in no
 * region (including a NULL `begin`) is logged. */
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
    size_t first_block       = 0;
    size_t block_count       = 0;
    rt_buffer_region *region = FindRegionForRange(begin, size, &first_block, &block_count);
    if (!region) {
        rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
        return;
    }

    rtLockMutex(region->guard);
    for (size_t j = 0; j < block_count; ++j)
        ++region->refcounts[first_block + j];
    rtUnlockMutex(region->guard);
}