#include "buffer_manager.h"
#include "config.h"
#include "runtime.h"
#include "threading.h"

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Disabled: region descriptor used by the multi-region implementation that is
 * compiled out further below. */
#if 0
typedef struct rt_buffer_region_s {
    void *memory;
    int16_t *refcounts; // One per block
    uint32_t *bitmap;
    size_t block_count;
    rt_mutex *guard;
} rt_buffer_region;
#endif

/* Bit-scan helpers.
 *
 * lzcnt32: count leading zeroes.
 * tzcnt32: count trailing zeroes.
 * popcnt32: count set bits.
 *
 * Note that the return value of __builtin_clz(0) / __builtin_ctz(0) is
 * undefined, and _BitScanForward leaves its output unspecified for 0.
 * Callers must never pass 0 to lzcnt32 or tzcnt32. */
#ifdef _MSC_VER
#include <intrin.h>
#define lzcnt32(x) __lzcnt((x))
#define popcnt32(x) __popcnt((x))
static __forceinline uint32_t tzcnt32(uint32_t x) {
    unsigned long i;
    _BitScanForward(&i, x);
    return (uint32_t)i;
}
/* Queries CPUID leaf 0x80000001 and tests ECX bit 5. */
static __forceinline bool IsLZCNTSupported(void) {
#define Type 0x80000001
    int info[4];
    __cpuid(info, Type);
    return (info[2] & (1 << 5)) != 0;
#undef Type
}
#elif defined(__GNUC__)
#define lzcnt32(x) __builtin_clz((x))
#define tzcnt32(x) __builtin_ctz((x))
#define popcnt32(x) __builtin_popcount((x))
#define IsLZCNTSupported() true
#endif

/* Disabled (#if 0): multi-region implementation with several block sizes.
 * Not compiled; kept unchanged for reference. */
#if 0
/* NOTE(Kevin): Keep these sorted! */
static size_t _block_sizes[] = {RT_KB(512), RT_MB(1), RT_MB(4), RT_MB(8)};
#define NUM_BLOCK_SIZES (sizeof(_block_sizes) / sizeof(_block_sizes[0]))
static rt_buffer_region _regions[NUM_BLOCK_SIZES];

RT_CVAR_SZ(rt_BufferManagerMemory,
           "Total number of bytes allocated for the buffer manager. Default: 1GB",
           RT_GB(1));

rt_result InitBufferManager(void) {
    if ((rt_BufferManagerMemory.sz % NUM_BLOCK_SIZES) != 0)
        rtLog("BUFFERMGR",
              "Configured memory amount is not dividable by number of block "
              "sizes: %u MB/%u",
              rt_BufferManagerMemory.sz / (1024 * 1024),
              NUM_BLOCK_SIZES);

    size_t mem_per_size = rt_BufferManagerMemory.sz / NUM_BLOCK_SIZES;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        if ((mem_per_size % _block_sizes[i]) != 0)
            rtLog("BUFFERMGR",
                  "Memory per block size is not dividable by block size: %u "
                  "MB/%u KB",
                  mem_per_size / (1024 * 1024),
                  _block_sizes[i] / 1024);

        size_t block_count = mem_per_size / _block_sizes[i];
        _regions[i].block_count = block_count;
        _regions[i].guard = rtCreateMutex();
        if (!_regions[i].guard) {
            rtReportError("BUFFERMGR", "Failed to create guard mutex %u", i);
            return RT_BUFFER_MGR_MUTEX_CREATION_FAILED;
        }
        _regions[i].memory = malloc(mem_per_size);
        if (!_regions[i].memory) {
            rtDestroyMutex(_regions[i].guard);
            rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
        _regions[i].bitmap = calloc((block_count + 31) / 32, sizeof(uint32_t));
        if (!_regions[i].bitmap) {
            rtDestroyMutex(_regions[i].guard);
            free(_regions[i].memory);
            rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
        _regions[i].refcounts = calloc(block_count, sizeof(uint16_t));
        if (!_regions[i].refcounts) {
            rtDestroyMutex(_regions[i].guard);
            free(_regions[i].memory);
            free(_regions[i].bitmap);
            rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
    }
    return RT_SUCCESS;
}

void ShutdownBufferManager(void) {
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        rtDestroyMutex(_regions[i].guard);
        free(_regions[i].memory);
        free(_regions[i].bitmap);
        free(_regions[i].refcounts);
    }
}

RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
    assert(IsLZCNTSupported());

    // Determine the best block size to use
    size_t required_blocks = (size + _block_sizes[0] - 1) / _block_sizes[0];
    size_t best_fit = 0;
    for (size_t i = 1; i < NUM_BLOCK_SIZES; ++i) {
        size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
        if (block_count < required_blocks && size >= _block_sizes[i]) {
            required_blocks = block_count;
            best_fit = i;
        }
    }

    void *result = NULL;

    rt_buffer_region *region = &_regions[best_fit];
    rtLockMutex(region->guard);
    size_t dword_count = (region->block_count + 31) / 32;

    if (required_blocks < 32) {
        /* Fast path for allocations that potentially fit into one dword */
        uint32_t in_use_mask = (1ull << required_blocks) - 1;
        size_t max_occupancy = 32 - required_blocks;
        for (size_t i = 0; i < dword_count; ++i) {
            size_t block_index = 0;
            if (region->bitmap[i] != 0 && popcnt32(region->bitmap[i]) < max_occupancy) {
                size_t free_high_blocks = lzcnt32(region->bitmap[i]);
                if (free_high_blocks >= required_blocks) {
                    /* High blocks are free */
                    size_t first_free = 32 - free_high_blocks;
                    region->bitmap[i] |= (in_use_mask << first_free);
                    block_index = i * 32 + first_free;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                } else if (tzcnt32(region->bitmap[i]) >= required_blocks) {
                    /* Low blocks are free */
                    region->bitmap[i] |= in_use_mask;
                    block_index = i * 32;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                } else {
                    /* Check if we can find a large enough range of free blocks.
                     * Start after the first set bit.
                     */
                    for (uint32_t j = tzcnt32(region->bitmap[i]) + 1; j < 32 - required_blocks; ++j) {
                        if ((region->bitmap[i] & in_use_mask << j) == 0) {
                            region->bitmap[i] |= (in_use_mask << j);
                            block_index = i * 32 + j;
                            result = (char *)region->memory + block_index * _block_sizes[best_fit];
                            break;
                        }
                    }
                }
            } else if (region->bitmap[i] == 0) {
                /* All free */
                region->bitmap[i] = in_use_mask;
                block_index = i * 32;
                result = (char *)region->memory + block_index * _block_sizes[best_fit];
            } else if (i < dword_count - 1) {
                /* Check if we can use high blocks from this dword and low blocks from the next one */
                size_t high_blocks = lzcnt32(region->bitmap[i]);
                size_t low_blocks = (region->bitmap[i + 1] != 0) ? tzcnt32(region->bitmap[i + 1]) : 32;
                if (high_blocks + low_blocks >= required_blocks) {
                    size_t high_mask = (1u << high_blocks) - 1;
                    size_t first_free = 32 - high_blocks;
                    size_t low_mask = (1u << (required_blocks - high_blocks)) - 1;
                    region->bitmap[i] |= (high_mask << first_free);
                    region->bitmap[i + 1] |= low_mask;
                    block_index = i * 32 + first_free;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                }
            }
            if (result) {
                for (size_t j = 0; j < required_blocks; ++j)
                    region->refcounts[block_index + j] = 1;
                break;
            }
        }
    } else {
        for (size_t i = 0; i < dword_count; ++i) {
            if (region->bitmap[i] == UINT32_MAX) {
                continue;
            }
            /* Check if we can start the allocation here */
        }
    }
    rtUnlockMutex(region->guard);
    return result;
}

RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
    if (!begin)
        return;
    uintptr_t begin_addr = (uintptr_t)begin;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        uintptr_t region_addr = (uintptr_t)_regions[i].memory;
        size_t region_size = _block_sizes[i] * _regions[i].block_count;
        if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
            size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
            size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
            rtLockMutex(_regions[i].guard);
            for (size_t j = 0; j < block_count; ++j) {
                size_t dword = (first_block + j) / 32;
                size_t bit = (first_block + j) % 32;
                if (--_regions[i].refcounts[first_block + j] == 0)
                    _regions[i].bitmap[dword] &= ~(1u << bit);
            }
            rtUnlockMutex(_regions[i].guard);
            return;
        }
    }
    rtLog("BUFFERMGR", "Tried to release an invalid buffer");
}

RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
    uintptr_t begin_addr = (uintptr_t)begin;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        uintptr_t region_addr = (uintptr_t)_regions[i].memory;
        size_t region_size = _block_sizes[i] * _regions[i].block_count;
        if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
            size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
            size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
            rtLockMutex(_regions[i].guard);
            for (size_t j = 0; j < block_count; ++j) {
                ++_regions[i].refcounts[first_block + j];
            }
            rtUnlockMutex(_regions[i].guard);
            return;
        }
    }
    rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
}
#endif

/* Size of one allocation block in bytes. */
#define BLOCK_SIZE 4096u

/* Sentinel returned by FindFreeRun when no sufficiently long run exists. */
#define NO_FREE_RUN SIZE_MAX

/* All state below is created by InitBufferManager and torn down by
 * ShutdownBufferManager. _bitmap and _refcounts live in the same allocation
 * as _memory, directly behind the block storage. */
static uint32_t *_refcounts; /* One counter per block */
static uint32_t *_bitmap;    /* One bit per block; a set bit marks a block as in use */
static char *_memory;        /* Block storage: _block_count * BLOCK_SIZE bytes */
static rt_mutex *_guard;     /* Locked around every access to _bitmap and _refcounts */
static size_t _block_count;  /* Number of blocks in _memory */

RT_CVAR_I(rt_BufferMemoryBudget,
          "The amount of memory to allocate for the buffer manager. Default: 512MB",
          RT_MB(512));

/* Number of blocks needed to hold `size` bytes (rounded up).
 * Written without `size + BLOCK_SIZE - 1` so it cannot overflow. */
static size_t BlocksForSize(size_t size) {
    return size / BLOCK_SIZE + ((size % BLOCK_SIZE) != 0 ? 1u : 0u);
}

/* Searches the bitmap for the first run of `count` consecutive free blocks.
 *
 * Must be called with _guard held and with count > 0.
 * Returns the index of the first block of the run, or NO_FREE_RUN. */
static size_t FindFreeRun(size_t count) {
    size_t run_start = 0;
    size_t run_length = 0;
    size_t block = 0;

    while (block < _block_count) {
        unsigned int bit = (unsigned int)(block % 32);
        /* Shift so that bit 0 of `rest` corresponds to `block`. */
        uint32_t rest = _bitmap[block / 32] >> bit;

        /* Blocks left in this dword, clamped to the end of the block range. */
        size_t avail = 32u - bit;
        if (avail > _block_count - block)
            avail = _block_count - block;

        /* Free blocks starting at `block` within this dword.
         * tzcnt32(0) is undefined, so the all-free case is handled separately. */
        size_t free_here = (rest == 0) ? avail : (size_t)tzcnt32(rest);
        if (free_here > avail)
            free_here = avail;

        if (free_here > 0) {
            if (run_length == 0)
                run_start = block;
            run_length += free_here;
            if (run_length >= count)
                return run_start;
            /* Re-evaluate at the new position: either the run continues in the
             * next dword, or the next block is in use and the run is reset. */
            block += free_here;
            continue;
        }

        /* `block` is in use: the current run is broken. Skip all consecutive
         * used blocks in this dword. Bit 0 of `inverted` is clear here, so the
         * skip distance is at least 1 and the loop always makes progress. */
        run_length = 0;
        uint32_t inverted = ~rest;
        block += (inverted == 0) ? 32u : (size_t)tzcnt32(inverted);
    }

    return NO_FREE_RUN;
}

/* Translates a (pointer, size) pair into a block range and validates it.
 *
 * Returns false if `begin` is NULL, the manager holds no memory, or the range
 * does not lie completely inside the managed block storage. On success,
 * *first_block receives the index of the block containing `begin` and
 * *block_count the number of blocks covering `size` bytes. */
static bool ResolveBlockRange(const void *begin,
                              size_t size,
                              size_t *first_block,
                              size_t *block_count) {
    if (!begin || !_memory)
        return false;

    uintptr_t base = (uintptr_t)_memory;
    uintptr_t addr = (uintptr_t)begin;
    if (addr < base)
        return false;

    size_t first = (size_t)(addr - base) / BLOCK_SIZE;
    if (first >= _block_count)
        return false;

    size_t count = BlocksForSize(size);
    if (count > _block_count - first)
        return false;

    *first_block = first;
    *block_count = count;
    return true;
}

/* Creates the buffer manager state from the rt_BufferMemoryBudget cvar.
 *
 * Allocates one chunk that holds the block storage followed by the usage
 * bitmap and the per-block reference counts.
 *
 * Returns RT_SUCCESS on success, RT_OUT_OF_MEMORY if the allocation fails and
 * RT_UNKNOWN_ERROR for every other failure. No resources stay allocated when
 * an error is returned. */
extern rt_result InitBufferManager(void) {
    /* NOTE(review): the free-run search only needs tzcnt32. The check is kept
     * so that init behaves as before and lzcnt32 stays safe to use. */
    if (!IsLZCNTSupported()) {
        rtReportError("BUFFERMGR", "The required lzcnt intrinisc is not supported.");
        return RT_UNKNOWN_ERROR;
    }

    /* Reject non-positive values before the cast: a negative budget would
     * otherwise turn into a huge size_t. */
    size_t budget = (size_t)rt_BufferMemoryBudget.i;
    if (rt_BufferMemoryBudget.i <= 0 || budget < BLOCK_SIZE) {
        rtReportError("BUFFERMGR",
                      "The configured buffer memory budget must be at least one block (4KB).");
        return RT_UNKNOWN_ERROR;
    }

    size_t block_count = budget / BLOCK_SIZE;
    if ((budget % BLOCK_SIZE) != 0) {
        rtLog("BUFFERMGR",
              "The configured buffer memory budget %zu is not dividable by the block size (4KB).",
              budget);
    }

    /* Only whole blocks are usable. Rounding down also keeps the bitmap, which
     * is placed directly behind the blocks, aligned for uint32_t access. */
    size_t usable_bytes = block_count * (size_t)BLOCK_SIZE;
    size_t dword_count = (block_count + 31) / 32;
    size_t bookkeeping_bytes = (dword_count + block_count) * sizeof(uint32_t);
    if (bookkeeping_bytes > SIZE_MAX - usable_bytes) {
        rtReportError("BUFFERMGR", "The configured buffer memory budget is too large.");
        return RT_OUT_OF_MEMORY;
    }

    _guard = rtCreateMutex();
    if (!_guard) {
        rtReportError("BUFFERMGR", "Failed to create the buffer manager mutex.");
        return RT_UNKNOWN_ERROR;
    }

    _memory = malloc(usable_bytes + bookkeeping_bytes);
    if (!_memory) {
        rtReportError("BUFFERMGR", "Failed to allocate the buffer manager memory.");
        rtDestroyMutex(_guard);
        _guard = NULL;
        return RT_OUT_OF_MEMORY;
    }

    _block_count = block_count;
    _bitmap = (uint32_t *)(_memory + usable_bytes);
    _refcounts = _bitmap + dword_count;
    /* Bitmap and refcounts are contiguous: clear both at once. */
    memset(_bitmap, 0, bookkeeping_bytes);

    return RT_SUCCESS;
}

/* Releases everything InitBufferManager created. All buffers handed out by
 * rtAllocBuffer become invalid. */
extern void ShutdownBufferManager(void) {
    if (_guard) {
        rtDestroyMutex(_guard);
        _guard = NULL;
    }
    free(_memory);
    _memory = NULL;
    _bitmap = NULL;
    _refcounts = NULL;
    _block_count = 0;
}

/* Public API */

/* Allocates a buffer of at least `size` bytes.
 *
 * The buffer consists of consecutive blocks of BLOCK_SIZE bytes; every block
 * starts with a reference count of 1.
 *
 * Returns a pointer to the buffer, or NULL if `size` is 0, exceeds the managed
 * memory, or no sufficiently long run of free blocks exists. */
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
    size_t alloc_blocks = BlocksForSize(size);
    if (alloc_blocks == 0 || alloc_blocks > _block_count)
        return NULL;

    void *result = NULL;

    rtLockMutex(_guard);
    size_t first_block = FindFreeRun(alloc_blocks);
    if (first_block != NO_FREE_RUN) {
        for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
            _bitmap[i / 32] |= (uint32_t)1 << (i % 32);
            _refcounts[i] = 1;
        }
        result = _memory + first_block * BLOCK_SIZE;
    }
    rtUnlockMutex(_guard);

    /* Cast so that the argument type matches the %llx conversion. */
    rtLog("BUFFERMGR", "Result ptr %llx", (unsigned long long)(uintptr_t)result);
    return result;
}

/* Drops one reference from every block covered by [begin, begin + size).
 *
 * Blocks whose reference count reaches 0 are marked as free. A NULL `begin`
 * is ignored; a range outside the managed memory is logged and ignored. */
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
    if (!begin)
        return;

    size_t first_block = 0;
    size_t alloc_blocks = 0;
    if (!ResolveBlockRange(begin, size, &first_block, &alloc_blocks)) {
        rtLog("BUFFERMGR", "Tried to release an invalid buffer");
        return;
    }

    bool released_free_block = false;

    rtLockMutex(_guard);
    for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
        if (_refcounts[i] == 0) {
            /* Decrementing would wrap around to UINT32_MAX. */
            released_free_block = true;
            continue;
        }
        if (--_refcounts[i] == 0)
            _bitmap[i / 32] &= ~((uint32_t)1 << (i % 32));
    }
    rtUnlockMutex(_guard);

    if (released_free_block)
        rtLog("BUFFERMGR", "Tried to release a buffer block that is not allocated");
}

/* Adds one reference to every block covered by [begin, begin + size).
 *
 * A range outside the managed memory is logged and ignored. */
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
    size_t first_block = 0;
    size_t alloc_blocks = 0;
    if (!ResolveBlockRange(begin, size, &first_block, &alloc_blocks)) {
        rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
        return;
    }

    rtLockMutex(_guard);
    for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
        ++_refcounts[i];
    }
    rtUnlockMutex(_guard);
}