#include "gpu.h" #include "gpu_sync.h" #include "swapchain.h" #include "runtime/atomics.h" #include "runtime/config.h" #include "runtime/handles.h" #include "runtime/mem_arena.h" #include "runtime/runtime.h" #include "gfx/renderer_api.h" #include RT_CVAR_I(rt_VkMaxCommandPools, "Maximum number of command pools that can be created. Default: 32", 32); RT_CVAR_I( rt_VkCommandBufferRingBufferSize, "Size of the ring buffer used to store command buffers. Must be a power of two! Default: 512", 512); typedef struct { VkCommandPool pools[RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT * 3]; uint32_t distinct_pool_count; VkCommandPool *compute_pools; VkCommandPool *graphics_pools; VkCommandPool *transfer_pools; } rt_thread_pools; typedef struct { VkCommandBuffer command_buffer; uint32_t version; rt_gpu_queue target_queue; } rt_command_buffer; static rt_thread_pools *_pools; static uint32_t _next_pools; static RT_THREAD_LOCAL unsigned int t_first_pool; static rt_command_buffer *_command_buffers; /* We let this overflow on its own. Use MOD rt_VkCommandBufferRingBufferSize to get the actual * index. */ static uint32_t _next_command_buffer; rt_result InitCommandBufferManagement(void) { _pools = calloc((size_t)rt_VkMaxCommandPools.i, sizeof(rt_thread_pools)); if (!_pools) return RT_OUT_OF_MEMORY; _command_buffers = calloc((size_t)rt_VkCommandBufferRingBufferSize.i, sizeof(rt_command_buffer)); if (!_command_buffers) { free(_pools); return RT_OUT_OF_MEMORY; } /* We keep 0 free as a "Not initialized" value for t_first_pool. * The atomicinc used to acquire a pool returns the incremented value, so 0 is never returned. */ _next_pools = 0; return RT_SUCCESS; } static void DestroyPools(rt_thread_pools *pools) { for (uint32_t j = 0; j < pools->distinct_pool_count; ++j) vkDestroyCommandPool(g_gpu.device, pools->pools[j], g_gpu.alloc_cb); free(_pools); } void ShutdownCommandBufferManagement(void) { /* _next_pools is the number of existing pools */ for (uint32_t i = 1; i < _next_pools; ++i) { DestroyPools(&_pools[i]); } } void rtResetCommandPools(unsigned int frame_id) { unsigned int pool_idx = frame_id % g_gpu.max_frames_in_flight; for (uint32_t i = 1; i < _next_pools; ++i) { if (vkResetCommandPool(g_gpu.device, _pools[i].graphics_pools[pool_idx], VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) { rtLog("vk", "Failed to reset graphics pool slot %u index %u", i, pool_idx); } if (_pools[i].compute_pools != _pools[i].graphics_pools) { if (vkResetCommandPool(g_gpu.device, _pools[i].compute_pools[pool_idx], VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) { rtLog("vk", "Failed to reset compute pool slot %u index %u", i, pool_idx); } } if (_pools[i].transfer_pools != _pools[i].graphics_pools && _pools[i].transfer_pools != _pools[i].compute_pools) { if (vkResetCommandPool(g_gpu.device, _pools[i].transfer_pools[pool_idx], VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) { rtLog("vk", "Failed to reset transfer pool slot %u index %u", i, pool_idx); } } } } static rt_result CreatePools(rt_thread_pools *pools) { /* Graphics pools */ pools->graphics_pools = pools->pools; pools->distinct_pool_count = 0; VkCommandPoolCreateInfo graphics_info = {.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .queueFamilyIndex = g_gpu.graphics_family, .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT}; for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) { if (vkCreateCommandPool(g_gpu.device, &graphics_info, g_gpu.alloc_cb, &pools->graphics_pools[i]) != VK_SUCCESS) { rtLog("vk", "Failed to create a graphics command 
pool."); DestroyPools(pools); return RT_UNKNOWN_ERROR; } ++pools->distinct_pool_count; } if (g_gpu.compute_family != g_gpu.graphics_family) { VkCommandPoolCreateInfo compute_info = { .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .queueFamilyIndex = g_gpu.compute_family, .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, }; pools->compute_pools = &pools->pools[pools->distinct_pool_count]; for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) { if (vkCreateCommandPool(g_gpu.device, &compute_info, g_gpu.alloc_cb, &pools->compute_pools[i]) != VK_SUCCESS) { rtLog("vk", "Failed to create a compute command pool."); DestroyPools(pools); return RT_UNKNOWN_ERROR; } ++pools->distinct_pool_count; } } else { pools->compute_pools = pools->graphics_pools; } if (g_gpu.transfer_family != g_gpu.graphics_family && g_gpu.transfer_family != g_gpu.compute_family) { VkCommandPoolCreateInfo transfer_info = { .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .queueFamilyIndex = g_gpu.transfer_family, .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, }; pools->transfer_pools = &pools->pools[pools->distinct_pool_count]; for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) { if (vkCreateCommandPool(g_gpu.device, &transfer_info, g_gpu.alloc_cb, &pools->transfer_pools[i]) != VK_SUCCESS) { rtLog("vk", "Failed to create a transfer command pool."); DestroyPools(pools); return RT_UNKNOWN_ERROR; } ++pools->distinct_pool_count; } } else if (g_gpu.transfer_family == g_gpu.graphics_family) { pools->transfer_pools = pools->graphics_pools; } else if (g_gpu.transfer_family == g_gpu.compute_family) { pools->transfer_pools = pools->compute_pools; } return RT_SUCCESS; } rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count, const rt_alloc_command_buffer_info *info, rt_command_buffer_handle *p_command_buffers) { rt_thread_pools *pools = &_pools[t_first_pool]; if (t_first_pool == 0) { /* Acquire pools */ t_first_pool = rtAtomic32Inc(&_next_pools); RT_ASSERT((int)t_first_pool < rt_VkMaxCommandPools.i, "Too many command pools created."); pools = &_pools[t_first_pool]; rt_result create_res = CreatePools(pools); if (create_res != RT_SUCCESS) return create_res; } if ((int)t_first_pool >= rt_VkMaxCommandPools.i) return RT_OUT_OF_MEMORY; uint32_t frame_id = g_gpu.current_frame_id % g_gpu.max_frames_in_flight; rt_result result = RT_SUCCESS; /* TODO: We should probably batch allocations of the same type */ uint32_t mod = (uint32_t)rt_VkCommandBufferRingBufferSize.i; uint32_t start = rtAtomic32FetchAdd(&_next_command_buffer, count); for (uint32_t i = 0; i < count; ++i) { uint32_t slot = (start + i) % mod; _command_buffers[slot].version = (_command_buffers[slot].version + 1) % RT_RENDER_BACKEND_HANDLE_MAX_VERSION; if (_command_buffers[slot].version == 0) _command_buffers[slot].version = 1; VkCommandPool pool = pools->graphics_pools[frame_id]; if (info[i].target_queue == RT_COMPUTE_QUEUE) pool = pools->compute_pools[frame_id]; else if (info[i].target_queue == RT_TRANSFER_QUEUE) pool = pools->transfer_pools[frame_id]; _command_buffers[slot].target_queue = info[i].target_queue; VkCommandBufferAllocateInfo alloc_info = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, .commandBufferCount = 1, .commandPool = pool, }; if (vkAllocateCommandBuffers(g_gpu.device, &alloc_info, &_command_buffers[slot].command_buffer) != VK_SUCCESS) { result = RT_UNKNOWN_ERROR; break; } VkCommandBufferBeginInfo begin_info = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, .flags = 
rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
                                                  const rt_alloc_command_buffer_info *info,
                                                  rt_command_buffer_handle *p_command_buffers) {
    rt_thread_pools *pools = &_pools[t_first_pool];
    if (t_first_pool == 0) {
        /* Acquire pools */
        t_first_pool = rtAtomic32Inc(&_next_pools);
        RT_ASSERT((int)t_first_pool < rt_VkMaxCommandPools.i, "Too many command pools created.");
        pools = &_pools[t_first_pool];
        rt_result create_res = CreatePools(pools);
        if (create_res != RT_SUCCESS)
            return create_res;
    }
    if ((int)t_first_pool >= rt_VkMaxCommandPools.i)
        return RT_OUT_OF_MEMORY;

    uint32_t frame_id = g_gpu.current_frame_id % g_gpu.max_frames_in_flight;

    rt_result result = RT_SUCCESS;

    /* TODO: We should probably batch allocations of the same type */
    uint32_t mod = (uint32_t)rt_VkCommandBufferRingBufferSize.i;
    uint32_t start = rtAtomic32FetchAdd(&_next_command_buffer, count);
    for (uint32_t i = 0; i < count; ++i) {
        uint32_t slot = (start + i) % mod;
        _command_buffers[slot].version =
            (_command_buffers[slot].version + 1) % RT_RENDER_BACKEND_HANDLE_MAX_VERSION;
        if (_command_buffers[slot].version == 0)
            _command_buffers[slot].version = 1;

        VkCommandPool pool = pools->graphics_pools[frame_id];
        if (info[i].target_queue == RT_COMPUTE_QUEUE)
            pool = pools->compute_pools[frame_id];
        else if (info[i].target_queue == RT_TRANSFER_QUEUE)
            pool = pools->transfer_pools[frame_id];
        _command_buffers[slot].target_queue = info[i].target_queue;

        VkCommandBufferAllocateInfo alloc_info = {
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
            .commandBufferCount = 1,
            .commandPool = pool,
        };
        if (vkAllocateCommandBuffers(g_gpu.device,
                                     &alloc_info,
                                     &_command_buffers[slot].command_buffer) != VK_SUCCESS) {
            result = RT_UNKNOWN_ERROR;
            break;
        }

        VkCommandBufferBeginInfo begin_info = {
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
            .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
        };
        vkBeginCommandBuffer(_command_buffers[slot].command_buffer, &begin_info);

        p_command_buffers[i].index = (slot + 1);
        p_command_buffers[i].version = _command_buffers[slot].version;
    }
    return result;
}

rt_result RT_RENDERER_API_FN(SubmitCommandBuffers)(rt_gpu_queue queue,
                                                   const rt_submit_command_buffers_info *info) {
    uint32_t count = info->command_buffer_count;
    rt_temp_arena temp = rtGetTemporaryArena(NULL, 0);
    if (!temp.arena)
        return RT_OUT_OF_MEMORY;

    rt_result result = RT_SUCCESS;

    VkQueue target_queue = rtGetQueue(queue);

    VkCommandBufferSubmitInfo *command_buffers =
        RT_ARENA_PUSH_ARRAY(temp.arena, VkCommandBufferSubmitInfo, count);
    if (!command_buffers) {
        result = RT_OUT_OF_MEMORY;
        goto out;
    }
    VkSemaphoreSubmitInfo *wait_semaphores =
        RT_ARENA_PUSH_ARRAY(temp.arena, VkSemaphoreSubmitInfo, info->wait_semaphore_count);
    if (!wait_semaphores && info->wait_semaphore_count > 0) {
        result = RT_OUT_OF_MEMORY;
        goto out;
    }
    VkSemaphoreSubmitInfo *signal_semaphores =
        RT_ARENA_PUSH_ARRAY(temp.arena, VkSemaphoreSubmitInfo, info->signal_semaphore_count);
    if (!signal_semaphores && info->signal_semaphore_count > 0) {
        result = RT_OUT_OF_MEMORY;
        goto out;
    }

    uint32_t wait_count = info->wait_semaphore_count;
    uint32_t signal_count = info->signal_semaphore_count;

    for (uint32_t i = 0; i < wait_count; ++i) {
        VkSemaphoreSubmitInfo semaphore_info = {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = rtGetSemaphore(info->wait_semaphores[i]),
            .value = info->wait_values[i],
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
            .deviceIndex = 0,
        };
        wait_semaphores[i] = semaphore_info;
    }
    for (uint32_t i = 0; i < signal_count; ++i) {
        VkSemaphoreSubmitInfo semaphore_info = {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = rtGetSemaphore(info->signal_semaphores[i]),
            .value = info->signal_values[i],
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
            .deviceIndex = 0,
        };
        signal_semaphores[i] = semaphore_info;
    }

    for (uint32_t i = 0; i < count; ++i) {
        if (!RT_IS_HANDLE_VALID(info->command_buffers[i])) {
            rtLog("vk", "Tried to submit an invalid command buffer.");
            result = RT_INVALID_VALUE;
            goto out;
        }
        uint32_t slot = info->command_buffers[i].index - 1;
        if (_command_buffers[slot].version != info->command_buffers[i].version) {
            rtLog("vk",
                  "Mismatch between handle version and stored version while submitting a command "
                  "buffer");
            result = RT_INVALID_VALUE;
            goto out;
        }
        if (_command_buffers[slot].target_queue != queue) {
            rtLog("vk", "Mismatch between command buffer target queue and submit target queue.");
            result = RT_INVALID_VALUE;
            goto out;
        }

        command_buffers[i].sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO;
        command_buffers[i].pNext = NULL;
        command_buffers[i].deviceMask = 0;
        command_buffers[i].commandBuffer = _command_buffers[slot].command_buffer;

        vkEndCommandBuffer(command_buffers[i].commandBuffer);
    }

    VkSubmitInfo2 submit_info = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount = wait_count,
        .signalSemaphoreInfoCount = signal_count,
        .pWaitSemaphoreInfos = wait_semaphores,
        .pSignalSemaphoreInfos = signal_semaphores,
        .commandBufferInfoCount = count,
        .pCommandBufferInfos = command_buffers,
    };
    if (vkQueueSubmit2(target_queue, 1, &submit_info, VK_NULL_HANDLE) != VK_SUCCESS) {
        rtLog("vk", "vkQueueSubmit failed.");
        result = RT_UNKNOWN_ERROR;
    }

out:
    rtReturnTemporaryArena(temp);
    return result;
}
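/* Resolves a command buffer handle to the underlying VkCommandBuffer. Returns VK_NULL_HANDLE if
 * the handle is invalid or its version no longer matches the ring buffer slot (i.e. the slot has
 * since been reused). */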
VkCommandBuffer rtGetCommandBuffer(rt_command_buffer_handle cmdbuf) {
    uint32_t mod = (uint32_t)rt_VkCommandBufferRingBufferSize.i;
    if (!RT_IS_HANDLE_VALID(cmdbuf))
        return VK_NULL_HANDLE;
    uint32_t slot = (cmdbuf.index - 1) % mod;
    if (_command_buffers[slot].version != cmdbuf.version) {
        return VK_NULL_HANDLE;
    }
    return _command_buffers[slot].command_buffer;
}

VkCommandBuffer rtAllocSingleCommandBuffer(rt_gpu_queue queue) {
    rt_thread_pools *pools = &_pools[t_first_pool];
    if (t_first_pool == 0) {
        /* Acquire pools */
        t_first_pool = rtAtomic32Inc(&_next_pools);
        RT_ASSERT((int)t_first_pool < rt_VkMaxCommandPools.i, "Too many command pools created.");
        pools = &_pools[t_first_pool];
        rt_result create_res = CreatePools(pools);
        if (create_res != RT_SUCCESS)
            return VK_NULL_HANDLE;
    }
    if ((int)t_first_pool >= rt_VkMaxCommandPools.i)
        return VK_NULL_HANDLE;

    uint32_t frame_id = g_gpu.current_frame_id % g_gpu.max_frames_in_flight;

    VkCommandPool pool = pools->graphics_pools[frame_id];
    if (queue == RT_COMPUTE_QUEUE)
        pool = pools->compute_pools[frame_id];
    else if (queue == RT_TRANSFER_QUEUE)
        pool = pools->transfer_pools[frame_id];

    VkCommandBufferAllocateInfo alloc_info = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
        .commandPool = pool,
    };
    VkCommandBuffer cmdbuf;
    if (vkAllocateCommandBuffers(g_gpu.device, &alloc_info, &cmdbuf) != VK_SUCCESS) {
        return VK_NULL_HANDLE;
    }
    return cmdbuf;
}
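/* Submits a single, already recorded VkCommandBuffer with optional semaphore waits and signals.
 * Unlike the handle-based path above, neither rtAllocSingleCommandBuffer nor this function calls
 * vkBeginCommandBuffer/vkEndCommandBuffer; the caller is responsible for both. Passing NULL for
 * wait_values/signal_values submits with a value of 0, which is what binary semaphores expect. */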
rt_result rtSubmitSingleCommandBuffer(VkCommandBuffer command_buffer,
                                      const VkSemaphore *wait_semaphores,
                                      const uint32_t *wait_values,
                                      uint32_t wait_semaphore_count,
                                      const VkSemaphore *signal_semaphores,
                                      const uint32_t *signal_values,
                                      uint32_t signal_semaphore_count,
                                      rt_gpu_queue queue) {
    rt_temp_arena temp = rtGetTemporaryArena(NULL, 0);
    if (!temp.arena)
        return RT_OUT_OF_MEMORY;

    VkQueue target_queue = rtGetQueue(queue);
    rt_result result = RT_SUCCESS;

    VkSemaphoreSubmitInfo *wait_semaphore_info =
        RT_ARENA_PUSH_ARRAY(temp.arena, VkSemaphoreSubmitInfo, wait_semaphore_count);
    if (!wait_semaphore_info && wait_semaphore_count > 0) {
        result = RT_OUT_OF_MEMORY;
        goto out;
    }
    VkSemaphoreSubmitInfo *signal_semaphore_info =
        RT_ARENA_PUSH_ARRAY(temp.arena, VkSemaphoreSubmitInfo, signal_semaphore_count);
    if (!signal_semaphore_info && signal_semaphore_count > 0) {
        result = RT_OUT_OF_MEMORY;
        goto out;
    }

    uint32_t wait_count = wait_semaphore_count;
    uint32_t signal_count = signal_semaphore_count;

    for (uint32_t i = 0; i < wait_count; ++i) {
        VkSemaphoreSubmitInfo semaphore_info = {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = wait_semaphores[i],
            .value = (wait_values) ? wait_values[i] : 0,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
            .deviceIndex = 0,
        };
        wait_semaphore_info[i] = semaphore_info;
    }
    for (uint32_t i = 0; i < signal_count; ++i) {
        VkSemaphoreSubmitInfo semaphore_info = {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = signal_semaphores[i],
            .value = (signal_values) ? signal_values[i] : 0,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
            .deviceIndex = 0,
        };
        signal_semaphore_info[i] = semaphore_info;
    }

    VkCommandBufferSubmitInfo command_buffer_info = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
        .deviceMask = 0,
        .commandBuffer = command_buffer,
    };

    VkSubmitInfo2 submit_info = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount = wait_count,
        .signalSemaphoreInfoCount = signal_count,
        .pWaitSemaphoreInfos = wait_semaphore_info,
        .pSignalSemaphoreInfos = signal_semaphore_info,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &command_buffer_info,
    };
    if (vkQueueSubmit2(target_queue, 1, &submit_info, VK_NULL_HANDLE) != VK_SUCCESS) {
        rtLog("vk", "vkQueueSubmit failed.");
        result = RT_UNKNOWN_ERROR;
    }

out:
    rtReturnTemporaryArena(temp);
    return result;
}
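/* End-to-end usage sketch for the one-off path (the recording step is illustrative and not part
 * of this module):
 *
 *   VkCommandBuffer cmd = rtAllocSingleCommandBuffer(RT_TRANSFER_QUEUE);
 *   VkCommandBufferBeginInfo begin = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
 *   };
 *   vkBeginCommandBuffer(cmd, &begin);
 *   // ... record copy commands ...
 *   vkEndCommandBuffer(cmd);
 *   rtSubmitSingleCommandBuffer(cmd, NULL, NULL, 0, NULL, NULL, 0, RT_TRANSFER_QUEUE);
 */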