From 887fa63c430a10c788b46ea664d2dad0dd9ea77b Mon Sep 17 00:00:00 2001
From: Kevin Trogant
Date: Tue, 13 Feb 2024 08:35:19 +0100
Subject: [PATCH] Manage command buffers

I decided to make queues explicit to simplify handling queue ownership
transfers in the renderer code. The framegraph + pass code has explicit
knowledge about resource ownership, so it makes sense to handle the
transfers there.

- Manage pools
- Allocate command buffers
- Submit command buffers
---
 src/renderer/vk/command_buffers.c | 301 +++++++++++++++++++++++++++++-
 src/renderer/vk/command_buffers.h |   2 +
 src/renderer/vk/frame.c           |   9 +
 src/renderer/vk/gpu.h             |  15 ++
 src/renderer/vk/helper.c          |  13 ++
 src/renderer/vk/init.c            |  42 ++++-
 src/renderer/vk/meson.build       |   1 +
 src/renderer/vk/swapchain.c       |   7 +-
 src/runtime/atomics.h             |  36 +++
 src/runtime/gfx.h                 |   2 +
 src/runtime/gfx_main.c            |  18 +-
 src/runtime/main_loop.c           |  10 +-
 src/runtime/main_loop.h           |   4 +-
 src/runtime/meson.build           |   1 +
 src/runtime/renderer_api.h        |  63 +++++--
 src/runtime/runtime.h             |   2 +
 16 files changed, 479 insertions(+), 47 deletions(-)
 create mode 100644 src/renderer/vk/frame.c
 create mode 100644 src/runtime/atomics.h

diff --git a/src/renderer/vk/command_buffers.c b/src/renderer/vk/command_buffers.c
index 4f546b1..fb77acb 100644
--- a/src/renderer/vk/command_buffers.c
+++ b/src/renderer/vk/command_buffers.c
@@ -1,14 +1,299 @@
-#include "runtime/renderer_api.h"
+#include "gpu.h"
+#include "swapchain.h"
 
-rt_result
-RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
-                                        rt_render_command_buffer_handle *p_command_buffers,
-                                        rt_gpu_semaphore_handle *p_semaphores) {
+#include "runtime/atomics.h"
+#include "runtime/config.h"
+#include "runtime/mem_arena.h"
+#include "runtime/renderer_api.h"
+#include "runtime/runtime.h"
+
+#include <stdlib.h>
+
+RT_CVAR_I(rt_VkMaxCommandPools,
+          "Maximum number of command pools that can be created. Default: 32",
+          32);
+RT_CVAR_I(
+    rt_VkCommandBufferRingBufferSize,
+    "Size of the ring buffer used to store command buffers. Must be a power of two! Default: 512",
+    512);
+
+typedef struct {
+    VkCommandPool pools[RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT * 3];
+    uint32_t distinct_pool_count;
+
+    VkCommandPool *compute_pools;
+    VkCommandPool *graphics_pools;
+    VkCommandPool *transfer_pools;
+} rt_thread_pools;
+
+typedef struct {
+    VkCommandBuffer command_buffer;
+    uint32_t version;
+    rt_gpu_queue target_queue;
+} rt_command_buffer;
+
+static rt_thread_pools *_pools;
+static uint32_t _next_pools;
+static RT_THREAD_LOCAL unsigned int t_first_pool;
+
+static rt_command_buffer *_command_buffers;
+/* We let this overflow on its own. Take it modulo rt_VkCommandBufferRingBufferSize to get the actual index.
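+ * Because the ring size must be a power of two, it divides 2^32 evenly, so the
+ * slot computed from the wrapped counter stays consistent across the overflow.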
+ */
+static uint32_t _next_command_buffer;
+
+rt_result InitCommandBufferManagement(void) {
+    _pools = calloc((size_t)rt_VkMaxCommandPools.i, sizeof(rt_thread_pools));
+    if (!_pools)
+        return RT_OUT_OF_MEMORY;
+
+    _command_buffers =
+        calloc((size_t)rt_VkCommandBufferRingBufferSize.i, sizeof(rt_command_buffer));
+    if (!_command_buffers) {
+        free(_pools);
+        return RT_OUT_OF_MEMORY;
+    }
+
+    /* Keep 0 free as a "Not initialized" value for t_first_pool */
+    _next_pools = 1;
     return RT_SUCCESS;
 }
 
-rt_result
-RT_RENDERER_API_FN(SubmitCommandBuffers)(uint32_t count,
-                                         const rt_render_command_buffer_handle *command_buffers) {
+static void DestroyPools(rt_thread_pools *pools) {
+    for (uint32_t j = 0; j < pools->distinct_pool_count; ++j)
+        vkDestroyCommandPool(g_gpu.device, pools->pools[j], g_gpu.alloc_cb);
+    /* Clear the slot, so that a partially created set is never reused. */
+    pools->distinct_pool_count = 0;
+    pools->graphics_pools = NULL;
+    pools->compute_pools = NULL;
+    pools->transfer_pools = NULL;
+}
+
+void ShutdownCommandBufferManagement(void) {
+    /* Slots [1, _next_pools) are in use; slot 0 is the "not initialized" marker. */
+    for (uint32_t i = 1; i < _next_pools; ++i) {
+        DestroyPools(&_pools[i]);
+    }
+    free(_pools);
+    free(_command_buffers);
+}
+
+void rtResetCommandPools(unsigned int frame_id) {
+    unsigned int pool_idx = frame_id % g_gpu.max_frames_in_flight;
+    for (uint32_t i = 1; i < _next_pools; ++i) {
+        if (!_pools[i].graphics_pools)
+            continue; /* Slot acquired, but pool creation failed. */
+        if (vkResetCommandPool(g_gpu.device,
+                               _pools[i].graphics_pools[pool_idx],
+                               VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+            rtLog("vk", "Failed to reset graphics pool slot %u index %u", i, pool_idx);
+        }
+        if (_pools[i].compute_pools != _pools[i].graphics_pools) {
+            if (vkResetCommandPool(g_gpu.device,
+                                   _pools[i].compute_pools[pool_idx],
+                                   VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+                rtLog("vk", "Failed to reset compute pool slot %u index %u", i, pool_idx);
+            }
+        }
+        if (_pools[i].transfer_pools != _pools[i].graphics_pools &&
+            _pools[i].transfer_pools != _pools[i].compute_pools) {
+            if (vkResetCommandPool(g_gpu.device,
+                                   _pools[i].transfer_pools[pool_idx],
+                                   VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+                rtLog("vk", "Failed to reset transfer pool slot %u index %u", i, pool_idx);
+            }
+        }
+    }
+}
+
+static rt_result CreatePools(rt_thread_pools *pools) {
+    /* Graphics pools */
+    pools->graphics_pools = pools->pools;
+    pools->distinct_pool_count = 0;
+    VkCommandPoolCreateInfo graphics_info = {.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+                                             .queueFamilyIndex = g_gpu.graphics_family,
+                                             .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT};
+    for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+        if (vkCreateCommandPool(g_gpu.device,
+                                &graphics_info,
+                                g_gpu.alloc_cb,
+                                &pools->graphics_pools[i]) != VK_SUCCESS) {
+            rtLog("vk", "Failed to create a graphics command pool.");
+            DestroyPools(pools);
+            return RT_UNKNOWN_ERROR;
+        }
+        ++pools->distinct_pool_count;
+    }
+
+    if (g_gpu.compute_family != g_gpu.graphics_family) {
+        VkCommandPoolCreateInfo compute_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+            .queueFamilyIndex = g_gpu.compute_family,
+            .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
+        };
+        pools->compute_pools = &pools->pools[pools->distinct_pool_count];
+        for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+            if (vkCreateCommandPool(g_gpu.device,
+                                    &compute_info,
+                                    g_gpu.alloc_cb,
+                                    &pools->compute_pools[i]) != VK_SUCCESS) {
+                rtLog("vk", "Failed to create a compute command pool.");
+                DestroyPools(pools);
+                return RT_UNKNOWN_ERROR;
+            }
+            ++pools->distinct_pool_count;
+        }
+    } else {
+        pools->compute_pools = pools->graphics_pools;
+    }
+
+    if (g_gpu.transfer_family != g_gpu.graphics_family &&
+        g_gpu.transfer_family != g_gpu.compute_family) {
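+        /* A dedicated transfer family gets its own set of per-frame pools. */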
+        VkCommandPoolCreateInfo transfer_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+            .queueFamilyIndex = g_gpu.transfer_family,
+            .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
+        };
+        pools->transfer_pools = &pools->pools[pools->distinct_pool_count];
+        for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+            if (vkCreateCommandPool(g_gpu.device,
+                                    &transfer_info,
+                                    g_gpu.alloc_cb,
+                                    &pools->transfer_pools[i]) != VK_SUCCESS) {
+                rtLog("vk", "Failed to create a transfer command pool.");
+                DestroyPools(pools);
+                return RT_UNKNOWN_ERROR;
+            }
+            ++pools->distinct_pool_count;
+        }
+    } else if (g_gpu.transfer_family == g_gpu.graphics_family) {
+        pools->transfer_pools = pools->graphics_pools;
+    } else if (g_gpu.transfer_family == g_gpu.compute_family) {
+        pools->transfer_pools = pools->compute_pools;
+    }
     return RT_SUCCESS;
+}
+
+rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
+                                                  const rt_alloc_command_buffer_info *info,
+                                                  rt_command_buffer_handle *p_command_buffers) {
+    if (t_first_pool == 0) {
+        /* Acquire a pool slot for this thread. FetchAdd returns the previous
+         * value, so the first thread gets slot 1; 0 means "not initialized". */
+        t_first_pool = rtAtomic32FetchAdd(&_next_pools, 1);
+        RT_ASSERT((int)t_first_pool < rt_VkMaxCommandPools.i, "Too many command pools created.");
+    }
+    if ((int)t_first_pool >= rt_VkMaxCommandPools.i)
+        return RT_OUT_OF_MEMORY;
+
+    rt_thread_pools *pools = &_pools[t_first_pool];
+    if (pools->graphics_pools == NULL) {
+        /* First allocation on this thread, or an earlier attempt failed. */
+        rt_result create_res = CreatePools(pools);
+        if (create_res != RT_SUCCESS)
+            return create_res;
+    }
+
+    uint32_t frame_id = g_gpu.current_frame_id % g_gpu.max_frames_in_flight;
+    rt_result result = RT_SUCCESS;
+
+    /* TODO: We should probably batch allocations of the same type */
+    uint32_t mod = (uint32_t)rt_VkCommandBufferRingBufferSize.i;
+    uint32_t start = rtAtomic32FetchAdd(&_next_command_buffer, count);
+    for (uint32_t i = 0; i < count; ++i) {
+        uint32_t slot = (start + i) % mod;
+        _command_buffers[slot].version =
+            (_command_buffers[slot].version + 1) % RT_RENDER_BACKEND_HANDLE_MAX_VERSION;
+        if (_command_buffers[slot].version == 0)
+            _command_buffers[slot].version = 1;
+
+        VkCommandPool pool = pools->graphics_pools[frame_id];
+        if (info[i].target_queue == RT_COMPUTE_QUEUE)
+            pool = pools->compute_pools[frame_id];
+        else if (info[i].target_queue == RT_TRANSFER_QUEUE)
+            pool = pools->transfer_pools[frame_id];
+        _command_buffers[slot].target_queue = info[i].target_queue;
+
+        VkCommandBufferAllocateInfo alloc_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+            .commandBufferCount = 1,
+            .commandPool = pool,
+        };
+        if (vkAllocateCommandBuffers(g_gpu.device,
+                                     &alloc_info,
+                                     &_command_buffers[slot].command_buffer) != VK_SUCCESS) {
+            result = RT_UNKNOWN_ERROR;
+            break;
+        }
+
+        p_command_buffers[i].index = slot;
+        p_command_buffers[i].version = _command_buffers[slot].version;
+    }
+
+    return result;
+}
+
+rt_result RT_RENDERER_API_FN(SubmitCommandBuffers)(rt_gpu_queue queue,
+                                                   const rt_submit_command_buffers_info *info) {
+    uint32_t count = info->command_buffer_count;
+    rt_temp_arena temp = rtGetTemporaryArena(NULL, 0);
+    if (!temp.arena)
+        return RT_OUT_OF_MEMORY;
+
+    rt_result result = RT_SUCCESS;
+    VkQueue target_queue = rtGetQueue(queue);
+
+    VkCommandBuffer *command_buffers = RT_ARENA_PUSH_ARRAY(temp.arena, VkCommandBuffer, count);
+    if (!command_buffers) {
+        result = RT_OUT_OF_MEMORY;
+        goto out;
+    }
+
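+    /* Validate every handle before touching the queue: a version mismatch
+     * means the ring buffer slot was reused since the handle was created. */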
+    for (uint32_t i = 0; i < count; ++i) {
+        uint32_t slot = info->command_buffers[i].index;
+        if (_command_buffers[slot].version != info->command_buffers[i].version) {
+            rtLog("vk",
+                  "Mismatch between handle version and stored version while submitting a command "
+                  "buffer");
+            result = RT_INVALID_VALUE;
+            goto out;
+        }
+        if (_command_buffers[slot].target_queue != queue) {
+            rtLog("vk", "Mismatch between command buffer target queue and submit target queue.");
+            result = RT_INVALID_VALUE;
+            goto out;
+        }
+        command_buffers[i] = _command_buffers[slot].command_buffer;
+    }
+
+    /* TODO(Kevin): Retrieve semaphores */
+    VkSemaphore *wait_semaphores = NULL;
+    VkSemaphore *signal_semaphores = NULL;
+    uint32_t wait_count = 0;
+    uint32_t signal_count = 0;
+
+    VkSubmitInfo submit_info = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .pCommandBuffers = command_buffers,
+        .commandBufferCount = count,
+        .pWaitSemaphores = wait_semaphores,
+        .pWaitDstStageMask = NULL,
+        .waitSemaphoreCount = wait_count,
+        .pSignalSemaphores = signal_semaphores,
+        .signalSemaphoreCount = signal_count,
+    };
+
+    if (vkQueueSubmit(target_queue, 1, &submit_info, VK_NULL_HANDLE) != VK_SUCCESS) {
+        rtLog("vk", "vkQueueSubmit failed.");
+        result = RT_UNKNOWN_ERROR;
+    }
+
+out:
+    rtReturnTemporaryArena(temp);
+    return result;
 }
\ No newline at end of file
diff --git a/src/renderer/vk/command_buffers.h b/src/renderer/vk/command_buffers.h
index 2a8ecf7..d8803ae 100644
--- a/src/renderer/vk/command_buffers.h
+++ b/src/renderer/vk/command_buffers.h
@@ -3,4 +3,6 @@
 
 #include "runtime/runtime.h"
 
+void rtResetCommandPools(unsigned int frame_id);
+
 #endif
diff --git a/src/renderer/vk/frame.c b/src/renderer/vk/frame.c
new file mode 100644
index 0000000..5a9af8d
--- /dev/null
+++ b/src/renderer/vk/frame.c
@@ -0,0 +1,9 @@
+#include "gpu.h"
+#include "command_buffers.h"
+
+#include "runtime/renderer_api.h"
+
+void RT_RENDERER_API_FN(BeginFrame)(unsigned int frame_id) {
+    g_gpu.current_frame_id = frame_id;
+    rtResetCommandPools(frame_id);
+}
\ No newline at end of file
diff --git a/src/renderer/vk/gpu.h b/src/renderer/vk/gpu.h
index 3bf5422..7b54351 100644
--- a/src/renderer/vk/gpu.h
+++ b/src/renderer/vk/gpu.h
@@ -9,6 +9,14 @@
 
 #include "runtime/renderer_api.h"
 
+/* Minimum supported value of g_gpu.max_frames_in_flight */
+#define RT_VK_MIN_SUPPORTED_FRAMES_IN_FLIGHT 2
+
+/* Maximum supported number of frames in flight.
+ * The configured value is stored in g_gpu.max_frames_in_flight.
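+ * Per-frame arrays in the backend are statically sized for this maximum.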
*/ +#define RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT 3 + #ifdef _WIN32 struct HINSTANCE__; struct HWND__; @@ -36,9 +43,11 @@ typedef struct { VkQueue graphics_queue; VkQueue compute_queue; VkQueue present_queue; + VkQueue transfer_queue; uint32_t graphics_family; uint32_t compute_family; uint32_t present_family; + uint32_t transfer_family; rt_native_window native_window; @@ -48,6 +57,9 @@ typedef struct { VkPhysicalDeviceFeatures phys_device_features; VmaAllocator allocator; + + unsigned int max_frames_in_flight; + unsigned int current_frame_id; } rt_vk_gpu; #ifndef RT_VK_DONT_DEFINE_GPU_GLOBAL @@ -60,4 +72,6 @@ VkFormat rtPixelFormatToVkFormat(rt_pixel_format format); VkSampleCountFlagBits rtSampleCountToFlags(unsigned int count); +VkQueue rtGetQueue(rt_gpu_queue queue); + #endif diff --git a/src/renderer/vk/helper.c b/src/renderer/vk/helper.c index 81fbf2d..2de9a4f 100644 --- a/src/renderer/vk/helper.c +++ b/src/renderer/vk/helper.c @@ -40,4 +40,17 @@ VkSampleCountFlagBits rtSampleCountToFlags(unsigned int count) { break; } return (VkSampleCountFlagBits)count; +} + +VkQueue rtGetQueue(rt_gpu_queue queue) { + switch (queue) { + case RT_GRAPHICS_QUEUE: + return g_gpu.graphics_queue; + case RT_COMPUTE_QUEUE: + return g_gpu.compute_queue; + case RT_TRANSFER_QUEUE: + return g_gpu.transfer_queue; + default: + return VK_NULL_HANDLE; + } } \ No newline at end of file diff --git a/src/renderer/vk/init.c b/src/renderer/vk/init.c index 254a40c..34d6d6c 100644 --- a/src/renderer/vk/init.c +++ b/src/renderer/vk/init.c @@ -19,6 +19,8 @@ RT_CVAR_I(r_VkEnableAPIAllocTracking, RT_CVAR_S(r_VkPhysDeviceName, "Name of the selected physical device. Default: \"\"", ""); +RT_CVAR_I(r_VkMaxFramesInFlight, "Maximum number of frames in flight. [2/3] Default: 2", 2); + rt_vk_gpu g_gpu; static VkAllocationCallbacks _tracking_alloc_cbs; @@ -82,12 +84,15 @@ DebugUtilsMessengerCb(VkDebugUtilsMessageSeverityFlagBitsEXT severity, extern rt_cvar r_VkPreferredSwapchainImages; extern rt_cvar r_VkPreferMailboxMode; +extern rt_cvar r_VkMaxPipelineCount; void RT_RENDERER_API_FN(RegisterCVars)(void) { rtRegisterCVAR(&r_VkEnableAPIAllocTracking); rtRegisterCVAR(&r_VkPhysDeviceName); rtRegisterCVAR(&r_VkPreferredSwapchainImages); rtRegisterCVAR(&r_VkPreferMailboxMode); + rtRegisterCVAR(&r_VkMaxFramesInFlight); + rtRegisterCVAR(&r_VkMaxPipelineCount); } static rt_result CreateInstance(void) { @@ -211,12 +216,14 @@ typedef struct { uint32_t graphics; uint32_t compute; uint32_t present; + uint32_t transfer; } rt_queue_indices; static rt_queue_indices RetrieveQueueIndices(VkPhysicalDevice phys_dev, VkSurfaceKHR surface) { rt_queue_indices indices = {.graphics = UINT32_MAX, .compute = UINT32_MAX, - .present = UINT32_MAX}; + .present = UINT32_MAX, + .transfer = UINT32_MAX}; uint32_t count = 0; vkGetPhysicalDeviceQueueFamilyProperties(phys_dev, &count, NULL); @@ -232,12 +239,20 @@ static rt_queue_indices RetrieveQueueIndices(VkPhysicalDevice phys_dev, VkSurfac indices.graphics = i; if ((props[i].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0) indices.compute = i; + if ((props[i].queueFlags & VK_QUEUE_TRANSFER_BIT) != 0) + indices.transfer = i; VkBool32 present_supported = VK_FALSE; vkGetPhysicalDeviceSurfaceSupportKHR(phys_dev, i, surface, &present_supported); if (present_supported) indices.present = i; } + + if (indices.transfer == UINT32_MAX && indices.graphics != UINT32_MAX) + indices.transfer = indices.graphics; + else if (indices.transfer == UINT32_MAX && indices.compute != UINT32_MAX) + indices.transfer = indices.compute; + free(props); 
     return indices;
 }
 
@@ -405,11 +420,12 @@ static rt_result CreateDevice(void) {
     g_gpu.compute_family = queue_indices.compute;
     g_gpu.graphics_family = queue_indices.graphics;
     g_gpu.present_family = queue_indices.present;
+    g_gpu.transfer_family = queue_indices.transfer;
 
     float priority = 1.f;
     uint32_t distinct_queue_count = 1;
 
-    VkDeviceQueueCreateInfo queue_info[3];
+    VkDeviceQueueCreateInfo queue_info[4];
     queue_info[0].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
     queue_info[0].pNext = NULL;
     queue_info[0].flags = 0;
@@ -433,6 +449,18 @@
         queue_info[distinct_queue_count].queueCount = 1;
         queue_info[distinct_queue_count].queueFamilyIndex = queue_indices.present;
         queue_info[distinct_queue_count].pQueuePriorities = &priority;
+        ++distinct_queue_count;
+    }
+    if (queue_indices.transfer != queue_indices.graphics &&
+        queue_indices.transfer != queue_indices.compute &&
+        queue_indices.transfer != queue_indices.present) {
+        queue_info[distinct_queue_count].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+        queue_info[distinct_queue_count].pNext = NULL;
+        queue_info[distinct_queue_count].flags = 0;
+        queue_info[distinct_queue_count].queueCount = 1;
+        queue_info[distinct_queue_count].queueFamilyIndex = queue_indices.transfer;
+        queue_info[distinct_queue_count].pQueuePriorities = &priority;
+        ++distinct_queue_count;
     }
 
     VkPhysicalDeviceDescriptorIndexingFeatures indexing_features = {
@@ -463,6 +491,7 @@ static rt_result CreateDevice(void) {
     vkGetDeviceQueue(g_gpu.device, queue_indices.graphics, 0, &g_gpu.graphics_queue);
     vkGetDeviceQueue(g_gpu.device, queue_indices.compute, 0, &g_gpu.compute_queue);
     vkGetDeviceQueue(g_gpu.device, queue_indices.present, 0, &g_gpu.present_queue);
+    vkGetDeviceQueue(g_gpu.device, queue_indices.transfer, 0, &g_gpu.transfer_queue);
 
     return RT_SUCCESS;
 }
@@ -519,6 +548,8 @@ extern rt_result InitPipelineManagement(void);
 extern void ShutdownPipelineManagement(void);
 extern rt_result InitRenderTargetManagement(void);
 extern void ShutdownRenderTargetManagement(void);
+extern rt_result InitCommandBufferManagement(void);
+extern void ShutdownCommandBufferManagement(void);
 
 rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     rtLog("vk", "Init");
@@ -533,6 +564,9 @@ rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     } else {
         g_gpu.alloc_cb = NULL;
     }
+    g_gpu.max_frames_in_flight = RT_RESTRICT_VALUE_TO_BOUNDS(r_VkMaxFramesInFlight.i,
+                                                             RT_VK_MIN_SUPPORTED_FRAMES_IN_FLIGHT,
+                                                             RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT);
 
     int res = CreateInstance();
     if (res != RT_SUCCESS)
@@ -553,6 +587,9 @@ rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     if (res != RT_SUCCESS)
         return res;
     res = InitRenderTargetManagement();
+    if (res != RT_SUCCESS)
+        return res;
+    res = InitCommandBufferManagement();
     if (res != RT_SUCCESS)
         return res;
     res = rtCreateSwapchain();
@@ -566,6 +603,7 @@ void RT_RENDERER_API_FN(Shutdown)(void) {
     rtLog("vk", "Shutdown");
     vkDeviceWaitIdle(g_gpu.device);
     rtDestroySwapchain();
+    ShutdownCommandBufferManagement();
     ShutdownRenderTargetManagement();
     ShutdownPipelineManagement();
     DestroyAllocator();
diff --git a/src/renderer/vk/meson.build b/src/renderer/vk/meson.build
index f3c002e..dd5191a 100644
--- a/src/renderer/vk/meson.build
+++ b/src/renderer/vk/meson.build
@@ -16,6 +16,7 @@ if vk_dep.found()
         'swapchain.h',
 
         'command_buffers.c',
+        'frame.c',
         'helper.c',
         'init.c',
         'pipelines.c',
diff --git a/src/renderer/vk/swapchain.c b/src/renderer/vk/swapchain.c
index aaee21a..509827d 100644
--- a/src/renderer/vk/swapchain.c
+++ b/src/renderer/vk/swapchain.c
@@ -121,12 +121,7 @@ rt_result rtCreateSwapchain(void) {
         return 50;
     }
     g_swapchain.format = device_params.surface_format.format;
-    g_swapchain.extent =
-
-
-
-
-        device_params.extent;
+    g_swapchain.extent = device_params.extent;
 
     /* Retrieve images */
     g_swapchain.image_count = 0;
diff --git a/src/runtime/atomics.h b/src/runtime/atomics.h
new file mode 100644
index 0000000..1d63d3e
--- /dev/null
+++ b/src/runtime/atomics.h
@@ -0,0 +1,36 @@
+#ifndef RT_ATOMICS_H
+#define RT_ATOMICS_H
+
+/* Macros & helpers for atomic instructions */
+
+#ifdef _MSC_VER
+
+#include <intrin.h>
+
+/* Increment and decrement return the new value */
+
+#define rtAtomic32Inc(pa) _InterlockedIncrement((volatile long *)(pa))
+#define rtAtomic64Inc(pa) _InterlockedIncrement64((volatile __int64 *)(pa))
+#define rtAtomic32Dec(pa) _InterlockedDecrement((volatile long *)(pa))
+#define rtAtomic64Dec(pa) _InterlockedDecrement64((volatile __int64 *)(pa))
+
+/* FetchAdd returns the previous value */
+#define rtAtomic32FetchAdd(pa, value) _InterlockedExchangeAdd((volatile long *)(pa), (long)(value))
+#define rtAtomic64FetchAdd(pa, value) _InterlockedExchangeAdd64((volatile __int64 *)(pa), (__int64)(value))
+
+#elif defined(__GNUC__) || defined(__clang__)
+
+#define rtAtomic32Inc(pa) __atomic_add_fetch((pa), 1, __ATOMIC_SEQ_CST)
+#define rtAtomic64Inc(pa) __atomic_add_fetch((pa), 1LL, __ATOMIC_SEQ_CST)
+#define rtAtomic32Dec(pa) __atomic_sub_fetch((pa), 1, __ATOMIC_SEQ_CST)
+#define rtAtomic64Dec(pa) __atomic_sub_fetch((pa), 1LL, __ATOMIC_SEQ_CST)
+
+/* FetchAdd returns the previous value */
+#define rtAtomic32FetchAdd(pa, value) __atomic_fetch_add((pa), (value), __ATOMIC_SEQ_CST)
+#define rtAtomic64FetchAdd(pa, value) __atomic_fetch_add((pa), (value), __ATOMIC_SEQ_CST)
+
+#else
+#error Unsupported compiler. No atomics available.
+#endif
+
+#endif
diff --git a/src/runtime/gfx.h b/src/runtime/gfx.h
index 4357de2..75ee5f6 100644
--- a/src/runtime/gfx.h
+++ b/src/runtime/gfx.h
@@ -61,6 +61,8 @@ RT_DLLEXPORT rt_result rtInitGFX(rt_renderer_init_info *renderer_info);
 
 RT_DLLEXPORT void rtShutdownGFX(void);
 
+RT_DLLEXPORT void rtBeginGFXFrame(unsigned int frame_id);
+
 /* *********************************************************************
  * Framegraph API
  *
diff --git a/src/runtime/gfx_main.c b/src/runtime/gfx_main.c
index 7f9f6d7..c288472 100644
--- a/src/runtime/gfx_main.c
+++ b/src/runtime/gfx_main.c
@@ -25,17 +25,17 @@ RT_CVAR_S(rt_Renderer, "Select the render backend. Available options: [vk], Defa
 extern void RT_RENDERER_API_FN(RegisterCVars)(void);
 extern rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *);
 extern void RT_RENDERER_API_FN(Shutdown)(void);
+extern void RT_RENDERER_API_FN(BeginFrame)(unsigned int);
 extern rt_pipeline_handle RT_RENDERER_API_FN(CompilePipeline)(const rt_pipeline_info *);
 extern void RT_RENDERER_API_FN(DestroyPipeline)(rt_pipeline_handle);
 extern rt_render_target_handle
     RT_RENDERER_API_FN(CreateRenderTarget)(const rt_render_target_info *);
 extern void RT_RENDERER_API_FN(DestroyRenderTarget)(rt_render_target_handle);
-extern rt_result
-    RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
-                                            rt_render_command_buffer_handle *p_command_buffers,
-                                            rt_gpu_semaphore_handle *p_semaphores);
-extern rt_result RT_RENDERER_API_FN(
-    SubmitCommandBuffers)(uint32_t count, const rt_render_command_buffer_handle *command_buffers);
+extern rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t,
+                                                         const rt_alloc_command_buffer_info *,
+                                                         rt_command_buffer_handle *);
+extern rt_result RT_RENDERER_API_FN(SubmitCommandBuffers)(rt_gpu_queue,
+                                                          const rt_submit_command_buffers_info *);
 #endif
 
 extern rt_result InitFramegraphManager(void);
@@ -62,6 +62,7 @@ static bool LoadRenderer(void) {
     RETRIEVE_SYMBOL(RegisterCVars, rt_register_renderer_cvars_fn);
     RETRIEVE_SYMBOL(Init, rt_init_renderer_fn);
     RETRIEVE_SYMBOL(Shutdown, rt_shutdown_renderer_fn);
+    RETRIEVE_SYMBOL(BeginFrame, rt_begin_frame_fn);
     RETRIEVE_SYMBOL(CompilePipeline, rt_compile_pipeline_fn);
     RETRIEVE_SYMBOL(DestroyPipeline, rt_destroy_pipeline_fn);
     RETRIEVE_SYMBOL(CreateRenderTarget, rt_create_render_target_fn);
@@ -80,6 +81,7 @@ static bool LoadRenderer(void) {
     g_renderer.RegisterCVars = &rtRenRegisterCVars;
     g_renderer.Init = &rtRenInit;
     g_renderer.Shutdown = &rtRenShutdown;
+    g_renderer.BeginFrame = &rtRenBeginFrame;
     g_renderer.CompilePipeline = &rtRenCompilePipeline;
     g_renderer.DestroyPipeline = &rtRenDestroyPipeline;
     g_renderer.CreateRenderTarget = &rtRenCreateRenderTarget;
@@ -121,3 +123,7 @@ RT_DLLEXPORT void rtShutdownGFX(void) {
     ShutdownFramegraphManager();
     g_renderer.Shutdown();
 }
+
+RT_DLLEXPORT void rtBeginGFXFrame(unsigned int frame_id) {
+    g_renderer.BeginFrame(frame_id);
+}
\ No newline at end of file
diff --git a/src/runtime/main_loop.c b/src/runtime/main_loop.c
index f6c46bb..0e576cd 100644
--- a/src/runtime/main_loop.c
+++ b/src/runtime/main_loop.c
@@ -2,6 +2,7 @@
 #include "main_loop.h"
 #include "runtime.h"
 #include "config.h"
+#include "gfx.h"
 
 RT_CVAR_I(rt_MaxFrameLatency, "Maximum latency between update and rendering. Default: 2", 2);
Default: 2", 2); @@ -16,11 +17,11 @@ void UpdateThreadEntry(void *param) { while (!g_main_loop.shutdown) { /* Wait until the render thread has catched up */ rtWaitOnSemaphore(&g_main_loop.update_proceed); - rtLog("UT", "Processing %d", g_main_loop.u_frame_id); + rtLog("UT", "Processing %u", g_main_loop.u_frame_id); (g_main_loop.GameUpdate)(); - rtLog("UT", "Finished %d", g_main_loop.u_frame_id); + rtLog("UT", "Finished %u", g_main_loop.u_frame_id); g_main_loop.u_frame_id += 1; /* Signal the render thread that data is available */ rtSignalSemaphore(&g_main_loop.render_proceed); @@ -35,11 +36,12 @@ void RenderThreadEntry(void *param) { rtLog("RT", "RenderThread Entry"); while (!g_main_loop.shutdown) { rtWaitOnSemaphore(&g_main_loop.render_proceed); - rtLog("RT", "Processing %d", g_main_loop.r_frame_id); + rtLog("RT", "Processing %u", g_main_loop.r_frame_id); + rtBeginGFXFrame(g_main_loop.r_frame_id); (g_main_loop.GameRender)(); - rtLog("RT", "Finished %d", g_main_loop.r_frame_id); + rtLog("RT", "Finished %u", g_main_loop.r_frame_id); g_main_loop.r_frame_id += 1; /* Signal the update thread that we have finished and it can proceed */ rtSignalSemaphore(&g_main_loop.update_proceed); diff --git a/src/runtime/main_loop.h b/src/runtime/main_loop.h index 8960344..8bb3771 100644 --- a/src/runtime/main_loop.h +++ b/src/runtime/main_loop.h @@ -8,8 +8,8 @@ typedef void rt_main_loop_update_fn(void); typedef void rt_main_loop_render_fn(void); typedef struct { - int u_frame_id; - int r_frame_id; + unsigned int u_frame_id; + unsigned int r_frame_id; rt_semaphore update_proceed; rt_semaphore render_proceed; diff --git a/src/runtime/meson.build b/src/runtime/meson.build index 8f4d0aa..0896abe 100644 --- a/src/runtime/meson.build +++ b/src/runtime/meson.build @@ -5,6 +5,7 @@ runtime_lib = library('rt', # Project Sources 'aio.h', 'app.h', + 'atomics.h', 'buffer_manager.h', 'compression.h', 'config.h', diff --git a/src/runtime/renderer_api.h b/src/runtime/renderer_api.h index 8604a0d..e8655fa 100644 --- a/src/runtime/renderer_api.h +++ b/src/runtime/renderer_api.h @@ -13,6 +13,25 @@ extern "C" { #endif +/* Handles for backend objects */ + +#define RT_RENDER_BACKEND_HANDLE_MAX_VERSION 255 + +#define RT_RENDER_BACKEND_HANDLE(name) \ + typedef struct { \ + uint32_t version : 8; \ + uint32_t index : 24; \ + } name + +RT_RENDER_BACKEND_HANDLE(rt_pipeline_handle); +RT_RENDER_BACKEND_HANDLE(rt_render_target_handle); +RT_RENDER_BACKEND_HANDLE(rt_command_buffer_handle); +RT_RENDER_BACKEND_HANDLE(rt_gpu_semaphore_handle); + +#undef RT_RENDER_BACKEND_HANDLE + +/* Init data for the renderer */ + #ifdef _WIN32 struct HINSTANCE__; struct HWND__; @@ -30,6 +49,14 @@ struct rt_renderer_init_info_s { #endif }; +/* Argument types for render commands */ + +typedef enum { + RT_GRAPHICS_QUEUE, + RT_COMPUTE_QUEUE, + RT_TRANSFER_QUEUE, +} rt_gpu_queue; + typedef struct { rt_resource_id vertex_shader; rt_resource_id fragment_shader; @@ -86,41 +113,39 @@ typedef struct { size_t bytecode_length; } rt_shader_info; -/* Handles for backend objects */ +typedef struct { + rt_gpu_queue target_queue; +} rt_alloc_command_buffer_info; -#define RT_RENDER_BACKEND_HANDLE_MAX_VERSION 255 +typedef struct { + const rt_command_buffer_handle *command_buffers; + const rt_gpu_semaphore_handle *wait_semaphores; + const rt_gpu_semaphore_handle *signal_semaphores; + uint32_t command_buffer_count; + uint32_t wait_semaphore_count; + uint32_t signal_semaphore_count; +} rt_submit_command_buffers_info; -#define RT_RENDER_BACKEND_HANDLE(name) \ - typedef struct { \ 
- uint32_t version : 8; \ - uint32_t index : 24; \ - } name - -RT_RENDER_BACKEND_HANDLE(rt_pipeline_handle); -RT_RENDER_BACKEND_HANDLE(rt_render_target_handle); -RT_RENDER_BACKEND_HANDLE(rt_render_command_buffer_handle); -RT_RENDER_BACKEND_HANDLE(rt_gpu_semaphore_handle); - -#undef RT_RENDER_BACKEND_HANDLE +/* Renderer API */ typedef void rt_register_renderer_cvars_fn(void); typedef rt_result rt_init_renderer_fn(const rt_renderer_init_info *info); typedef void rt_shutdown_renderer_fn(void); +typedef void rt_begin_frame_fn(unsigned int frame_id); typedef rt_pipeline_handle rt_compile_pipeline_fn(const rt_pipeline_info *info); typedef void rt_destroy_pipeline_fn(rt_pipeline_handle handle); typedef rt_render_target_handle rt_create_render_target_fn(const rt_render_target_info *info); typedef void rt_destroy_render_target_fn(rt_render_target_handle handle); typedef rt_result rt_alloc_command_buffers_fn(uint32_t count, - rt_render_command_buffer_handle *p_command_buffers, - rt_gpu_semaphore_handle *p_semaphores); -typedef rt_result -rt_submit_command_buffers_fn(uint32_t count, - const rt_render_command_buffer_handle *command_buffers); + const rt_alloc_command_buffer_info *info, + rt_command_buffer_handle *p_command_buffers); +typedef rt_result rt_submit_command_buffers_fn(rt_gpu_queue queue, const rt_submit_command_buffers_info *info); typedef struct { rt_register_renderer_cvars_fn *RegisterCVars; rt_init_renderer_fn *Init; rt_shutdown_renderer_fn *Shutdown; + rt_begin_frame_fn *BeginFrame; rt_compile_pipeline_fn *CompilePipeline; rt_destroy_pipeline_fn *DestroyPipeline; rt_create_render_target_fn *CreateRenderTarget; diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 7a7f2b0..23a4c58 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -28,6 +28,8 @@ extern "C" { #define RT_UNUSED(x) ((void)sizeof((x))) #define RT_ARRAY_COUNT(x) (sizeof((x)) / sizeof((x)[0])) +#define RT_RESTRICT_VALUE_TO_BOUNDS(v, lower, upper) (((v) < (lower)) ? (lower) : (((v) > (upper)) ? (upper) : (v))) + #define RT_KB(n) ((n)*1024U) #define RT_MB(n) ((n)*1024U * 1024U) #define RT_GB(n) ((n)*1024U * 1024U * 1024U)
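
Usage sketch (illustrative, not part of the patch): the snippet below shows how
renderer-side code might drive the new interface. It assumes the rtRen* symbol
names produced by RT_RENDERER_API_FN, as the assignments in gfx_main.c suggest
(g_renderer.BeginFrame = &rtRenBeginFrame), and abbreviates error handling.
Real callers would go through the gfx layer's dispatch table rather than
calling the backend directly.

    #include "runtime/renderer_api.h"

    /* Declared by the backend; names follow the rtRen prefix convention. */
    extern rt_result rtRenAllocCommandBuffers(uint32_t count,
                                              const rt_alloc_command_buffer_info *info,
                                              rt_command_buffer_handle *p_command_buffers);
    extern rt_result rtRenSubmitCommandBuffers(rt_gpu_queue queue,
                                               const rt_submit_command_buffers_info *info);

    /* Allocate two command buffers targeting the graphics queue and submit
     * them together in one batch. */
    static rt_result RecordAndSubmitExample(void) {
        rt_alloc_command_buffer_info alloc_info[2] = {
            {.target_queue = RT_GRAPHICS_QUEUE},
            {.target_queue = RT_GRAPHICS_QUEUE},
        };
        rt_command_buffer_handle cmdbufs[2];
        rt_result res = rtRenAllocCommandBuffers(2, alloc_info, cmdbufs);
        if (res != RT_SUCCESS)
            return res;

        /* ... record commands into the buffers here ... */

        rt_submit_command_buffers_info submit_info = {
            .command_buffers = cmdbufs,
            .command_buffer_count = 2,
            /* Semaphores are not wired up yet; see the TODO in
             * SubmitCommandBuffers. */
            .wait_semaphores = NULL,
            .wait_semaphore_count = 0,
            .signal_semaphores = NULL,
            .signal_semaphore_count = 0,
        };
        return rtRenSubmitCommandBuffers(RT_GRAPHICS_QUEUE, &submit_info);
    }

Submitting a buffer to a queue other than its target_queue fails with
RT_INVALID_VALUE, as does reusing a stale handle whose ring buffer slot has
been recycled.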