From 11ca327951f13ec8cbb7bb58f26d8526c675728a Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 25 Feb 2023 12:41:36 +0200 Subject: [PATCH] renderer_vulkan: Optimize stream buffer usage * Use vma for allocating the memory. In addition place upload/download buffers on the cpu, they don't need to be device local --- .../renderer_vulkan/vk_rasterizer.cpp | 8 +- .../renderer_vulkan/vk_rasterizer.h | 1 - .../renderer_vulkan/vk_stream_buffer.cpp | 102 ++++++++---------- .../renderer_vulkan/vk_stream_buffer.h | 15 ++- .../renderer_vulkan/vk_texture_runtime.cpp | 7 +- 5 files changed, 63 insertions(+), 70 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ef8eb4bd6..cd1b3a731 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -145,7 +145,6 @@ RasterizerVulkan::~RasterizerVulkan() { scheduler.Finish(); const vk::Device device = instance.GetDevice(); - device.destroySampler(default_sampler); device.destroyBufferView(texture_lf_view); device.destroyBufferView(texture_rg_view); device.destroyBufferView(texture_rgba_view); @@ -363,8 +362,8 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) { return false; } - // Vertex data analysis and setup might involve rasterizer cache memory flushes - // so perform it early to avoid invalidating our state in the middle of the draw + // Vertex data setup might involve scheduler flushes so perform it + // early to avoid invalidating our state in the middle of the draw. vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment()); SetupVertexArray(); @@ -413,14 +412,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) { void RasterizerVulkan::SetupIndexArray() { const bool index_u8 = regs.pipeline.index_array.format == 0; const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported(); - const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16; const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2); + const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16; const u8* index_data = memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() + regs.pipeline.index_array.offset); auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2); + if (index_u8 && !native_u8) { u16* index_ptr_u16 = reinterpret_cast(index_ptr); for (u32 i = 0; i < regs.pipeline.num_vertices; i++) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 6b3b12d91..f1275d605 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -140,7 +140,6 @@ private: std::array binding_offsets{}; std::array enable_attributes{}; std::array vertex_buffers; - vk::Sampler default_sampler; Surface null_surface; Surface null_storage_surface; PipelineInfo pipeline_info; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index a7772873c..a40e4652b 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,60 +10,40 @@ #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include + namespace Vulkan { namespace { +VmaAllocationCreateFlags MakeVMAFlags(BufferType type) { + switch (type) { + case BufferType::Upload: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + case BufferType::Download: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + case BufferType::Stream: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | + VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT; + } +} + constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -/// Find a memory type with the passed requirements -std::optional FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, - vk::MemoryPropertyFlags wanted, - u32 filter = std::numeric_limits::max()) { - for (u32 i = 0; i < properties.memoryTypeCount; ++i) { - const auto flags = properties.memoryTypes[i].propertyFlags; - if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) { - return i; - } - } - return std::nullopt; -} - -/// Get the preferred host visible memory type. -u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, bool readback, - u32 filter = std::numeric_limits::max()) { - // Prefer device local host visible allocations. Both AMD and Nvidia now provide one. - // Otherwise search for a host visible allocation. - const vk::MemoryPropertyFlags HOST_MEMORY = - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent; - const vk::MemoryPropertyFlags DYNAMIC_MEMORY = - HOST_MEMORY | (readback ? vk::MemoryPropertyFlagBits::eHostCached - : vk::MemoryPropertyFlagBits::eDeviceLocal); - - std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY); - if (!preferred_type) { - preferred_type = FindMemoryType(properties, HOST_MEMORY); - ASSERT_MSG(preferred_type, "No host visible and coherent memory type found"); - } - return preferred_type.value_or(0); -} - } // Anonymous namespace StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_, - vk::BufferUsageFlags usage_, u64 size, bool readback_) - : instance{instance_}, scheduler{scheduler_}, usage{usage_}, readback{readback_} { + vk::BufferUsageFlags usage_, u64 size, BufferType type_) + : instance{instance_}, scheduler{scheduler_}, + stream_buffer_size{size}, usage{usage_}, type{type_} { CreateBuffers(size); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); } StreamBuffer::~StreamBuffer() { - const vk::Device device = instance.GetDevice(); - device.unmapMemory(memory); - device.destroyBuffer(buffer); - device.freeMemory(memory); + vmaDestroyBuffer(instance.GetAllocator(), buffer, allocation); } std::tuple StreamBuffer::Map(u64 size, u64 alignment) { @@ -109,31 +89,35 @@ void StreamBuffer::Commit(u64 size) { } void StreamBuffer::CreateBuffers(u64 prefered_size) { - const vk::Device device = instance.GetDevice(); - const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties(); - const u32 preferred_type = GetMemoryType(memory_properties, readback); - const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex; - - // Substract from the preferred heap size some bytes to avoid getting out of memory. - const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; - // As per DXVK's example, using `heap_size / 2` - const VkDeviceSize allocable_size = heap_size / 2; - buffer = device.createBuffer({ - .size = std::min(prefered_size, allocable_size), + const vk::BufferCreateInfo buffer_info = { + .size = prefered_size, .usage = usage, - }); + }; - const auto requirements = device.getBufferMemoryRequirements(buffer); - const u32 required_flags = requirements.memoryTypeBits; - stream_buffer_size = static_cast(requirements.size); + const VmaAllocationCreateInfo alloc_create_info = { + .flags = MakeVMAFlags(type) | VMA_ALLOCATION_CREATE_MAPPED_BIT, + .usage = VMA_MEMORY_USAGE_AUTO, + }; - memory = device.allocateMemory({ - .allocationSize = requirements.size, - .memoryTypeIndex = GetMemoryType(memory_properties, required_flags), - }); + VkBuffer unsafe_buffer{}; + VkBufferCreateInfo unsafe_buffer_info = static_cast(buffer_info); + VmaAllocationInfo alloc_info{}; + vmaCreateBuffer(instance.GetAllocator(), &unsafe_buffer_info, &alloc_create_info, + &unsafe_buffer, &allocation, &alloc_info); + buffer = vk::Buffer{unsafe_buffer}; - device.bindBufferMemory(buffer, memory, 0); - mapped = reinterpret_cast(device.mapMemory(memory, 0, VK_WHOLE_SIZE)); + VkMemoryPropertyFlags memory_flags{}; + vmaGetAllocationMemoryProperties(instance.GetAllocator(), allocation, &memory_flags); + if (type == BufferType::Stream) { + ASSERT_MSG(memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, + "Stream buffer must be host visible!"); + if (!(memory_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) { + LOG_WARNING(Render_Vulkan, + "Unable to use device local memory for stream buffer. It will be slower!"); + } + } + + mapped = reinterpret_cast(alloc_info.pMappedData); } void StreamBuffer::ReserveWatches(std::vector& watches, std::size_t grow_size) { diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index f95797dc7..458dfbc06 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -10,8 +10,16 @@ #include #include "video_core/renderer_vulkan/vk_common.h" +VK_DEFINE_HANDLE(VmaAllocation) + namespace Vulkan { +enum class BufferType : u32 { + Upload = 0, + Download = 1, + Stream = 2, +}; + class Instance; class Scheduler; @@ -20,7 +28,8 @@ class StreamBuffer final { public: explicit StreamBuffer(const Instance& instance, Scheduler& scheduler, - vk::BufferUsageFlags usage, u64 size, bool readback = false); + vk::BufferUsageFlags usage, u64 size, + BufferType type = BufferType::Stream); ~StreamBuffer(); /** @@ -60,11 +69,11 @@ private: Scheduler& scheduler; ///< Command scheduler. vk::Buffer buffer; ///< Mapped buffer. - vk::DeviceMemory memory; ///< Memory allocation. + VmaAllocation allocation; ///< VMA allocation u8* mapped{}; ///< Pointer to the mapped memory u64 stream_buffer_size{}; ///< Stream buffer size. vk::BufferUsageFlags usage{}; - bool readback{}; ///< Flag indicating if the buffer should use cached memory + BufferType type; u64 offset{}; ///< Buffer iterator. u64 mapped_size{}; ///< Size reserved for the current copy. diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 996975224..261c1fc37 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -106,7 +106,7 @@ u32 UnpackDepthStencil(const StagingData& data, vk::Format dest) { return depth_offset; } -constexpr u64 UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr u64 UPLOAD_BUFFER_SIZE = 64 * 1024 * 1024; constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024; } // Anonymous namespace @@ -115,9 +115,10 @@ TextureRuntime::TextureRuntime(const Instance& instance, Scheduler& scheduler, RenderpassCache& renderpass_cache, DescriptorManager& desc_manager) : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, blit_helper{instance, scheduler, desc_manager, renderpass_cache}, - upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE}, + upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE, + BufferType::Upload}, download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst, - DOWNLOAD_BUFFER_SIZE, true} { + DOWNLOAD_BUFFER_SIZE, BufferType::Download} { auto Register = [this](VideoCore::PixelFormat dest, std::unique_ptr&& obj) {