renderer_vulkan: Optimize stream buffer usage

* Use VMA for allocating the memory. In addition, place upload/download buffers in host (CPU) memory; they don't need to be device local (see the sketch after the change summary below).
GPUCode
2023-02-25 12:41:36 +02:00
parent 631da59392
commit 11ca327951
5 changed files with 63 additions and 70 deletions
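The commit message above describes moving allocation to VMA and keeping upload/download buffers in host memory. As a rough, self-contained illustration of that allocation pattern (not the exact code from this commit; the allocator handle, buffer size, and usage flags are placeholders):

    // Minimal sketch: creating a persistently mapped upload buffer with VMA.
    // `allocator` is assumed to be a valid VmaAllocator (e.g. from Instance::GetAllocator()).
    VkBufferCreateInfo buffer_info{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = 64 * 1024 * 1024,
        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    };
    VmaAllocationCreateInfo alloc_create_info{
        // Sequential-write host access lets VMA pick a host-visible (CPU) memory type.
        .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
                 VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO,
    };
    VkBuffer buffer{};
    VmaAllocation allocation{};
    VmaAllocationInfo alloc_info{};
    vmaCreateBuffer(allocator, &buffer_info, &alloc_create_info,
                    &buffer, &allocation, &alloc_info);
    u8* mapped = static_cast<u8*>(alloc_info.pMappedData); // valid for the allocation's lifetime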

View File

@@ -145,7 +145,6 @@ RasterizerVulkan::~RasterizerVulkan() {
scheduler.Finish();
const vk::Device device = instance.GetDevice();
device.destroySampler(default_sampler);
device.destroyBufferView(texture_lf_view);
device.destroyBufferView(texture_rg_view);
device.destroyBufferView(texture_rgba_view);
@@ -363,8 +362,8 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) {
return false;
}
// Vertex data analysis and setup might involve rasterizer cache memory flushes
// so perform it early to avoid invalidating our state in the middle of the draw
// Vertex data setup might involve scheduler flushes so perform it
// early to avoid invalidating our state in the middle of the draw.
vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment());
SetupVertexArray();
@@ -413,14 +412,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) {
void RasterizerVulkan::SetupIndexArray() {
const bool index_u8 = regs.pipeline.index_array.format == 0;
const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported();
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2);
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u8* index_data =
memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() +
regs.pipeline.index_array.offset);
auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2);
if (index_u8 && !native_u8) {
u16* index_ptr_u16 = reinterpret_cast<u16*>(index_ptr);
for (u32 i = 0; i < regs.pipeline.num_vertices; i++) {

View File

@@ -140,7 +140,6 @@ private:
std::array<u32, 16> binding_offsets{};
std::array<bool, 16> enable_attributes{};
std::array<vk::Buffer, 16> vertex_buffers;
vk::Sampler default_sampler;
Surface null_surface;
Surface null_storage_surface;
PipelineInfo pipeline_info;

View File

@@ -10,60 +10,40 @@
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include <vk_mem_alloc.h>
namespace Vulkan {
namespace {
VmaAllocationCreateFlags MakeVMAFlags(BufferType type) {
switch (type) {
case BufferType::Upload:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
case BufferType::Download:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
case BufferType::Stream:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT;
}
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
/// Find a memory type with the passed requirements
std::optional<u32> FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties,
vk::MemoryPropertyFlags wanted,
u32 filter = std::numeric_limits<u32>::max()) {
for (u32 i = 0; i < properties.memoryTypeCount; ++i) {
const auto flags = properties.memoryTypes[i].propertyFlags;
if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) {
return i;
}
}
return std::nullopt;
}
/// Get the preferred host visible memory type.
u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, bool readback,
u32 filter = std::numeric_limits<u32>::max()) {
// Prefer device local host visible allocations. Both AMD and Nvidia now provide one.
// Otherwise search for a host visible allocation.
const vk::MemoryPropertyFlags HOST_MEMORY =
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent;
const vk::MemoryPropertyFlags DYNAMIC_MEMORY =
HOST_MEMORY | (readback ? vk::MemoryPropertyFlagBits::eHostCached
: vk::MemoryPropertyFlagBits::eDeviceLocal);
std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY);
if (!preferred_type) {
preferred_type = FindMemoryType(properties, HOST_MEMORY);
ASSERT_MSG(preferred_type, "No host visible and coherent memory type found");
}
return preferred_type.value_or(0);
}
} // Anonymous namespace
StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_,
vk::BufferUsageFlags usage_, u64 size, bool readback_)
: instance{instance_}, scheduler{scheduler_}, usage{usage_}, readback{readback_} {
vk::BufferUsageFlags usage_, u64 size, BufferType type_)
: instance{instance_}, scheduler{scheduler_},
stream_buffer_size{size}, usage{usage_}, type{type_} {
CreateBuffers(size);
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
}
StreamBuffer::~StreamBuffer() {
const vk::Device device = instance.GetDevice();
device.unmapMemory(memory);
device.destroyBuffer(buffer);
device.freeMemory(memory);
vmaDestroyBuffer(instance.GetAllocator(), buffer, allocation);
}
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
@@ -109,31 +89,35 @@ void StreamBuffer::Commit(u64 size) {
}
void StreamBuffer::CreateBuffers(u64 prefered_size) {
const vk::Device device = instance.GetDevice();
const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties();
const u32 preferred_type = GetMemoryType(memory_properties, readback);
const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex;
// Substract from the preferred heap size some bytes to avoid getting out of memory.
const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
// As per DXVK's example, using `heap_size / 2`
const VkDeviceSize allocable_size = heap_size / 2;
buffer = device.createBuffer({
.size = std::min(prefered_size, allocable_size),
const vk::BufferCreateInfo buffer_info = {
.size = prefered_size,
.usage = usage,
});
};
const auto requirements = device.getBufferMemoryRequirements(buffer);
const u32 required_flags = requirements.memoryTypeBits;
stream_buffer_size = static_cast<u64>(requirements.size);
const VmaAllocationCreateInfo alloc_create_info = {
.flags = MakeVMAFlags(type) | VMA_ALLOCATION_CREATE_MAPPED_BIT,
.usage = VMA_MEMORY_USAGE_AUTO,
};
memory = device.allocateMemory({
.allocationSize = requirements.size,
.memoryTypeIndex = GetMemoryType(memory_properties, required_flags),
});
VkBuffer unsafe_buffer{};
VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
VmaAllocationInfo alloc_info{};
vmaCreateBuffer(instance.GetAllocator(), &unsafe_buffer_info, &alloc_create_info,
&unsafe_buffer, &allocation, &alloc_info);
buffer = vk::Buffer{unsafe_buffer};
device.bindBufferMemory(buffer, memory, 0);
mapped = reinterpret_cast<u8*>(device.mapMemory(memory, 0, VK_WHOLE_SIZE));
VkMemoryPropertyFlags memory_flags{};
vmaGetAllocationMemoryProperties(instance.GetAllocator(), allocation, &memory_flags);
if (type == BufferType::Stream) {
ASSERT_MSG(memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
"Stream buffer must be host visible!");
if (!(memory_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
LOG_WARNING(Render_Vulkan,
"Unable to use device local memory for stream buffer. It will be slower!");
}
}
mapped = reinterpret_cast<u8*>(alloc_info.pMappedData);
}
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {

View File

@@ -10,8 +10,16 @@
#include <vector>
#include "video_core/renderer_vulkan/vk_common.h"
VK_DEFINE_HANDLE(VmaAllocation)
namespace Vulkan {
enum class BufferType : u32 {
Upload = 0,
Download = 1,
Stream = 2,
};
class Instance;
class Scheduler;
@@ -20,7 +28,8 @@ class StreamBuffer final {
public:
explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
vk::BufferUsageFlags usage, u64 size, bool readback = false);
vk::BufferUsageFlags usage, u64 size,
BufferType type = BufferType::Stream);
~StreamBuffer();
/**
@@ -60,11 +69,11 @@ private:
Scheduler& scheduler; ///< Command scheduler.
vk::Buffer buffer; ///< Mapped buffer.
vk::DeviceMemory memory; ///< Memory allocation.
VmaAllocation allocation; ///< VMA allocation
u8* mapped{}; ///< Pointer to the mapped memory
u64 stream_buffer_size{}; ///< Stream buffer size.
vk::BufferUsageFlags usage{};
bool readback{}; ///< Flag indicating if the buffer should use cached memory
BufferType type;
u64 offset{}; ///< Buffer iterator.
u64 mapped_size{}; ///< Size reserved for the current copy.
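For reference, a hedged sketch of how a caller drives this class, following the Map/Commit pattern visible in SetupIndexArray above (`stream_buffer` and `data` are placeholder names, not identifiers from this commit):

    // Map reserves `size` bytes at the given alignment and returns a write pointer,
    // the offset of that region within the buffer, and whether the buffer was invalidated.
    const auto [ptr, offset, invalidated] = stream_buffer.Map(data.size(), 4);
    std::memcpy(ptr, data.data(), data.size()); // write through the persistent mapping
    stream_buffer.Commit(data.size());          // advance the buffer iterator
    // `offset` is then bound for the draw, e.g. as the vertex or index buffer offset.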

View File

@@ -106,7 +106,7 @@ u32 UnpackDepthStencil(const StagingData& data, vk::Format dest) {
return depth_offset;
}
constexpr u64 UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024;
constexpr u64 UPLOAD_BUFFER_SIZE = 64 * 1024 * 1024;
constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024;
} // Anonymous namespace
@@ -115,9 +115,10 @@ TextureRuntime::TextureRuntime(const Instance& instance, Scheduler& scheduler,
RenderpassCache& renderpass_cache, DescriptorManager& desc_manager)
: instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
blit_helper{instance, scheduler, desc_manager, renderpass_cache},
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE},
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE,
BufferType::Upload},
download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst,
DOWNLOAD_BUFFER_SIZE, true} {
DOWNLOAD_BUFFER_SIZE, BufferType::Download} {
auto Register = [this](VideoCore::PixelFormat dest,
std::unique_ptr<FormatReinterpreterBase>&& obj) {