renderer_vulkan: Optimize stream buffer usage

* Use VMA for allocating the memory. In addition, place the upload/download buffers in host (CPU) memory; they don't need to be device-local.
This commit is contained in:
GPUCode
2023-02-25 12:41:36 +02:00
parent 631da59392
commit 11ca327951
5 changed files with 63 additions and 70 deletions

View File

@ -145,7 +145,6 @@ RasterizerVulkan::~RasterizerVulkan() {
scheduler.Finish(); scheduler.Finish();
const vk::Device device = instance.GetDevice(); const vk::Device device = instance.GetDevice();
device.destroySampler(default_sampler);
device.destroyBufferView(texture_lf_view); device.destroyBufferView(texture_lf_view);
device.destroyBufferView(texture_rg_view); device.destroyBufferView(texture_rg_view);
device.destroyBufferView(texture_rgba_view); device.destroyBufferView(texture_rgba_view);
@ -363,8 +362,8 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) {
return false; return false;
} }
// Vertex data analysis and setup might involve rasterizer cache memory flushes // Vertex data setup might involve scheduler flushes so perform it
// so perform it early to avoid invalidating our state in the middle of the draw // early to avoid invalidating our state in the middle of the draw.
vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment()); vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment());
SetupVertexArray(); SetupVertexArray();
@ -413,14 +412,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) {
void RasterizerVulkan::SetupIndexArray() { void RasterizerVulkan::SetupIndexArray() {
const bool index_u8 = regs.pipeline.index_array.format == 0; const bool index_u8 = regs.pipeline.index_array.format == 0;
const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported(); const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported();
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2); const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2);
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u8* index_data = const u8* index_data =
memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() + memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() +
regs.pipeline.index_array.offset); regs.pipeline.index_array.offset);
auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2); auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2);
if (index_u8 && !native_u8) { if (index_u8 && !native_u8) {
u16* index_ptr_u16 = reinterpret_cast<u16*>(index_ptr); u16* index_ptr_u16 = reinterpret_cast<u16*>(index_ptr);
for (u32 i = 0; i < regs.pipeline.num_vertices; i++) { for (u32 i = 0; i < regs.pipeline.num_vertices; i++) {

View File

@ -140,7 +140,6 @@ private:
std::array<u32, 16> binding_offsets{}; std::array<u32, 16> binding_offsets{};
std::array<bool, 16> enable_attributes{}; std::array<bool, 16> enable_attributes{};
std::array<vk::Buffer, 16> vertex_buffers; std::array<vk::Buffer, 16> vertex_buffers;
vk::Sampler default_sampler;
Surface null_surface; Surface null_surface;
Surface null_storage_surface; Surface null_storage_surface;
PipelineInfo pipeline_info; PipelineInfo pipeline_info;

View File

@ -10,60 +10,40 @@
#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include <vk_mem_alloc.h>
namespace Vulkan { namespace Vulkan {
namespace { namespace {
/// Selects the VMA host-access allocation flags requested for a stream buffer of the given type.
/// - Upload:   VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT
/// - Download: VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
/// - Stream:   sequential write, plus VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT
/// @param type Kind of buffer being allocated.
/// @return Flags to OR into VmaAllocationCreateInfo::flags.
VmaAllocationCreateFlags MakeVMAFlags(BufferType type) {
    switch (type) {
    case BufferType::Upload:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    case BufferType::Download:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
    case BufferType::Stream:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
               VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT;
    }
    // No `default:` label above so the compiler keeps warning about unhandled enumerators.
    // A value outside the enum must still not flow off the end of a non-void function
    // (undefined behaviour), so fall back to a host-access flag rather than returning nothing.
    return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
/// Find a memory type with the passed requirements
std::optional<u32> FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties,
vk::MemoryPropertyFlags wanted,
u32 filter = std::numeric_limits<u32>::max()) {
for (u32 i = 0; i < properties.memoryTypeCount; ++i) {
const auto flags = properties.memoryTypes[i].propertyFlags;
if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) {
return i;
}
}
return std::nullopt;
}
/// Get the preferred host visible memory type.
u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, bool readback,
u32 filter = std::numeric_limits<u32>::max()) {
// Prefer device local host visible allocations. Both AMD and Nvidia now provide one.
// Otherwise search for a host visible allocation.
const vk::MemoryPropertyFlags HOST_MEMORY =
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent;
const vk::MemoryPropertyFlags DYNAMIC_MEMORY =
HOST_MEMORY | (readback ? vk::MemoryPropertyFlagBits::eHostCached
: vk::MemoryPropertyFlagBits::eDeviceLocal);
std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY);
if (!preferred_type) {
preferred_type = FindMemoryType(properties, HOST_MEMORY);
ASSERT_MSG(preferred_type, "No host visible and coherent memory type found");
}
return preferred_type.value_or(0);
}
} // Anonymous namespace } // Anonymous namespace
StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_, StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_,
vk::BufferUsageFlags usage_, u64 size, bool readback_) vk::BufferUsageFlags usage_, u64 size, BufferType type_)
: instance{instance_}, scheduler{scheduler_}, usage{usage_}, readback{readback_} { : instance{instance_}, scheduler{scheduler_},
stream_buffer_size{size}, usage{usage_}, type{type_} {
CreateBuffers(size); CreateBuffers(size);
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
} }
StreamBuffer::~StreamBuffer() { StreamBuffer::~StreamBuffer() {
const vk::Device device = instance.GetDevice(); vmaDestroyBuffer(instance.GetAllocator(), buffer, allocation);
device.unmapMemory(memory);
device.destroyBuffer(buffer);
device.freeMemory(memory);
} }
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) { std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
@ -109,31 +89,35 @@ void StreamBuffer::Commit(u64 size) {
} }
void StreamBuffer::CreateBuffers(u64 prefered_size) { void StreamBuffer::CreateBuffers(u64 prefered_size) {
const vk::Device device = instance.GetDevice(); const vk::BufferCreateInfo buffer_info = {
const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties(); .size = prefered_size,
const u32 preferred_type = GetMemoryType(memory_properties, readback);
const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex;
// Substract from the preferred heap size some bytes to avoid getting out of memory.
const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
// As per DXVK's example, using `heap_size / 2`
const VkDeviceSize allocable_size = heap_size / 2;
buffer = device.createBuffer({
.size = std::min(prefered_size, allocable_size),
.usage = usage, .usage = usage,
}); };
const auto requirements = device.getBufferMemoryRequirements(buffer); const VmaAllocationCreateInfo alloc_create_info = {
const u32 required_flags = requirements.memoryTypeBits; .flags = MakeVMAFlags(type) | VMA_ALLOCATION_CREATE_MAPPED_BIT,
stream_buffer_size = static_cast<u64>(requirements.size); .usage = VMA_MEMORY_USAGE_AUTO,
};
memory = device.allocateMemory({ VkBuffer unsafe_buffer{};
.allocationSize = requirements.size, VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
.memoryTypeIndex = GetMemoryType(memory_properties, required_flags), VmaAllocationInfo alloc_info{};
}); vmaCreateBuffer(instance.GetAllocator(), &unsafe_buffer_info, &alloc_create_info,
&unsafe_buffer, &allocation, &alloc_info);
buffer = vk::Buffer{unsafe_buffer};
device.bindBufferMemory(buffer, memory, 0); VkMemoryPropertyFlags memory_flags{};
mapped = reinterpret_cast<u8*>(device.mapMemory(memory, 0, VK_WHOLE_SIZE)); vmaGetAllocationMemoryProperties(instance.GetAllocator(), allocation, &memory_flags);
if (type == BufferType::Stream) {
ASSERT_MSG(memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
"Stream buffer must be host visible!");
if (!(memory_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
LOG_WARNING(Render_Vulkan,
"Unable to use device local memory for stream buffer. It will be slower!");
}
}
mapped = reinterpret_cast<u8*>(alloc_info.pMappedData);
} }
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) { void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {

View File

@ -10,8 +10,16 @@
#include <vector> #include <vector>
#include "video_core/renderer_vulkan/vk_common.h" #include "video_core/renderer_vulkan/vk_common.h"
VK_DEFINE_HANDLE(VmaAllocation)
namespace Vulkan { namespace Vulkan {
/// Intended use of a StreamBuffer; selects the host-access flags used for its allocation.
enum class BufferType : u32 {
    Upload,   ///< Used for the texture runtime's transfer-source (upload) buffer.
    Download, ///< Used for the texture runtime's transfer-destination (download) buffer.
    Stream,   ///< Default type; must be host visible, a warning is logged if not device local.
};
class Instance; class Instance;
class Scheduler; class Scheduler;
@ -20,7 +28,8 @@ class StreamBuffer final {
public: public:
explicit StreamBuffer(const Instance& instance, Scheduler& scheduler, explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
vk::BufferUsageFlags usage, u64 size, bool readback = false); vk::BufferUsageFlags usage, u64 size,
BufferType type = BufferType::Stream);
~StreamBuffer(); ~StreamBuffer();
/** /**
@ -60,11 +69,11 @@ private:
Scheduler& scheduler; ///< Command scheduler. Scheduler& scheduler; ///< Command scheduler.
vk::Buffer buffer; ///< Mapped buffer. vk::Buffer buffer; ///< Mapped buffer.
vk::DeviceMemory memory; ///< Memory allocation. VmaAllocation allocation; ///< VMA allocation
u8* mapped{}; ///< Pointer to the mapped memory u8* mapped{}; ///< Pointer to the mapped memory
u64 stream_buffer_size{}; ///< Stream buffer size. u64 stream_buffer_size{}; ///< Stream buffer size.
vk::BufferUsageFlags usage{}; vk::BufferUsageFlags usage{};
bool readback{}; ///< Flag indicating if the buffer should use cached memory BufferType type;
u64 offset{}; ///< Buffer iterator. u64 offset{}; ///< Buffer iterator.
u64 mapped_size{}; ///< Size reserved for the current copy. u64 mapped_size{}; ///< Size reserved for the current copy.

View File

@ -106,7 +106,7 @@ u32 UnpackDepthStencil(const StagingData& data, vk::Format dest) {
return depth_offset; return depth_offset;
} }
constexpr u64 UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024; constexpr u64 UPLOAD_BUFFER_SIZE = 64 * 1024 * 1024;
constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024; constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024;
} // Anonymous namespace } // Anonymous namespace
@ -115,9 +115,10 @@ TextureRuntime::TextureRuntime(const Instance& instance, Scheduler& scheduler,
RenderpassCache& renderpass_cache, DescriptorManager& desc_manager) RenderpassCache& renderpass_cache, DescriptorManager& desc_manager)
: instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
blit_helper{instance, scheduler, desc_manager, renderpass_cache}, blit_helper{instance, scheduler, desc_manager, renderpass_cache},
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE}, upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE,
BufferType::Upload},
download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst, download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst,
DOWNLOAD_BUFFER_SIZE, true} { DOWNLOAD_BUFFER_SIZE, BufferType::Download} {
auto Register = [this](VideoCore::PixelFormat dest, auto Register = [this](VideoCore::PixelFormat dest,
std::unique_ptr<FormatReinterpreterBase>&& obj) { std::unique_ptr<FormatReinterpreterBase>&& obj) {