renderer_vulkan: Optimize stream buffer usage

* Use VMA for allocating the memory. In addition, place upload/download buffers in host (CPU) memory; they don't need to be device local (see the sketch after the change summary below).
GPUCode
2023-02-25 12:41:36 +02:00
parent 631da59392
commit 11ca327951
5 changed files with 63 additions and 70 deletions
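The commit message above describes moving allocation to VMA and keeping upload/download buffers in host memory. As a rough, self-contained illustration of that allocation pattern (not the exact code from this commit; the allocator handle, buffer size, and usage flags are placeholders):

    // Minimal sketch: creating a persistently mapped upload buffer with VMA.
    // `allocator` is assumed to be a valid VmaAllocator (e.g. from Instance::GetAllocator()).
    VkBufferCreateInfo buffer_info{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = 64 * 1024 * 1024,
        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    };
    VmaAllocationCreateInfo alloc_create_info{
        // Sequential-write host access lets VMA pick a host-visible (CPU) memory type.
        .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
                 VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO,
    };
    VkBuffer buffer{};
    VmaAllocation allocation{};
    VmaAllocationInfo alloc_info{};
    vmaCreateBuffer(allocator, &buffer_info, &alloc_create_info,
                    &buffer, &allocation, &alloc_info);
    u8* mapped = static_cast<u8*>(alloc_info.pMappedData); // valid for the allocation's lifetime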

View File

@@ -145,7 +145,6 @@ RasterizerVulkan::~RasterizerVulkan() {
scheduler.Finish();
const vk::Device device = instance.GetDevice();
device.destroySampler(default_sampler);
device.destroyBufferView(texture_lf_view);
device.destroyBufferView(texture_rg_view);
device.destroyBufferView(texture_rgba_view);
@@ -363,8 +362,8 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) {
return false;
}
// Vertex data analysis and setup might involve rasterizer cache memory flushes
// so perform it early to avoid invalidating our state in the middle of the draw
// Vertex data setup might involve scheduler flushes so perform it
// early to avoid invalidating our state in the middle of the draw.
vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment());
SetupVertexArray();
@@ -413,14 +412,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) {
void RasterizerVulkan::SetupIndexArray() {
const bool index_u8 = regs.pipeline.index_array.format == 0;
const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported();
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2);
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
const u8* index_data =
memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() +
regs.pipeline.index_array.offset);
auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2);
if (index_u8 && !native_u8) {
u16* index_ptr_u16 = reinterpret_cast<u16*>(index_ptr);
for (u32 i = 0; i < regs.pipeline.num_vertices; i++) {

View File

@@ -140,7 +140,6 @@ private:
std::array<u32, 16> binding_offsets{};
std::array<bool, 16> enable_attributes{};
std::array<vk::Buffer, 16> vertex_buffers;
vk::Sampler default_sampler;
Surface null_surface;
Surface null_storage_surface;
PipelineInfo pipeline_info;

View File

@@ -10,60 +10,40 @@
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include <vk_mem_alloc.h>
namespace Vulkan {
namespace {
VmaAllocationCreateFlags MakeVMAFlags(BufferType type) {
switch (type) {
case BufferType::Upload:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
case BufferType::Download:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
case BufferType::Stream:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT;
}
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
/// Find a memory type with the passed requirements
std::optional<u32> FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties,
vk::MemoryPropertyFlags wanted,
u32 filter = std::numeric_limits<u32>::max()) {
for (u32 i = 0; i < properties.memoryTypeCount; ++i) {
const auto flags = properties.memoryTypes[i].propertyFlags;
if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) {
return i;
}
}
return std::nullopt;
}
/// Get the preferred host visible memory type.
u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, bool readback,
u32 filter = std::numeric_limits<u32>::max()) {
// Prefer device local host visible allocations. Both AMD and Nvidia now provide one.
// Otherwise search for a host visible allocation.
const vk::MemoryPropertyFlags HOST_MEMORY =
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent;
const vk::MemoryPropertyFlags DYNAMIC_MEMORY =
HOST_MEMORY | (readback ? vk::MemoryPropertyFlagBits::eHostCached
: vk::MemoryPropertyFlagBits::eDeviceLocal);
std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY);
if (!preferred_type) {
preferred_type = FindMemoryType(properties, HOST_MEMORY);
ASSERT_MSG(preferred_type, "No host visible and coherent memory type found");
}
return preferred_type.value_or(0);
}
} // Anonymous namespace
StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_,
vk::BufferUsageFlags usage_, u64 size, bool readback_)
: instance{instance_}, scheduler{scheduler_}, usage{usage_}, readback{readback_} {
vk::BufferUsageFlags usage_, u64 size, BufferType type_)
: instance{instance_}, scheduler{scheduler_},
stream_buffer_size{size}, usage{usage_}, type{type_} {
CreateBuffers(size);
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
}
StreamBuffer::~StreamBuffer() {
const vk::Device device = instance.GetDevice();
device.unmapMemory(memory);
device.destroyBuffer(buffer);
device.freeMemory(memory);
vmaDestroyBuffer(instance.GetAllocator(), buffer, allocation);
}
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
@@ -109,31 +89,35 @@ void StreamBuffer::Commit(u64 size) {
}
void StreamBuffer::CreateBuffers(u64 prefered_size) {
const vk::Device device = instance.GetDevice();
const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties();
const u32 preferred_type = GetMemoryType(memory_properties, readback);
const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex;
// Substract from the preferred heap size some bytes to avoid getting out of memory.
const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
// As per DXVK's example, using `heap_size / 2`
const VkDeviceSize allocable_size = heap_size / 2;
buffer = device.createBuffer({
.size = std::min(prefered_size, allocable_size),
const vk::BufferCreateInfo buffer_info = {
.size = prefered_size,
.usage = usage,
});
};
const auto requirements = device.getBufferMemoryRequirements(buffer);
const u32 required_flags = requirements.memoryTypeBits;
stream_buffer_size = static_cast<u64>(requirements.size);
const VmaAllocationCreateInfo alloc_create_info = {
.flags = MakeVMAFlags(type) | VMA_ALLOCATION_CREATE_MAPPED_BIT,
.usage = VMA_MEMORY_USAGE_AUTO,
};
memory = device.allocateMemory({
.allocationSize = requirements.size,
.memoryTypeIndex = GetMemoryType(memory_properties, required_flags),
});
VkBuffer unsafe_buffer{};
VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
VmaAllocationInfo alloc_info{};
vmaCreateBuffer(instance.GetAllocator(), &unsafe_buffer_info, &alloc_create_info,
&unsafe_buffer, &allocation, &alloc_info);
buffer = vk::Buffer{unsafe_buffer};
device.bindBufferMemory(buffer, memory, 0);
mapped = reinterpret_cast<u8*>(device.mapMemory(memory, 0, VK_WHOLE_SIZE));
VkMemoryPropertyFlags memory_flags{};
vmaGetAllocationMemoryProperties(instance.GetAllocator(), allocation, &memory_flags);
if (type == BufferType::Stream) {
ASSERT_MSG(memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
"Stream buffer must be host visible!");
if (!(memory_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
LOG_WARNING(Render_Vulkan,
"Unable to use device local memory for stream buffer. It will be slower!");
}
}
mapped = reinterpret_cast<u8*>(alloc_info.pMappedData);
}
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {

View File

@@ -10,8 +10,16 @@
#include <vector>
#include "video_core/renderer_vulkan/vk_common.h"
VK_DEFINE_HANDLE(VmaAllocation)
namespace Vulkan {
enum class BufferType : u32 {
Upload = 0,
Download = 1,
Stream = 2,
};
class Instance;
class Scheduler;
@@ -20,7 +28,8 @@ class StreamBuffer final {
public:
explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
vk::BufferUsageFlags usage, u64 size, bool readback = false);
vk::BufferUsageFlags usage, u64 size,
BufferType type = BufferType::Stream);
~StreamBuffer();
/**
@@ -60,11 +69,11 @@ private:
Scheduler& scheduler; ///< Command scheduler.
vk::Buffer buffer; ///< Mapped buffer.
vk::DeviceMemory memory; ///< Memory allocation.
VmaAllocation allocation; ///< VMA allocation
u8* mapped{}; ///< Pointer to the mapped memory
u64 stream_buffer_size{}; ///< Stream buffer size.
vk::BufferUsageFlags usage{};
bool readback{}; ///< Flag indicating if the buffer should use cached memory
BufferType type;
u64 offset{}; ///< Buffer iterator.
u64 mapped_size{}; ///< Size reserved for the current copy.
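For reference, a hedged sketch of how a caller drives this class, following the Map/Commit pattern visible in SetupIndexArray above (`stream_buffer` and `data` are placeholder names, not identifiers from this commit):

    // Map reserves `size` bytes at the given alignment and returns a write pointer,
    // the offset of that region within the buffer, and whether the buffer was invalidated.
    const auto [ptr, offset, invalidated] = stream_buffer.Map(data.size(), 4);
    std::memcpy(ptr, data.data(), data.size()); // write through the persistent mapping
    stream_buffer.Commit(data.size());          // advance the buffer iterator
    // `offset` is then bound for the draw, e.g. as the vertex or index buffer offset.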

View File

@@ -106,7 +106,7 @@ u32 UnpackDepthStencil(const StagingData& data, vk::Format dest) {
return depth_offset;
}
constexpr u64 UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024;
constexpr u64 UPLOAD_BUFFER_SIZE = 64 * 1024 * 1024;
constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024;
} // Anonymous namespace
@@ -115,9 +115,10 @@ TextureRuntime::TextureRuntime(const Instance& instance, Scheduler& scheduler,
RenderpassCache& renderpass_cache, DescriptorManager& desc_manager)
: instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
blit_helper{instance, scheduler, desc_manager, renderpass_cache},
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE},
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE,
BufferType::Upload},
download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst,
DOWNLOAD_BUFFER_SIZE, true} {
DOWNLOAD_BUFFER_SIZE, BufferType::Download} {
auto Register = [this](VideoCore::PixelFormat dest,
std::unique_ptr<FormatReinterpreterBase>&& obj) {