renderer_vulkan: Optimize stream buffer usage
* Use VMA to allocate the memory. In addition, place the upload/download buffers in host (CPU) memory; they don't need to be device local.
This commit is contained in:
@ -145,7 +145,6 @@ RasterizerVulkan::~RasterizerVulkan() {
|
||||
scheduler.Finish();
|
||||
const vk::Device device = instance.GetDevice();
|
||||
|
||||
device.destroySampler(default_sampler);
|
||||
device.destroyBufferView(texture_lf_view);
|
||||
device.destroyBufferView(texture_rg_view);
|
||||
device.destroyBufferView(texture_rgba_view);
|
||||
@ -363,8 +362,8 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Vertex data analysis and setup might involve rasterizer cache memory flushes
|
||||
// so perform it early to avoid invalidating our state in the middle of the draw
|
||||
// Vertex data setup might involve scheduler flushes so perform it
|
||||
// early to avoid invalidating our state in the middle of the draw.
|
||||
vertex_info = AnalyzeVertexArray(is_indexed, instance.GetMinVertexStrideAlignment());
|
||||
SetupVertexArray();
|
||||
|
||||
@ -413,14 +412,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) {
|
||||
void RasterizerVulkan::SetupIndexArray() {
|
||||
const bool index_u8 = regs.pipeline.index_array.format == 0;
|
||||
const bool native_u8 = index_u8 && instance.IsIndexTypeUint8Supported();
|
||||
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
|
||||
const u32 index_buffer_size = regs.pipeline.num_vertices * (native_u8 ? 1 : 2);
|
||||
const vk::IndexType index_type = native_u8 ? vk::IndexType::eUint8EXT : vk::IndexType::eUint16;
|
||||
|
||||
const u8* index_data =
|
||||
memory.GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() +
|
||||
regs.pipeline.index_array.offset);
|
||||
|
||||
auto [index_ptr, index_offset, _] = stream_buffer.Map(index_buffer_size, 2);
|
||||
|
||||
if (index_u8 && !native_u8) {
|
||||
u16* index_ptr_u16 = reinterpret_cast<u16*>(index_ptr);
|
||||
for (u32 i = 0; i < regs.pipeline.num_vertices; i++) {
|
||||
|
@ -140,7 +140,6 @@ private:
|
||||
std::array<u32, 16> binding_offsets{};
|
||||
std::array<bool, 16> enable_attributes{};
|
||||
std::array<vk::Buffer, 16> vertex_buffers;
|
||||
vk::Sampler default_sampler;
|
||||
Surface null_surface;
|
||||
Surface null_storage_surface;
|
||||
PipelineInfo pipeline_info;
|
||||
|
@ -10,60 +10,40 @@
|
||||
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
||||
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
|
||||
|
||||
#include <vk_mem_alloc.h>
|
||||
|
||||
namespace Vulkan {
|
||||
|
||||
namespace {
|
||||
|
||||
/// Translates a buffer type into the VMA host-access flags used for its allocation.
/// - Upload:   sequential-write host access.
/// - Download: random host access.
/// - Stream:   sequential-write host access, additionally allowing VMA to select memory
///             that must be filled through a transfer instead of a host mapping.
VmaAllocationCreateFlags MakeVMAFlags(BufferType type) {
    switch (type) {
    case BufferType::Upload:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    case BufferType::Download:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
    case BufferType::Stream:
        return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
               VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT;
    }

    // A value outside the enumerators above used to fall off the end of this non-void
    // function, which is undefined behaviour. Report it and return a host-access flag
    // so the caller still builds a valid allocation request.
    ASSERT_MSG(false, "Invalid stream buffer type");
    return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
}
|
||||
|
||||
/// Grow size handed to ReserveWatches for both watch lists when a StreamBuffer is built.
constexpr u64 WATCHES_INITIAL_RESERVE{0x4000};
/// NOTE(review): not referenced in the visible code; presumably the step by which the
/// watch lists are grown afterwards - confirm against StreamBuffer::Commit.
constexpr u64 WATCHES_RESERVE_CHUNK{0x1000};
|
||||
|
||||
/// Find a memory type with the passed requirements
|
||||
std::optional<u32> FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties,
|
||||
vk::MemoryPropertyFlags wanted,
|
||||
u32 filter = std::numeric_limits<u32>::max()) {
|
||||
for (u32 i = 0; i < properties.memoryTypeCount; ++i) {
|
||||
const auto flags = properties.memoryTypes[i].propertyFlags;
|
||||
if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/// Get the preferred host visible memory type.
|
||||
u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, bool readback,
|
||||
u32 filter = std::numeric_limits<u32>::max()) {
|
||||
// Prefer device local host visible allocations. Both AMD and Nvidia now provide one.
|
||||
// Otherwise search for a host visible allocation.
|
||||
const vk::MemoryPropertyFlags HOST_MEMORY =
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent;
|
||||
const vk::MemoryPropertyFlags DYNAMIC_MEMORY =
|
||||
HOST_MEMORY | (readback ? vk::MemoryPropertyFlagBits::eHostCached
|
||||
: vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY);
|
||||
if (!preferred_type) {
|
||||
preferred_type = FindMemoryType(properties, HOST_MEMORY);
|
||||
ASSERT_MSG(preferred_type, "No host visible and coherent memory type found");
|
||||
}
|
||||
return preferred_type.value_or(0);
|
||||
}
|
||||
|
||||
} // Anonymous namespace
|
||||
|
||||
/// Builds a stream buffer of `size` bytes.
/// @param instance_  Vulkan instance wrapper that provides the allocator.
/// @param scheduler_ Command scheduler stored for later use.
/// @param usage_     Vulkan usage flags for the backing buffer.
/// @param size       Requested buffer size; also recorded as stream_buffer_size.
/// @param type_      Buffer type used to choose the allocation's host-access flags.
StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_,
                           vk::BufferUsageFlags usage_, u64 size, BufferType type_)
    : instance{instance_}, scheduler{scheduler_},
      stream_buffer_size{size}, usage{usage_}, type{type_} {
    // `usage` and `type` must be initialised before this call: CreateBuffers reads both.
    CreateBuffers(size);
    ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
    ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
}
|
||||
|
||||
/// Releases the buffer together with its VMA allocation.
StreamBuffer::~StreamBuffer() {
    // The buffer is created by vmaCreateBuffer, so one vmaDestroyBuffer call destroys
    // the VkBuffer and frees its memory. Also calling destroyBuffer/freeMemory through
    // the device would release the same buffer twice.
    vmaDestroyBuffer(instance.GetAllocator(), buffer, allocation);
}
|
||||
|
||||
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
|
||||
@ -109,31 +89,35 @@ void StreamBuffer::Commit(u64 size) {
|
||||
}
|
||||
|
||||
/// Creates the backing buffer through VMA and stores its persistently mapped pointer.
/// @param prefered_size Size requested for the buffer, forwarded as the VkBuffer size.
void StreamBuffer::CreateBuffers(u64 prefered_size) {
    const vk::BufferCreateInfo buffer_info = {
        .size = prefered_size,
        .usage = usage,
    };

    // VMA picks the memory type itself (USAGE_AUTO) from the host-access flags derived
    // from the buffer type; MAPPED_BIT asks for the allocation to be mapped on creation.
    const VmaAllocationCreateInfo alloc_create_info = {
        .flags = MakeVMAFlags(type) | VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO,
    };

    // VMA uses the C API, so go through the raw Vulkan handle/struct types.
    VkBuffer unsafe_buffer{};
    const VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
    VmaAllocationInfo alloc_info{};
    const VkResult result =
        vmaCreateBuffer(instance.GetAllocator(), &unsafe_buffer_info, &alloc_create_info,
                        &unsafe_buffer, &allocation, &alloc_info);
    // Previously the result was discarded, so a failed allocation went unnoticed and
    // left a null buffer and mapping behind.
    ASSERT_MSG(result == VK_SUCCESS, "Failed to create stream buffer");
    buffer = vk::Buffer{unsafe_buffer};

    VkMemoryPropertyFlags memory_flags{};
    vmaGetAllocationMemoryProperties(instance.GetAllocator(), allocation, &memory_flags);
    if (type == BufferType::Stream) {
        // Stream buffers permit a transfer-only fallback in their flags, so check that
        // the memory VMA actually chose can be written from the host.
        ASSERT_MSG(memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
                   "Stream buffer must be host visible!");
        if (!(memory_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
            LOG_WARNING(Render_Vulkan,
                        "Unable to use device local memory for stream buffer. It will be slower!");
        }
    }

    mapped = reinterpret_cast<u8*>(alloc_info.pMappedData);
}
|
||||
|
||||
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
|
||||
|
@ -10,8 +10,16 @@
|
||||
#include <vector>
|
||||
#include "video_core/renderer_vulkan/vk_common.h"
|
||||
|
||||
VK_DEFINE_HANDLE(VmaAllocation)
|
||||
|
||||
namespace Vulkan {
|
||||
|
||||
/// Kind of stream buffer; MakeVMAFlags maps each value to VMA host-access flags.
enum class BufferType : u32 {
    Upload,   ///< Value 0. Passed for TextureRuntime's upload_buffer (eTransferSrc usage).
    Download, ///< Value 1. Passed for TextureRuntime's download_buffer (eTransferDst usage).
    Stream,   ///< Value 2. Default type of the StreamBuffer constructor.
};
|
||||
|
||||
class Instance;
|
||||
class Scheduler;
|
||||
|
||||
@ -20,7 +28,8 @@ class StreamBuffer final {
|
||||
|
||||
public:
|
||||
explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
|
||||
vk::BufferUsageFlags usage, u64 size, bool readback = false);
|
||||
vk::BufferUsageFlags usage, u64 size,
|
||||
BufferType type = BufferType::Stream);
|
||||
~StreamBuffer();
|
||||
|
||||
/**
|
||||
@ -60,11 +69,11 @@ private:
|
||||
Scheduler& scheduler; ///< Command scheduler.
|
||||
|
||||
vk::Buffer buffer; ///< Mapped buffer.
|
||||
vk::DeviceMemory memory; ///< Memory allocation.
|
||||
VmaAllocation allocation; ///< VMA allocation
|
||||
u8* mapped{}; ///< Pointer to the mapped memory
|
||||
u64 stream_buffer_size{}; ///< Stream buffer size.
|
||||
vk::BufferUsageFlags usage{};
|
||||
bool readback{}; ///< Flag indicating if the buffer should use cached memory
|
||||
BufferType type;
|
||||
|
||||
u64 offset{}; ///< Buffer iterator.
|
||||
u64 mapped_size{}; ///< Size reserved for the current copy.
|
||||
|
@ -106,7 +106,7 @@ u32 UnpackDepthStencil(const StagingData& data, vk::Format dest) {
|
||||
return depth_offset;
|
||||
}
|
||||
|
||||
/// Size passed to TextureRuntime's upload_buffer (64 MiB). The earlier 128 MiB definition
/// of the same name was a redefinition and has been dropped in favour of this one.
constexpr u64 UPLOAD_BUFFER_SIZE = 64 * 1024 * 1024;
/// Size passed to TextureRuntime's download_buffer (16 MiB).
constexpr u64 DOWNLOAD_BUFFER_SIZE = 16 * 1024 * 1024;
|
||||
|
||||
} // Anonymous namespace
|
||||
@ -115,9 +115,10 @@ TextureRuntime::TextureRuntime(const Instance& instance, Scheduler& scheduler,
|
||||
RenderpassCache& renderpass_cache, DescriptorManager& desc_manager)
|
||||
: instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
|
||||
blit_helper{instance, scheduler, desc_manager, renderpass_cache},
|
||||
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE},
|
||||
upload_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, UPLOAD_BUFFER_SIZE,
|
||||
BufferType::Upload},
|
||||
download_buffer{instance, scheduler, vk::BufferUsageFlagBits::eTransferDst,
|
||||
DOWNLOAD_BUFFER_SIZE, true} {
|
||||
DOWNLOAD_BUFFER_SIZE, BufferType::Download} {
|
||||
|
||||
auto Register = [this](VideoCore::PixelFormat dest,
|
||||
std::unique_ptr<FormatReinterpreterBase>&& obj) {
|
||||
|
Reference in New Issue
Block a user