From 0a40a513a6e1bb9077f9c991d882668eb82555a8 Mon Sep 17 00:00:00 2001 From: emufan4568 Date: Tue, 18 Oct 2022 22:17:30 +0300 Subject: [PATCH] renderer_vulkan: Improve StreamBuffer API and use it in TextureRuntime * Also use separate upload and download buffers optimized for write and readback respectively. This gives a huge 20+ FPS boost in most games which were bottlenecked by slow reads --- .../rasterizer_cache/morton_swizzle.h | 17 ++-- .../rasterizer_cache/rasterizer_cache.h | 28 +++--- .../renderer_vulkan/renderer_vulkan.cpp | 3 +- .../renderer_vulkan/vk_stream_buffer.cpp | 89 +++++++++++++------ .../renderer_vulkan/vk_stream_buffer.h | 38 ++++---- .../renderer_vulkan/vk_texture_runtime.cpp | 66 ++++++-------- .../renderer_vulkan/vk_texture_runtime.h | 10 +-- 7 files changed, 142 insertions(+), 109 deletions(-) diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h index 3cc3099dd..901eeb3fe 100644 --- a/src/video_core/rasterizer_cache/morton_swizzle.h +++ b/src/video_core/rasterizer_cache/morton_swizzle.h @@ -23,7 +23,7 @@ inline T MakeInt(const std::byte* bytes) { } template -inline void DecodePixel(const std::byte* source, std::byte* dest) { +constexpr void DecodePixel(const std::byte* source, std::byte* dest) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; if constexpr (format == PixelFormat::D24S8) { @@ -69,7 +69,7 @@ inline void DecodePixel(const std::byte* source, std::byte* dest) { } template -inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) { +constexpr void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) { const u32 morton_offset = VideoCore::MortonInterleave(x, y); const u8 value = static_cast(source_tile[morton_offset >> 1]); const u8 pixel = Color::Convert4To8((morton_offset % 2) ? (value >> 4) : (value & 0xF)); @@ -84,7 +84,7 @@ inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* } template -inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) { +constexpr void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) { constexpr u32 subtile_width = 4; constexpr u32 subtile_height = 4; constexpr bool has_alpha = format == PixelFormat::ETC1A4; @@ -114,7 +114,7 @@ inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byt } template -inline void EncodePixel(const std::byte* source, std::byte* dest) { +constexpr void EncodePixel(const std::byte* source, std::byte* dest) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; if constexpr (format == PixelFormat::D24S8) { @@ -146,8 +146,8 @@ inline void EncodePixel(const std::byte* source, std::byte* dest) { } template -inline void MortonCopyTile(u32 stride, std::span tile_buffer, - std::span linear_buffer) { +constexpr void MortonCopyTile(u32 stride, std::span tile_buffer, + std::span linear_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; constexpr u32 linear_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format); constexpr bool is_compressed = format == PixelFormat::ETC1 || format == PixelFormat::ETC1A4; @@ -200,8 +200,9 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, * in the linear_buffer. */ template -static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, - std::span linear_buffer, std::span tiled_buffer) { +static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, + std::span linear_buffer, + std::span tiled_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; constexpr u32 aligned_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format); constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h index 2491e3211..1ebdf284e 100644 --- a/src/video_core/rasterizer_cache/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache/rasterizer_cache.h @@ -948,22 +948,22 @@ void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval surface->Download(download, staging); - download_queue.push_back( - [this, surface, flush_start, flush_end, flush_info, mapped = staging.mapped]() { - MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start); - if (!dest_ptr) [[unlikely]] { - return; - } + MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start); + if (!dest_ptr) [[unlikely]] { + return; + } - const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start); + const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start); - if (surface->is_tiled) { - SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest, - runtime.NeedsConvertion(surface->pixel_format)); - } else { - runtime.FormatConvert(*surface, false, mapped, download_dest); - } - }); + download_queue.push_back([this, surface, flush_start, flush_end, flush_info, + mapped = staging.mapped, download_dest]() { + if (surface->is_tiled) { + SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest, + runtime.NeedsConvertion(surface->pixel_format)); + } else { + runtime.FormatConvert(*surface, false, mapped, download_dest); + } + }); } template diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 9c925e692..9375c6acd 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -962,11 +962,10 @@ void RendererVulkan::SwapBuffers() { void RendererVulkan::FlushBuffers() { vertex_buffer.Flush(); rasterizer->FlushBuffers(); - renderpass_cache.ExitRenderpass(); + runtime.FlushBuffers(); } void RendererVulkan::OnSlotSwitch() { - runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex()); renderpass_cache.OnSlotSwitch(); rasterizer->pipeline_cache.MarkDirty(); } diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 2b1f7133c..f5496a209 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -40,14 +40,18 @@ inline auto ToVkAccessStageFlags(vk::BufferUsageFlagBits usage) { return result; } -StagingBuffer::StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage) +StagingBuffer::StagingBuffer(const Instance& instance, u32 size, bool readback) : instance{instance} { + const vk::BufferUsageFlags usage = + readback ? vk::BufferUsageFlagBits::eTransferDst : vk::BufferUsageFlagBits::eTransferSrc; const vk::BufferCreateInfo buffer_info = {.size = size, .usage = usage}; - const VmaAllocationCreateInfo alloc_create_info = { - .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | - VMA_ALLOCATION_CREATE_MAPPED_BIT, - .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST}; + const VmaAllocationCreateFlags flags = + readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT + : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + const VmaAllocationCreateInfo alloc_create_info = {.flags = + flags | VMA_ALLOCATION_CREATE_MAPPED_BIT, + .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST}; VkBuffer unsafe_buffer = VK_NULL_HANDLE; VkBufferCreateInfo unsafe_buffer_info = static_cast(buffer_info); @@ -66,9 +70,15 @@ StagingBuffer::~StagingBuffer() { } StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size, - vk::BufferUsageFlagBits usage, std::span view_formats) + bool readback) : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT}, - staging{instance, total_size, vk::BufferUsageFlagBits::eTransferSrc}, usage{usage} { + staging{instance, total_size, readback}, bucket_size{size} {} + +StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size, + vk::BufferUsageFlagBits usage, std::span view_formats, + bool readback) + : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT}, + staging{instance, total_size, readback}, usage{usage}, bucket_size{size} { const vk::BufferCreateInfo buffer_info = { .size = total_size, .usage = usage | vk::BufferUsageFlagBits::eTransferDst}; @@ -97,7 +107,6 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u } view_count = view_formats.size(); - bucket_size = size; } StreamBuffer::~StreamBuffer() { @@ -141,36 +150,60 @@ void StreamBuffer::Commit(u32 size) { void StreamBuffer::Flush() { const u32 current_bucket = scheduler.GetCurrentSlotIndex(); + const u32 flush_start = current_bucket * bucket_size; const u32 flush_size = buckets[current_bucket].offset; ASSERT(flush_size <= bucket_size); - if (flush_size > 0) { - vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer(); + if (flush_size > 0) [[likely]] { + // Ensure all staging writes are visible to the host memory domain VmaAllocator allocator = instance.GetAllocator(); + vmaFlushAllocation(allocator, staging.allocation, flush_start, flush_size); - const u32 flush_start = current_bucket * bucket_size; - const vk::BufferCopy copy_region = { - .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size}; + // Make the data available to the GPU if possible + if (buffer) { + const vk::BufferCopy copy_region = { + .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size}; - vmaFlushAllocation(allocator, allocation, flush_start, flush_size); - command_buffer.copyBuffer(staging.buffer, buffer, copy_region); + vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer(); + command_buffer.copyBuffer(staging.buffer, buffer, copy_region); - // Add pipeline barrier for the flushed region - auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage); - const vk::BufferMemoryBarrier buffer_barrier = { - .srcAccessMask = vk::AccessFlagBits::eTransferWrite, - .dstAccessMask = access_mask, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = buffer, - .offset = flush_start, - .size = flush_size}; + // Add pipeline barrier for the flushed region + auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage); + const vk::BufferMemoryBarrier buffer_barrier = { + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = access_mask, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buffer, + .offset = flush_start, + .size = flush_size}; - command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask, - vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {}); + command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask, + vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, + {}); + } } - // Reset the offset of the next bucket + SwitchBucket(); +} + +void StreamBuffer::Invalidate() { + const u32 current_bucket = scheduler.GetCurrentSlotIndex(); + const u32 flush_start = current_bucket * bucket_size; + const u32 flush_size = buckets[current_bucket].offset; + ASSERT(flush_size <= bucket_size); + + if (flush_size > 0) [[likely]] { + // Ensure the staging memory can be read by the host + VmaAllocator allocator = instance.GetAllocator(); + vmaInvalidateAllocation(allocator, staging.allocation, flush_start, flush_size); + } + + SwitchBucket(); +} + +void StreamBuffer::SwitchBucket() { + const u32 current_bucket = scheduler.GetCurrentSlotIndex(); const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT; buckets[next_bucket].offset = 0; buckets[next_bucket].invalid = true; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 5205e4b92..7a63e66a9 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -19,13 +19,8 @@ class TaskScheduler; constexpr u32 MAX_BUFFER_VIEWS = 3; -struct LockedRegion { - u32 size = 0; - u64 fence_counter = 0; -}; - struct StagingBuffer { - StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage); + StagingBuffer(const Instance& instance, u32 size, bool readback); ~StagingBuffer(); const Instance& instance; @@ -36,10 +31,19 @@ struct StagingBuffer { class StreamBuffer { public: + /// Staging only constructor StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size, - vk::BufferUsageFlagBits usage, std::span views); + bool readback = false); + /// Staging + GPU streaming constructor + StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size, + vk::BufferUsageFlagBits usage, std::span views, + bool readback = false); ~StreamBuffer(); + StreamBuffer(const StreamBuffer&) = delete; + StreamBuffer& operator=(const StreamBuffer&) = delete; + + /// Maps aligned staging memory of size bytes std::tuple Map(u32 size, u32 alignment = 0); /// Commits size bytes from the currently mapped staging memory @@ -48,24 +52,28 @@ public: /// Flushes staging memory to the GPU buffer void Flush(); - /// Returns the Vulkan buffer handle + /// Invalidates staging memory for reading + void Invalidate(); + + /// Switches to the next available bucket + void SwitchBucket(); + + /// Returns the GPU buffer handle vk::Buffer GetHandle() const { return buffer; } + /// Returns the staging buffer handle + vk::Buffer GetStagingHandle() const { + return staging.buffer; + } + /// Returns an immutable reference to the requested buffer view const vk::BufferView& GetView(u32 index = 0) const { ASSERT(index < view_count); return views[index]; } -private: - /// Invalidates the buffer offsets - void Invalidate(); - - /// Removes the lock on regions whose fence counter has been reached by the GPU - bool UnlockFreeRegions(u32 target_size); - private: struct Bucket { bool invalid; diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index c7e350b27..055f7811a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -32,19 +32,14 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) { return vk::ImageAspectFlagBits::eColor; } -constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024; +constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024; +constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024; TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler, RenderpassCache& renderpass_cache) - : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, blit_helper{ - instance, - scheduler} { - - for (auto& buffer : staging_buffers) { - buffer = std::make_unique(instance, STAGING_BUFFER_SIZE, - vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst); - } + : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, + blit_helper{instance, scheduler}, upload_buffer{instance, scheduler, UPLOAD_BUFFER_SIZE}, + download_buffer{instance, scheduler, DOWNLOAD_BUFFER_SIZE, true} { auto Register = [this](VideoCore::PixelFormat dest, std::unique_ptr&& obj) { @@ -64,7 +59,9 @@ TextureRuntime::~TextureRuntime() { for (const auto& [key, alloc] : texture_recycler) { vmaDestroyImage(allocator, alloc.image, alloc.allocation); device.destroyImageView(alloc.image_view); - device.destroyImageView(alloc.base_view); + if (alloc.base_view) { + device.destroyImageView(alloc.base_view); + } if (alloc.depth_view) { device.destroyImageView(alloc.depth_view); device.destroyImageView(alloc.stencil_view); @@ -82,29 +79,24 @@ TextureRuntime::~TextureRuntime() { } StagingData TextureRuntime::FindStaging(u32 size, bool upload) { - const u32 current_slot = scheduler.GetCurrentSlotIndex(); - u32& offset = staging_offsets[current_slot]; // Depth uploads require 4 byte alignment, doesn't hurt to do it for everyone - offset = Common::AlignUp(offset, 4); + auto& buffer = upload ? upload_buffer : download_buffer; + auto [data, offset, invalidate] = buffer.Map(size, 4); - if (offset + size > STAGING_BUFFER_SIZE) { - LOG_CRITICAL(Render_Vulkan, "Staging buffer size exceeded!"); - UNREACHABLE(); - } - - const auto& buffer = staging_buffers[current_slot]; - return StagingData{.buffer = buffer->buffer, + return StagingData{.buffer = buffer.GetStagingHandle(), .size = size, - .mapped = buffer->mapped.subspan(offset, size), + .mapped = std::span{reinterpret_cast(data), size}, .buffer_offset = offset}; } -void TextureRuntime::Finish() { - scheduler.Submit(SubmitMode::Flush); +void TextureRuntime::FlushBuffers() { + upload_buffer.Flush(); } -void TextureRuntime::OnSlotSwitch(u32 new_slot) { - staging_offsets[new_slot] = 0; +void TextureRuntime::Finish() { + renderpass_cache.ExitRenderpass(); + scheduler.Submit(SubmitMode::Flush); + download_buffer.Invalidate(); } ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format, @@ -112,6 +104,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma const FormatTraits traits = instance.GetTraits(format); const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format)); + // Depth buffers are not supposed to support blit by the spec so don't require it. const bool is_suitable = traits.transfer_support && traits.attachment_support && (traits.blit_support || aspect & vk::ImageAspectFlagBits::eDepth); const vk::Format vk_format = is_suitable ? traits.native : traits.fallback; @@ -416,12 +409,14 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest, .baseArrayLayer = blit.dst_layer, .layerCount = 1}, .dstOffsets = dest_offsets}; + // Don't use linear filtering on depth attachments - const vk::Filter filtering = - blit_area.srcSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth || - blit_area.dstSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth - ? vk::Filter::eNearest - : vk::Filter::eLinear; + const VideoCore::PixelFormat format = source.pixel_format; + const vk::Filter filtering = format == VideoCore::PixelFormat::D24S8 || + format == VideoCore::PixelFormat::D24 || + format == VideoCore::PixelFormat::D16 + ? vk::Filter::eNearest + : vk::Filter::eLinear; vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal, @@ -682,8 +677,7 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa InvalidateAllWatcher(); // Lock this data until the next scheduler switch - const u32 current_slot = scheduler.GetCurrentSlotIndex(); - runtime.staging_offsets[current_slot] += staging.size; + runtime.upload_buffer.Commit(staging.size); } MICROPROFILE_DEFINE(Vulkan_Download, "VulkanSurface", "Texture Download", MP_RGB(128, 192, 64)); @@ -724,8 +718,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi } // Lock this data until the next scheduler switch - const u32 current_slot = scheduler.GetCurrentSlotIndex(); - runtime.staging_offsets[current_slot] += staging.size; + runtime.download_buffer.Commit(staging.size); } u32 Surface::GetInternalBytesPerPixel() const { @@ -811,8 +804,7 @@ void Surface::DepthStencilDownload(const VideoCore::BufferTextureCopy& download, // For depth downloads create an R32UI surface and use a compute shader for convert. // Then we blit and download that surface. - // NOTE: We don't need to set pixel format here since R32Uint automatically gives us - // a storage view. Also the D24S8 creates a unique cache key for it + // NOTE: We keep the pixel format to D24S8 to avoid linear filtering during scale SurfaceParams r32_params = *this; r32_params.width = scaled_rect.GetWidth(); r32_params.stride = scaled_rect.GetWidth(); diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.h b/src/video_core/renderer_vulkan/vk_texture_runtime.h index 5335f302e..1e3689869 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.h +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.h @@ -104,6 +104,9 @@ public: VideoCore::TextureType type, vk::Format format, vk::ImageUsageFlags usage); + /// Flushes staging buffers + void FlushBuffers(); + /// Causes a GPU command flush void Finish(); @@ -138,9 +141,6 @@ public: /// Returns true if the provided pixel format needs convertion [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const; - /// Performs operations that need to be done on every scheduler slot switch - void OnSlotSwitch(u32 new_slot); - private: /// Returns the current Vulkan instance const Instance& GetInstance() const { @@ -157,9 +157,9 @@ private: TaskScheduler& scheduler; RenderpassCache& renderpass_cache; BlitHelper blit_helper; + StreamBuffer upload_buffer; + StreamBuffer download_buffer; std::array reinterpreters; - std::array, SCHEDULER_COMMAND_COUNT> staging_buffers; - std::array staging_offsets{}; std::unordered_multimap texture_recycler; std::unordered_map clear_framebuffers; };