renderer_vulkan: Improve StreamBuffer API and use it in TextureRuntime

* Also use separate upload and download buffers, optimized for write and readback respectively. This gives a huge 20+ FPS boost in most games that were bottlenecked by slow readbacks.
emufan4568
2022-10-18 22:17:30 +03:00
committed by GPUCode
parent a4dc6a55b7
commit 0a40a513a6
7 changed files with 142 additions and 109 deletions
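The heart of the change: instead of one shared staging pool, uploads and downloads now get dedicated staging buffers. Upload staging wants write-combined memory that the CPU can fill quickly, while download staging wants cached memory, because CPU reads from uncached, write-combined allocations are extremely slow; that is where the readback win comes from. A minimal sketch of how a readback flag can map to VMA allocation flags, mirroring the StagingBuffer change below (the helper name is invented for illustration):

```cpp
#include <vk_mem_alloc.h>

// Sketch: pick host-access flags per transfer direction. Under
// VMA_MEMORY_USAGE_AUTO_PREFER_HOST, SEQUENTIAL_WRITE steers VMA toward
// write-combined memory, while RANDOM permits cached memory that the CPU
// can read back at full speed.
VmaAllocationCreateInfo MakeStagingAllocInfo(bool readback) {
    const VmaAllocationCreateFlags access =
        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return VmaAllocationCreateInfo{
        .flags = access | VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST,
    };
}
```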

View File

@@ -23,7 +23,7 @@ inline T MakeInt(const std::byte* bytes) {
 }
 
 template <PixelFormat format, bool converted>
-inline void DecodePixel(const std::byte* source, std::byte* dest) {
+constexpr void DecodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -69,7 +69,7 @@ inline void DecodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <PixelFormat format>
-inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     const u32 morton_offset = VideoCore::MortonInterleave(x, y);
     const u8 value = static_cast<const u8>(source_tile[morton_offset >> 1]);
     const u8 pixel = Color::Convert4To8((morton_offset % 2) ? (value >> 4) : (value & 0xF));
@@ -84,7 +84,7 @@ inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte*
 }
 
 template <PixelFormat format>
-inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     constexpr u32 subtile_width = 4;
     constexpr u32 subtile_height = 4;
     constexpr bool has_alpha = format == PixelFormat::ETC1A4;
@@ -114,7 +114,7 @@ inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byt
 }
 
 template <PixelFormat format, bool converted>
-inline void EncodePixel(const std::byte* source, std::byte* dest) {
+constexpr void EncodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -146,8 +146,8 @@ inline void EncodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <bool morton_to_linear, PixelFormat format, bool converted>
-inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
-                           std::span<std::byte> linear_buffer) {
+constexpr void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
+                              std::span<std::byte> linear_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 linear_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr bool is_compressed = format == PixelFormat::ETC1 || format == PixelFormat::ETC1A4;
@@ -200,8 +200,9 @@ inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
  * in the linear_buffer.
  */
 template <bool morton_to_linear, PixelFormat format, bool converted = false>
-static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
-                       std::span<std::byte> linear_buffer, std::span<std::byte> tiled_buffer) {
+static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
+                                 std::span<std::byte> linear_buffer,
+                                 std::span<std::byte> tiled_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 aligned_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8;

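A note on the inline-to-constexpr swap above: for functions, constexpr already implies inline, so linkage does not change; the benefit is that these helpers become eligible for compile-time evaluation, and every per-pixel branch on the format template parameter folds away under if constexpr. A reduced, self-contained sketch of the pattern (the enum and GetFormatBpp here are stand-ins, not the real definitions):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

enum class PixelFormat { RGBA8, RGB8 };

// Stand-in for the real GetFormatBpp: bits per pixel of each format.
constexpr std::uint32_t GetFormatBpp(PixelFormat format) {
    return format == PixelFormat::RGBA8 ? 32 : 24;
}

// Because format is a template parameter, bytes_per_pixel is a constant
// and the if constexpr branch is selected entirely at compile time.
template <PixelFormat format>
constexpr void DecodePixel(const std::byte* source, std::byte* dest) {
    constexpr std::uint32_t bytes_per_pixel = GetFormatBpp(format) / 8;
    std::memcpy(dest, source, bytes_per_pixel);
    if constexpr (format == PixelFormat::RGB8) {
        dest[3] = std::byte{0xFF}; // expand to opaque RGBA8
    }
}
```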
View File

@@ -948,22 +948,22 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
     surface->Download(download, staging);
 
-    download_queue.push_back(
-        [this, surface, flush_start, flush_end, flush_info, mapped = staging.mapped]() {
-            MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
-            if (!dest_ptr) [[unlikely]] {
-                return;
-            }
+    MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
+    if (!dest_ptr) [[unlikely]] {
+        return;
+    }
 
-            const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
+    const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
 
-            if (surface->is_tiled) {
-                SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
-                               runtime.NeedsConvertion(surface->pixel_format));
-            } else {
-                runtime.FormatConvert(*surface, false, mapped, download_dest);
-            }
-        });
+    download_queue.push_back([this, surface, flush_start, flush_end, flush_info,
+                              mapped = staging.mapped, download_dest]() {
+        if (surface->is_tiled) {
+            SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
+                           runtime.NeedsConvertion(surface->pixel_format));
+        } else {
+            runtime.FormatConvert(*surface, false, mapped, download_dest);
+        }
+    });
 }
 
 template <class T>

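The RasterizerCache change above hoists the physical-memory lookup out of the deferred lambda: an unmapped address now bails out before anything is queued, and the callback captures the precomputed download_dest span by value instead of redoing the lookup when the queue is drained. The shape of the pattern, reduced to a standalone sketch (QueueDownload and its copy body are illustrative, not the actual cache code):

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <span>
#include <vector>

std::vector<std::function<void()>> download_queue;

// Resolve the destination eagerly; only the work that must wait for the
// GPU readback stays deferred, with both spans captured by value.
void QueueDownload(std::span<const std::byte> mapped, std::span<std::byte> dest) {
    if (dest.empty()) { // stand-in for the !dest_ptr early-out
        return;
    }
    download_queue.push_back(
        [mapped, dest] { std::copy(mapped.begin(), mapped.end(), dest.begin()); });
}
```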
View File

@@ -962,11 +962,10 @@ void RendererVulkan::SwapBuffers() {
 void RendererVulkan::FlushBuffers() {
     vertex_buffer.Flush();
     rasterizer->FlushBuffers();
-    renderpass_cache.ExitRenderpass();
+    runtime.FlushBuffers();
 }
 
 void RendererVulkan::OnSlotSwitch() {
-    runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex());
     renderpass_cache.OnSlotSwitch();
     rasterizer->pipeline_cache.MarkDirty();
 }

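With TextureRuntime owning the staging stream buffers, RendererVulkan::FlushBuffers only forwards to it, and the renderpass exit moves into TextureRuntime::Finish (visible further down). The per-submit ordering this implies, sketched as a hypothetical driver function (the real calls are made from the scheduler plumbing):

```cpp
// Hypothetical driver: publish CPU-side staging writes before the submit,
// then rotate per-slot state once the scheduler switches slots.
void SubmitFrame(RendererVulkan& renderer, TaskScheduler& scheduler) {
    renderer.FlushBuffers();             // vertex + texture staging -> GPU copies
    scheduler.Submit(SubmitMode::Flush); // submit the recorded command buffers
    renderer.OnSlotSwitch();             // renderpass cache reset, pipelines marked dirty
}
```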
View File

@@ -40,14 +40,18 @@ inline auto ToVkAccessStageFlags(vk::BufferUsageFlagBits usage) {
     return result;
 }
 
-StagingBuffer::StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage)
+StagingBuffer::StagingBuffer(const Instance& instance, u32 size, bool readback)
     : instance{instance} {
+    const vk::BufferUsageFlags usage =
+        readback ? vk::BufferUsageFlagBits::eTransferDst : vk::BufferUsageFlagBits::eTransferSrc;
     const vk::BufferCreateInfo buffer_info = {.size = size, .usage = usage};
-    const VmaAllocationCreateInfo alloc_create_info = {
-        .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
-                 VMA_ALLOCATION_CREATE_MAPPED_BIT,
-        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
+    const VmaAllocationCreateFlags flags =
+        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
+                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+    const VmaAllocationCreateInfo alloc_create_info = {.flags =
+                                                           flags | VMA_ALLOCATION_CREATE_MAPPED_BIT,
+                                                       .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
 
     VkBuffer unsafe_buffer = VK_NULL_HANDLE;
     VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
@@ -66,9 +70,15 @@ StagingBuffer::~StagingBuffer() {
 }
 
 StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats)
+                           bool readback)
     : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
-      staging{instance, total_size, vk::BufferUsageFlagBits::eTransferSrc}, usage{usage} {
+      staging{instance, total_size, readback}, bucket_size{size} {}
+
+StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats,
+                           bool readback)
+    : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
+      staging{instance, total_size, readback}, usage{usage}, bucket_size{size} {
     const vk::BufferCreateInfo buffer_info = {
         .size = total_size, .usage = usage | vk::BufferUsageFlagBits::eTransferDst};
@@ -97,7 +107,6 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u
     }
 
     view_count = view_formats.size();
-    bucket_size = size;
 }
 
 StreamBuffer::~StreamBuffer() {
@@ -141,36 +150,60 @@ void StreamBuffer::Commit(u32 size) {
 void StreamBuffer::Flush() {
     const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
     const u32 flush_size = buckets[current_bucket].offset;
     ASSERT(flush_size <= bucket_size);
 
-    if (flush_size > 0) {
-        vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+    if (flush_size > 0) [[likely]] {
+        // Ensure all staging writes are visible to the host memory domain
+        VmaAllocator allocator = instance.GetAllocator();
+        vmaFlushAllocation(allocator, staging.allocation, flush_start, flush_size);
 
-        const u32 flush_start = current_bucket * bucket_size;
-        const vk::BufferCopy copy_region = {
-            .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
+        // Make the data available to the GPU if possible
+        if (buffer) {
+            const vk::BufferCopy copy_region = {
+                .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
 
-        vmaFlushAllocation(allocator, allocation, flush_start, flush_size);
-        command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
+            vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+            command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
 
-        // Add pipeline barrier for the flushed region
-        auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
-        const vk::BufferMemoryBarrier buffer_barrier = {
-            .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
-            .dstAccessMask = access_mask,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = buffer,
-            .offset = flush_start,
-            .size = flush_size};
+            // Add pipeline barrier for the flushed region
+            auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
+            const vk::BufferMemoryBarrier buffer_barrier = {
+                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                .dstAccessMask = access_mask,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = buffer,
+                .offset = flush_start,
+                .size = flush_size};
 
-        command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
-                                       vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {});
+            command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
+                                           vk::DependencyFlagBits::eByRegion, {}, buffer_barrier,
+                                           {});
+        }
     }
 
+    // Reset the offset of the next bucket
+    SwitchBucket();
 }
+
+void StreamBuffer::Invalidate() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
+    const u32 flush_size = buckets[current_bucket].offset;
+    ASSERT(flush_size <= bucket_size);
+
+    if (flush_size > 0) [[likely]] {
+        // Ensure the staging memory can be read by the host
+        VmaAllocator allocator = instance.GetAllocator();
+        vmaInvalidateAllocation(allocator, staging.allocation, flush_start, flush_size);
+    }
+
+    SwitchBucket();
+}
+
+void StreamBuffer::SwitchBucket() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT;
+    buckets[next_bucket].offset = 0;
+    buckets[next_bucket].invalid = true;

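Flush and Invalidate are the two halves of non-coherent memory maintenance: vmaFlushAllocation publishes CPU writes to the device domain before the GPU copy, while vmaInvalidateAllocation pulls GPU writes back into the host caches before the CPU reads (both are no-ops on HOST_COHERENT memory, so calling them unconditionally is cheap insurance). A round-trip usage sketch against the new API; the helper is illustrative and the scheduler submit/wait between the two halves is elided:

```cpp
#include <cstring>

// Illustrative round trip through the two staging stream buffers.
void UploadThenReadBack(StreamBuffer& upload, StreamBuffer& download,
                        const std::byte* src, std::byte* dst, u32 size) {
    // Upload: write into write-combined staging memory, then Flush() at
    // submit time does the host flush, staging->GPU copy and barrier.
    const auto [in_ptr, in_offset, in_invalidated] = upload.Map(size, 4);
    std::memcpy(in_ptr, src, size);
    upload.Commit(size);
    upload.Flush();

    // Download: reserve readback space at out_offset, record the GPU copy
    // into GetStagingHandle(), submit and wait (elided), then Invalidate()
    // before touching the data from the CPU.
    const auto [out_ptr, out_offset, out_invalidated] = download.Map(size, 4);
    download.Commit(size);
    // ... vkCmdCopyImageToBuffer into the staging buffer at out_offset ...
    download.Invalidate();
    std::memcpy(dst, out_ptr, size);
}
```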
View File

@@ -19,13 +19,8 @@ class TaskScheduler;
 
 constexpr u32 MAX_BUFFER_VIEWS = 3;
 
-struct LockedRegion {
-    u32 size = 0;
-    u64 fence_counter = 0;
-};
-
 struct StagingBuffer {
-    StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage);
+    StagingBuffer(const Instance& instance, u32 size, bool readback);
     ~StagingBuffer();
 
     const Instance& instance;
@@ -36,10 +31,19 @@ struct StagingBuffer {
 
 class StreamBuffer {
 public:
+    /// Staging only constructor
     StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
+                 bool readback = false);
+
+    /// Staging + GPU streaming constructor
+    StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views,
+                 bool readback = false);
     ~StreamBuffer();
 
+    StreamBuffer(const StreamBuffer&) = delete;
+    StreamBuffer& operator=(const StreamBuffer&) = delete;
+
     /// Maps aligned staging memory of size bytes
    std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0);
 
     /// Commits size bytes from the currently mapped staging memory
@@ -48,24 +52,28 @@
     /// Flushes staging memory to the GPU buffer
     void Flush();
 
-    /// Returns the Vulkan buffer handle
+    /// Invalidates staging memory for reading
+    void Invalidate();
+
+    /// Switches to the next available bucket
+    void SwitchBucket();
+
+    /// Returns the GPU buffer handle
     vk::Buffer GetHandle() const {
         return buffer;
     }
 
+    /// Returns the staging buffer handle
+    vk::Buffer GetStagingHandle() const {
+        return staging.buffer;
+    }
+
     /// Returns an immutable reference to the requested buffer view
     const vk::BufferView& GetView(u32 index = 0) const {
         ASSERT(index < view_count);
         return views[index];
     }
 
-private:
-    /// Invalidates the buffer offsets
-    void Invalidate();
-
-    /// Removes the lock on regions whose fence counter has been reached by the GPU
-    bool UnlockFreeRegions(u32 target_size);
-
+private:
     struct Bucket {
         bool invalid;

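The constructor split makes the staging-only case explicit: TextureRuntime's upload and download buffers never need a device-local buffer or texel views, while vertex/uniform/texel streaming does. Construction sketches, assuming the Instance and TaskScheduler objects used elsewhere in the diff (sizes and the view format are placeholders):

```cpp
// Staging-only: host-visible memory, no GPU buffer is created.
StreamBuffer upload{instance, scheduler, 32 * 1024 * 1024};
StreamBuffer download{instance, scheduler, 32 * 1024 * 1024, /*readback=*/true};

// Staging + GPU streaming: also creates the device-local buffer that
// Flush() copies into, plus one buffer view per requested format.
const std::array view_formats{vk::Format::eR32Uint};
StreamBuffer texel{instance, scheduler, 4 * 1024 * 1024,
                   vk::BufferUsageFlagBits::eUniformTexelBuffer, view_formats};
```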
View File

@@ -32,19 +32,14 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) {
     return vk::ImageAspectFlagBits::eColor;
 }
 
-constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
+constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
 
 TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler,
                                RenderpassCache& renderpass_cache)
-    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, blit_helper{
-                                                                                        instance,
-                                                                                        scheduler} {
-    for (auto& buffer : staging_buffers) {
-        buffer = std::make_unique<StagingBuffer>(instance, STAGING_BUFFER_SIZE,
-                                                 vk::BufferUsageFlagBits::eTransferSrc |
-                                                     vk::BufferUsageFlagBits::eTransferDst);
-    }
+    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
+      blit_helper{instance, scheduler}, upload_buffer{instance, scheduler, UPLOAD_BUFFER_SIZE},
+      download_buffer{instance, scheduler, DOWNLOAD_BUFFER_SIZE, true} {
 
     auto Register = [this](VideoCore::PixelFormat dest,
                            std::unique_ptr<FormatReinterpreterBase>&& obj) {
@@ -64,7 +59,9 @@ TextureRuntime::~TextureRuntime() {
     for (const auto& [key, alloc] : texture_recycler) {
         vmaDestroyImage(allocator, alloc.image, alloc.allocation);
         device.destroyImageView(alloc.image_view);
-        device.destroyImageView(alloc.base_view);
+        if (alloc.base_view) {
+            device.destroyImageView(alloc.base_view);
+        }
         if (alloc.depth_view) {
             device.destroyImageView(alloc.depth_view);
             device.destroyImageView(alloc.stencil_view);
@@ -82,29 +79,24 @@ TextureRuntime::~TextureRuntime() {
 }
 
 StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    u32& offset = staging_offsets[current_slot];
-
-    // Depth uploads require 4 byte alignment, doesn't hurt to do it for everyone
-    offset = Common::AlignUp(offset, 4);
-
-    if (offset + size > STAGING_BUFFER_SIZE) {
-        LOG_CRITICAL(Render_Vulkan, "Staging buffer size exceeded!");
-        UNREACHABLE();
-    }
-
-    const auto& buffer = staging_buffers[current_slot];
-    return StagingData{.buffer = buffer->buffer,
+    auto& buffer = upload ? upload_buffer : download_buffer;
+    auto [data, offset, invalidate] = buffer.Map(size, 4);
+
+    return StagingData{.buffer = buffer.GetStagingHandle(),
                        .size = size,
-                       .mapped = buffer->mapped.subspan(offset, size),
+                       .mapped = std::span<std::byte>{reinterpret_cast<std::byte*>(data), size},
                        .buffer_offset = offset};
 }
 
-void TextureRuntime::Finish() {
-    scheduler.Submit(SubmitMode::Flush);
+void TextureRuntime::FlushBuffers() {
+    upload_buffer.Flush();
 }
 
-void TextureRuntime::OnSlotSwitch(u32 new_slot) {
-    staging_offsets[new_slot] = 0;
+void TextureRuntime::Finish() {
+    renderpass_cache.ExitRenderpass();
+    scheduler.Submit(SubmitMode::Flush);
+    download_buffer.Invalidate();
 }
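FindStaging is now a thin wrapper over StreamBuffer::Map, so the per-slot offset bookkeeping and the manual overflow check disappear from the runtime. A caller-side sketch of the upload path (the call-site names are illustrative; the matching Commit happens inside Surface::Upload, shown further down):

```cpp
// Illustrative upload call site: acquire staging memory, fill it with
// decoded texels on the CPU, then let the surface record the GPU copy.
StagingData staging = runtime.FindStaging(upload_size, /*upload=*/true);
std::memcpy(staging.mapped.data(), decoded_pixels, staging.size);
surface.Upload(buffer_copy, staging); // commits staging.size internally
```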
@@ -112,6 +104,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format,
     const FormatTraits traits = instance.GetTraits(format);
     const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format));
 
+    // Depth buffers are not supposed to support blit by the spec so don't require it.
     const bool is_suitable = traits.transfer_support && traits.attachment_support &&
                              (traits.blit_support || aspect & vk::ImageAspectFlagBits::eDepth);
     const vk::Format vk_format = is_suitable ? traits.native : traits.fallback;
@@ -416,12 +409,14 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest,
                                                      .baseArrayLayer = blit.dst_layer,
                                                      .layerCount = 1},
                                   .dstOffsets = dest_offsets};
 
     // Don't use linear filtering on depth attachments
-    const vk::Filter filtering =
-        blit_area.srcSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth ||
-                blit_area.dstSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth
-            ? vk::Filter::eNearest
-            : vk::Filter::eLinear;
+    const VideoCore::PixelFormat format = source.pixel_format;
+    const vk::Filter filtering = format == VideoCore::PixelFormat::D24S8 ||
+                                         format == VideoCore::PixelFormat::D24 ||
+                                         format == VideoCore::PixelFormat::D16
+                                     ? vk::Filter::eNearest
+                                     : vk::Filter::eLinear;
 
     vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
     command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal,
@@ -682,8 +677,7 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa
     InvalidateAllWatcher();
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.upload_buffer.Commit(staging.size);
 }
 
 MICROPROFILE_DEFINE(Vulkan_Download, "VulkanSurface", "Texture Download", MP_RGB(128, 192, 64));
@@ -724,8 +718,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
     }
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.download_buffer.Commit(staging.size);
 }
 
 u32 Surface::GetInternalBytesPerPixel() const {
@@ -811,8 +804,7 @@ void Surface::DepthStencilDownload(const VideoCore::BufferTextureCopy& download,
     // For depth downloads create an R32UI surface and use a compute shader for convert.
     // Then we blit and download that surface.
-    // NOTE: We don't need to set pixel format here since R32Uint automatically gives us
-    // a storage view. Also the D24S8 creates a unique cache key for it
+    // NOTE: We keep the pixel format to D24S8 to avoid linear filtering during scale
     SurfaceParams r32_params = *this;
     r32_params.width = scaled_rect.GetWidth();
     r32_params.stride = scaled_rect.GetWidth();

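On the blit filtering change: the Vulkan spec requires vkCmdBlitImage to use VK_FILTER_NEAREST whenever the images involved have a depth/stencil format, so deriving the filter from the pixel format (D16, D24, D24S8) is equivalent to the old aspect-mask test, and it also covers the depth-download path above, which now deliberately keeps its D24S8 format so the scale blit stays on nearest filtering. The rule as a small helper (the function name is illustrative):

```cpp
// Nearest filtering is mandatory for depth/stencil blits per the Vulkan spec.
constexpr vk::Filter ChooseBlitFilter(VideoCore::PixelFormat format) {
    switch (format) {
    case VideoCore::PixelFormat::D16:
    case VideoCore::PixelFormat::D24:
    case VideoCore::PixelFormat::D24S8:
        return vk::Filter::eNearest;
    default:
        return vk::Filter::eLinear;
    }
}
```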
View File

@@ -104,6 +104,9 @@ public:
                      VideoCore::TextureType type, vk::Format format,
                      vk::ImageUsageFlags usage);
 
+    /// Flushes staging buffers
+    void FlushBuffers();
+
     /// Causes a GPU command flush
     void Finish();
@@ -138,9 +141,6 @@ public:
     /// Returns true if the provided pixel format needs convertion
     [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const;
 
-    /// Performs operations that need to be done on every scheduler slot switch
-    void OnSlotSwitch(u32 new_slot);
-
 private:
     /// Returns the current Vulkan instance
     const Instance& GetInstance() const {
@@ -157,9 +157,9 @@ private:
     TaskScheduler& scheduler;
     RenderpassCache& renderpass_cache;
     BlitHelper blit_helper;
+    StreamBuffer upload_buffer;
+    StreamBuffer download_buffer;
     std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
-    std::array<std::unique_ptr<StagingBuffer>, SCHEDULER_COMMAND_COUNT> staging_buffers;
-    std::array<u32, SCHEDULER_COMMAND_COUNT> staging_offsets{};
     std::unordered_multimap<HostTextureTag, ImageAlloc> texture_recycler;
     std::unordered_map<vk::ImageView, vk::Framebuffer> clear_framebuffers;
 };