renderer_vulkan: Improve StreamBuffer API and use it in TextureRuntime

* Also use separate upload and download buffers, optimized for write and readback access respectively. This gives a huge 20+ FPS boost in most games that were bottlenecked by slow readbacks.
Author: emufan4568
Date: 2022-10-18 22:17:30 +03:00
Committed by: GPUCode
Parent: a4dc6a55b7
Commit: 0a40a513a6
7 changed files with 142 additions and 109 deletions
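The readback speedup comes from giving downloads their own staging memory with cached, random-access host visibility, while uploads keep write-combined memory. Below is a minimal sketch of that allocation split using the Vulkan Memory Allocator; the helper name is illustrative and not part of this commit (the real change is the StagingBuffer constructor in the stream buffer diff further down).

```cpp
#include <vk_mem_alloc.h>

// Illustrative helper: choose allocation flags for a staging buffer depending on
// whether it is written by the CPU (uploads) or read back by the CPU (downloads).
VmaAllocationCreateInfo MakeStagingAllocInfo(bool readback) {
    // Downloads want cached memory with random host access; uploads only need
    // write-combined memory that is fast to write sequentially.
    const VmaAllocationCreateFlags access =
        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return VmaAllocationCreateInfo{
        .flags = access | VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST,
    };
}
```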

View File

@@ -23,7 +23,7 @@ inline T MakeInt(const std::byte* bytes) {
 }
 
 template <PixelFormat format, bool converted>
-inline void DecodePixel(const std::byte* source, std::byte* dest) {
+constexpr void DecodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -69,7 +69,7 @@ inline void DecodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <PixelFormat format>
-inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     const u32 morton_offset = VideoCore::MortonInterleave(x, y);
     const u8 value = static_cast<const u8>(source_tile[morton_offset >> 1]);
     const u8 pixel = Color::Convert4To8((morton_offset % 2) ? (value >> 4) : (value & 0xF));
@@ -84,7 +84,7 @@ inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
 }
 
 template <PixelFormat format>
-inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     constexpr u32 subtile_width = 4;
     constexpr u32 subtile_height = 4;
     constexpr bool has_alpha = format == PixelFormat::ETC1A4;
@@ -114,7 +114,7 @@ inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
 }
 
 template <PixelFormat format, bool converted>
-inline void EncodePixel(const std::byte* source, std::byte* dest) {
+constexpr void EncodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -146,8 +146,8 @@ inline void EncodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <bool morton_to_linear, PixelFormat format, bool converted>
-inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
-                           std::span<std::byte> linear_buffer) {
+constexpr void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
+                              std::span<std::byte> linear_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 linear_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr bool is_compressed = format == PixelFormat::ETC1 || format == PixelFormat::ETC1A4;
@@ -200,8 +200,9 @@ inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
  * in the linear_buffer.
  */
 template <bool morton_to_linear, PixelFormat format, bool converted = false>
-static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
-                       std::span<std::byte> linear_buffer, std::span<std::byte> tiled_buffer) {
+static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
+                                 std::span<std::byte> linear_buffer,
+                                 std::span<std::byte> tiled_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 aligned_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8;

View File

@@ -948,22 +948,22 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
     surface->Download(download, staging);
 
-    download_queue.push_back(
-        [this, surface, flush_start, flush_end, flush_info, mapped = staging.mapped]() {
-            MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
-            if (!dest_ptr) [[unlikely]] {
-                return;
-            }
+    MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
+    if (!dest_ptr) [[unlikely]] {
+        return;
+    }
 
-            const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
-            if (surface->is_tiled) {
-                SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
-                               runtime.NeedsConvertion(surface->pixel_format));
-            } else {
-                runtime.FormatConvert(*surface, false, mapped, download_dest);
-            }
-        });
+    const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
+    download_queue.push_back([this, surface, flush_start, flush_end, flush_info,
+                              mapped = staging.mapped, download_dest]() {
+        if (surface->is_tiled) {
+            SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
+                           runtime.NeedsConvertion(surface->pixel_format));
+        } else {
+            runtime.FormatConvert(*surface, false, mapped, download_dest);
+        }
+    });
 }
 
 template <class T>

View File

@@ -962,11 +962,10 @@ void RendererVulkan::SwapBuffers() {
 void RendererVulkan::FlushBuffers() {
     vertex_buffer.Flush();
     rasterizer->FlushBuffers();
-    renderpass_cache.ExitRenderpass();
+    runtime.FlushBuffers();
 }
 
 void RendererVulkan::OnSlotSwitch() {
-    runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex());
     renderpass_cache.OnSlotSwitch();
     rasterizer->pipeline_cache.MarkDirty();
 }

View File

@@ -40,14 +40,18 @@ inline auto ToVkAccessStageFlags(vk::BufferUsageFlagBits usage) {
     return result;
 }
 
-StagingBuffer::StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage)
+StagingBuffer::StagingBuffer(const Instance& instance, u32 size, bool readback)
     : instance{instance} {
+    const vk::BufferUsageFlags usage =
+        readback ? vk::BufferUsageFlagBits::eTransferDst : vk::BufferUsageFlagBits::eTransferSrc;
     const vk::BufferCreateInfo buffer_info = {.size = size, .usage = usage};
 
-    const VmaAllocationCreateInfo alloc_create_info = {
-        .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
-                 VMA_ALLOCATION_CREATE_MAPPED_BIT,
-        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
+    const VmaAllocationCreateFlags flags =
+        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
+                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+    const VmaAllocationCreateInfo alloc_create_info = {
+        .flags = flags | VMA_ALLOCATION_CREATE_MAPPED_BIT,
+        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
 
     VkBuffer unsafe_buffer = VK_NULL_HANDLE;
     VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
@@ -66,9 +70,15 @@ StagingBuffer::~StagingBuffer() {
 }
 
 StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats)
+                           bool readback)
     : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
-      staging{instance, total_size, vk::BufferUsageFlagBits::eTransferSrc}, usage{usage} {
+      staging{instance, total_size, readback}, bucket_size{size} {}
+
+StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats,
+                           bool readback)
+    : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
+      staging{instance, total_size, readback}, usage{usage}, bucket_size{size} {
 
     const vk::BufferCreateInfo buffer_info = {
         .size = total_size, .usage = usage | vk::BufferUsageFlagBits::eTransferDst};
@@ -97,7 +107,6 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
     }
 
     view_count = view_formats.size();
-    bucket_size = size;
 }
 
 StreamBuffer::~StreamBuffer() {
@@ -141,36 +150,60 @@ void StreamBuffer::Commit(u32 size) {
 void StreamBuffer::Flush() {
     const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
     const u32 flush_size = buckets[current_bucket].offset;
     ASSERT(flush_size <= bucket_size);
 
-    if (flush_size > 0) {
-        vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+    if (flush_size > 0) [[likely]] {
+        // Ensure all staging writes are visible to the host memory domain
         VmaAllocator allocator = instance.GetAllocator();
+        vmaFlushAllocation(allocator, staging.allocation, flush_start, flush_size);
 
-        const u32 flush_start = current_bucket * bucket_size;
-        const vk::BufferCopy copy_region = {
-            .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
+        // Make the data available to the GPU if possible
+        if (buffer) {
+            const vk::BufferCopy copy_region = {
+                .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
 
-        vmaFlushAllocation(allocator, allocation, flush_start, flush_size);
-        command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
+            vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+            command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
 
-        // Add pipeline barrier for the flushed region
-        auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
-        const vk::BufferMemoryBarrier buffer_barrier = {
-            .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
-            .dstAccessMask = access_mask,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = buffer,
-            .offset = flush_start,
-            .size = flush_size};
+            // Add pipeline barrier for the flushed region
+            auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
+            const vk::BufferMemoryBarrier buffer_barrier = {
+                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                .dstAccessMask = access_mask,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = buffer,
+                .offset = flush_start,
+                .size = flush_size};
 
-        command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
-                                       vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {});
+            command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
+                                           vk::DependencyFlagBits::eByRegion, {}, buffer_barrier,
+                                           {});
+        }
     }
 
-    // Reset the offset of the next bucket
+    SwitchBucket();
+}
+
+void StreamBuffer::Invalidate() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
+    const u32 flush_size = buckets[current_bucket].offset;
+    ASSERT(flush_size <= bucket_size);
+
+    if (flush_size > 0) [[likely]] {
+        // Ensure the staging memory can be read by the host
+        VmaAllocator allocator = instance.GetAllocator();
+        vmaInvalidateAllocation(allocator, staging.allocation, flush_start, flush_size);
+    }
+
+    SwitchBucket();
+}
+
+void StreamBuffer::SwitchBucket() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
     const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT;
     buckets[next_bucket].offset = 0;
     buckets[next_bucket].invalid = true;

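The Flush()/Invalidate() pair added above follows the standard rule for mapped, possibly non-coherent memory: flush after the CPU writes, invalidate before the CPU reads. A minimal sketch of that rule with VMA; the helper names below are illustrative, not part of this commit.

```cpp
#include <vk_mem_alloc.h>

// After the CPU has written a mapped range, flush it so the device sees the data.
void PublishCpuWrites(VmaAllocator allocator, VmaAllocation allocation,
                      VkDeviceSize offset, VkDeviceSize size) {
    vmaFlushAllocation(allocator, allocation, offset, size);
}

// Before the CPU reads a range the device has written, invalidate the cached copy.
void PrepareCpuReads(VmaAllocator allocator, VmaAllocation allocation,
                     VkDeviceSize offset, VkDeviceSize size) {
    vmaInvalidateAllocation(allocator, allocation, offset, size);
}
```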
View File

@@ -19,13 +19,8 @@ class TaskScheduler;
 constexpr u32 MAX_BUFFER_VIEWS = 3;
 
-struct LockedRegion {
-    u32 size = 0;
-    u64 fence_counter = 0;
-};
-
 struct StagingBuffer {
-    StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage);
+    StagingBuffer(const Instance& instance, u32 size, bool readback);
     ~StagingBuffer();
 
     const Instance& instance;
@@ -36,10 +31,19 @@ struct StagingBuffer {
 class StreamBuffer {
 public:
+    /// Staging only constructor
     StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
+                 bool readback = false);
+
+    /// Staging + GPU streaming constructor
+    StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views,
+                 bool readback = false);
     ~StreamBuffer();
 
+    StreamBuffer(const StreamBuffer&) = delete;
+    StreamBuffer& operator=(const StreamBuffer&) = delete;
+
+    /// Maps aligned staging memory of size bytes
     std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0);
 
     /// Commits size bytes from the currently mapped staging memory
@@ -48,24 +52,28 @@ public:
     /// Flushes staging memory to the GPU buffer
     void Flush();
 
-    /// Returns the Vulkan buffer handle
+    /// Invalidates staging memory for reading
+    void Invalidate();
+
+    /// Switches to the next available bucket
+    void SwitchBucket();
+
+    /// Returns the GPU buffer handle
     vk::Buffer GetHandle() const {
         return buffer;
     }
 
+    /// Returns the staging buffer handle
+    vk::Buffer GetStagingHandle() const {
+        return staging.buffer;
+    }
+
     /// Returns an immutable reference to the requested buffer view
     const vk::BufferView& GetView(u32 index = 0) const {
         ASSERT(index < view_count);
         return views[index];
     }
 
-private:
-    /// Invalidates the buffer offsets
-    void Invalidate();
-
-    /// Removes the lock on regions whose fence counter has been reached by the GPU
-    bool UnlockFreeRegions(u32 target_size);
-
 private:
     struct Bucket {
         bool invalid;

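For context, the intended call pattern of the reworked StreamBuffer API (as TextureRuntime uses it later in this commit) looks roughly like the sketch below. The surrounding function, the include path, and the assumption that the class lives in namespace Vulkan are illustrative, not taken from this diff.

```cpp
#include <cstdint>
#include <cstring>
#include <span>

#include "video_core/renderer_vulkan/vk_stream_buffer.h" // include path assumed

// Hypothetical helper showing the intended call pattern of the reworked API:
// Map/Commit for writing, then Flush (upload) or Invalidate (download).
void StreamBufferUsageSketch(Vulkan::StreamBuffer& upload_buffer,
                             Vulkan::StreamBuffer& download_buffer,
                             std::span<const std::byte> pixels) {
    // Upload path: map write-combined staging memory, fill it, commit the bytes.
    const auto size = static_cast<std::uint32_t>(pixels.size());
    auto [dst, offset, invalidate] = upload_buffer.Map(size, 4);
    if (invalidate) {
        // The bucket was recycled; any previously cached contents must be rewritten.
    }
    std::memcpy(dst, pixels.data(), pixels.size());
    upload_buffer.Commit(size);
    upload_buffer.Flush(); // flush host writes and copy to the GPU buffer if one exists

    // Download path: the GPU writes into the staging buffer (GetStagingHandle()),
    // then the host invalidates the mapped range before reading it back.
    download_buffer.Invalidate();
}
```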
View File

@@ -32,19 +32,14 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) {
     return vk::ImageAspectFlagBits::eColor;
 }
 
-constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
+constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
 
 TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler,
                                RenderpassCache& renderpass_cache)
-    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, blit_helper{
-          instance,
-          scheduler} {
-    for (auto& buffer : staging_buffers) {
-        buffer = std::make_unique<StagingBuffer>(instance, STAGING_BUFFER_SIZE,
-                                                 vk::BufferUsageFlagBits::eTransferSrc |
-                                                     vk::BufferUsageFlagBits::eTransferDst);
-    }
+    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
+      blit_helper{instance, scheduler}, upload_buffer{instance, scheduler, UPLOAD_BUFFER_SIZE},
+      download_buffer{instance, scheduler, DOWNLOAD_BUFFER_SIZE, true} {
 
     auto Register = [this](VideoCore::PixelFormat dest,
                            std::unique_ptr<FormatReinterpreterBase>&& obj) {
@@ -64,7 +59,9 @@ TextureRuntime::~TextureRuntime() {
     for (const auto& [key, alloc] : texture_recycler) {
         vmaDestroyImage(allocator, alloc.image, alloc.allocation);
         device.destroyImageView(alloc.image_view);
-        device.destroyImageView(alloc.base_view);
+        if (alloc.base_view) {
+            device.destroyImageView(alloc.base_view);
+        }
         if (alloc.depth_view) {
             device.destroyImageView(alloc.depth_view);
             device.destroyImageView(alloc.stencil_view);
@@ -82,29 +79,24 @@ TextureRuntime::~TextureRuntime() {
 }
 
 StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    u32& offset = staging_offsets[current_slot];
-
     // Depth uploads require 4 byte alignment, doesn't hurt to do it for everyone
-    offset = Common::AlignUp(offset, 4);
-
-    if (offset + size > STAGING_BUFFER_SIZE) {
-        LOG_CRITICAL(Render_Vulkan, "Staging buffer size exceeded!");
-        UNREACHABLE();
-    }
-
-    const auto& buffer = staging_buffers[current_slot];
-    return StagingData{.buffer = buffer->buffer,
+    auto& buffer = upload ? upload_buffer : download_buffer;
+    auto [data, offset, invalidate] = buffer.Map(size, 4);
+
+    return StagingData{.buffer = buffer.GetStagingHandle(),
                        .size = size,
-                       .mapped = buffer->mapped.subspan(offset, size),
+                       .mapped = std::span<std::byte>{reinterpret_cast<std::byte*>(data), size},
                        .buffer_offset = offset};
 }
 
-void TextureRuntime::Finish() {
-    scheduler.Submit(SubmitMode::Flush);
+void TextureRuntime::FlushBuffers() {
+    upload_buffer.Flush();
 }
 
-void TextureRuntime::OnSlotSwitch(u32 new_slot) {
-    staging_offsets[new_slot] = 0;
+void TextureRuntime::Finish() {
+    renderpass_cache.ExitRenderpass();
+    scheduler.Submit(SubmitMode::Flush);
+    download_buffer.Invalidate();
 }
 
 ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format,
@@ -112,6 +104,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format,
     const FormatTraits traits = instance.GetTraits(format);
     const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format));
 
+    // Depth buffers are not supposed to support blit by the spec so don't require it.
     const bool is_suitable = traits.transfer_support && traits.attachment_support &&
                              (traits.blit_support || aspect & vk::ImageAspectFlagBits::eDepth);
     const vk::Format vk_format = is_suitable ? traits.native : traits.fallback;
@@ -416,12 +409,14 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest,
                                           .baseArrayLayer = blit.dst_layer,
                                           .layerCount = 1},
                        .dstOffsets = dest_offsets};
 
     // Don't use linear filtering on depth attachments
-    const vk::Filter filtering =
-        blit_area.srcSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth ||
-                blit_area.dstSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth
-            ? vk::Filter::eNearest
-            : vk::Filter::eLinear;
+    const VideoCore::PixelFormat format = source.pixel_format;
+    const vk::Filter filtering = format == VideoCore::PixelFormat::D24S8 ||
+                                 format == VideoCore::PixelFormat::D24 ||
+                                 format == VideoCore::PixelFormat::D16
+                                     ? vk::Filter::eNearest
+                                     : vk::Filter::eLinear;
 
     vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
     command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal,
@@ -682,8 +677,7 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingData& staging) {
     InvalidateAllWatcher();
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.upload_buffer.Commit(staging.size);
 }
 
 MICROPROFILE_DEFINE(Vulkan_Download, "VulkanSurface", "Texture Download", MP_RGB(128, 192, 64));
@@ -724,8 +718,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging) {
     }
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.download_buffer.Commit(staging.size);
 }
 
 u32 Surface::GetInternalBytesPerPixel() const {
@@ -811,8 +804,7 @@ void Surface::DepthStencilDownload(const VideoCore::BufferTextureCopy& download,
     // For depth downloads create an R32UI surface and use a compute shader for convert.
     // Then we blit and download that surface.
-    // NOTE: We don't need to set pixel format here since R32Uint automatically gives us
-    // a storage view. Also the D24S8 creates a unique cache key for it
+    // NOTE: We keep the pixel format to D24S8 to avoid linear filtering during scale
     SurfaceParams r32_params = *this;
     r32_params.width = scaled_rect.GetWidth();
     r32_params.stride = scaled_rect.GetWidth();

View File

@@ -104,6 +104,9 @@ public:
                        VideoCore::TextureType type, vk::Format format,
                        vk::ImageUsageFlags usage);
 
+    /// Flushes staging buffers
+    void FlushBuffers();
+
     /// Causes a GPU command flush
     void Finish();
@@ -138,9 +141,6 @@ public:
     /// Returns true if the provided pixel format needs convertion
     [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const;
 
-    /// Performs operations that need to be done on every scheduler slot switch
-    void OnSlotSwitch(u32 new_slot);
-
 private:
     /// Returns the current Vulkan instance
     const Instance& GetInstance() const {
@@ -157,9 +157,9 @@ private:
     TaskScheduler& scheduler;
     RenderpassCache& renderpass_cache;
     BlitHelper blit_helper;
+    StreamBuffer upload_buffer;
+    StreamBuffer download_buffer;
     std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
-    std::array<std::unique_ptr<StagingBuffer>, SCHEDULER_COMMAND_COUNT> staging_buffers;
-    std::array<u32, SCHEDULER_COMMAND_COUNT> staging_offsets{};
     std::unordered_multimap<HostTextureTag, ImageAlloc> texture_recycler;
     std::unordered_map<vk::ImageView, vk::Framebuffer> clear_framebuffers;
 };