renderer_vulkan: Improve StreamBuffer API and use it in TextureRuntime

* Also use separate upload and download buffers, optimized for write and readback access respectively. This gives a huge 20+ FPS boost in most games that were bottlenecked by slow readbacks.
Author: emufan4568
Date: 2022-10-18 22:17:30 +03:00
Committed by: GPUCode
Parent: a4dc6a55b7
Commit: 0a40a513a6
7 changed files with 142 additions and 109 deletions
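The readback speedup comes from giving downloads their own staging memory with cached, random-access host visibility, while uploads keep write-combined memory. Below is a minimal sketch of that allocation split using the Vulkan Memory Allocator; the helper name is illustrative and not part of this commit (the real change is the StagingBuffer constructor in the stream buffer diff further down).

```cpp
#include <vk_mem_alloc.h>

// Illustrative helper: choose allocation flags for a staging buffer depending on
// whether it is written by the CPU (uploads) or read back by the CPU (downloads).
VmaAllocationCreateInfo MakeStagingAllocInfo(bool readback) {
    // Downloads want cached memory with random host access; uploads only need
    // write-combined memory that is fast to write sequentially.
    const VmaAllocationCreateFlags access =
        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return VmaAllocationCreateInfo{
        .flags = access | VMA_ALLOCATION_CREATE_MAPPED_BIT,
        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST,
    };
}
```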

View File

@@ -23,7 +23,7 @@ inline T MakeInt(const std::byte* bytes) {
 }
 
 template <PixelFormat format, bool converted>
-inline void DecodePixel(const std::byte* source, std::byte* dest) {
+constexpr void DecodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -69,7 +69,7 @@ inline void DecodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <PixelFormat format>
-inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     const u32 morton_offset = VideoCore::MortonInterleave(x, y);
     const u8 value = static_cast<const u8>(source_tile[morton_offset >> 1]);
     const u8 pixel = Color::Convert4To8((morton_offset % 2) ? (value >> 4) : (value & 0xF));
@@ -84,7 +84,7 @@ inline void DecodePixel4(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
 }
 
 template <PixelFormat format>
-inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
+constexpr void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
     constexpr u32 subtile_width = 4;
     constexpr u32 subtile_height = 4;
     constexpr bool has_alpha = format == PixelFormat::ETC1A4;
@@ -114,7 +114,7 @@ inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byte* dest_pixel) {
 }
 
 template <PixelFormat format, bool converted>
-inline void EncodePixel(const std::byte* source, std::byte* dest) {
+constexpr void EncodePixel(const std::byte* source, std::byte* dest) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
 
     if constexpr (format == PixelFormat::D24S8) {
@@ -146,8 +146,8 @@ inline void EncodePixel(const std::byte* source, std::byte* dest) {
 }
 
 template <bool morton_to_linear, PixelFormat format, bool converted>
-inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
-                           std::span<std::byte> linear_buffer) {
+constexpr void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
+                              std::span<std::byte> linear_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 linear_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr bool is_compressed = format == PixelFormat::ETC1 || format == PixelFormat::ETC1A4;
@@ -200,8 +200,9 @@ inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer,
  * in the linear_buffer.
  */
 template <bool morton_to_linear, PixelFormat format, bool converted = false>
-static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
-                       std::span<std::byte> linear_buffer, std::span<std::byte> tiled_buffer) {
+static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset,
+                                 std::span<std::byte> linear_buffer,
+                                 std::span<std::byte> tiled_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 aligned_bytes_per_pixel = converted ? 4 : GetBytesPerPixel(format);
     constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8;

View File

@@ -948,22 +948,22 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
     surface->Download(download, staging);
 
-    download_queue.push_back(
-        [this, surface, flush_start, flush_end, flush_info, mapped = staging.mapped]() {
-            MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
-            if (!dest_ptr) [[unlikely]] {
-                return;
-            }
+    MemoryRef dest_ptr = VideoCore::g_memory->GetPhysicalRef(flush_start);
+    if (!dest_ptr) [[unlikely]] {
+        return;
+    }
 
-            const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
-            if (surface->is_tiled) {
-                SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
-                               runtime.NeedsConvertion(surface->pixel_format));
-            } else {
-                runtime.FormatConvert(*surface, false, mapped, download_dest);
-            }
-        });
+    const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
+    download_queue.push_back([this, surface, flush_start, flush_end, flush_info,
+                              mapped = staging.mapped, download_dest]() {
+        if (surface->is_tiled) {
+            SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest,
+                           runtime.NeedsConvertion(surface->pixel_format));
+        } else {
+            runtime.FormatConvert(*surface, false, mapped, download_dest);
+        }
+    });
 }
 
 template <class T>

View File

@@ -962,11 +962,10 @@ void RendererVulkan::SwapBuffers() {
 void RendererVulkan::FlushBuffers() {
     vertex_buffer.Flush();
     rasterizer->FlushBuffers();
-    renderpass_cache.ExitRenderpass();
+    runtime.FlushBuffers();
 }
 
 void RendererVulkan::OnSlotSwitch() {
-    runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex());
     renderpass_cache.OnSlotSwitch();
     rasterizer->pipeline_cache.MarkDirty();
 }

View File

@@ -40,14 +40,18 @@ inline auto ToVkAccessStageFlags(vk::BufferUsageFlagBits usage) {
     return result;
 }
 
-StagingBuffer::StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage)
+StagingBuffer::StagingBuffer(const Instance& instance, u32 size, bool readback)
     : instance{instance} {
+    const vk::BufferUsageFlags usage =
+        readback ? vk::BufferUsageFlagBits::eTransferDst : vk::BufferUsageFlagBits::eTransferSrc;
     const vk::BufferCreateInfo buffer_info = {.size = size, .usage = usage};
 
-    const VmaAllocationCreateInfo alloc_create_info = {
-        .flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
-                 VMA_ALLOCATION_CREATE_MAPPED_BIT,
-        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
+    const VmaAllocationCreateFlags flags =
+        readback ? VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
+                 : VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+    const VmaAllocationCreateInfo alloc_create_info = {
+        .flags = flags | VMA_ALLOCATION_CREATE_MAPPED_BIT,
+        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST};
 
     VkBuffer unsafe_buffer = VK_NULL_HANDLE;
     VkBufferCreateInfo unsafe_buffer_info = static_cast<VkBufferCreateInfo>(buffer_info);
@@ -66,9 +70,15 @@ StagingBuffer::~StagingBuffer() {
 }
 
 StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats)
+                           bool readback)
     : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
-      staging{instance, total_size, vk::BufferUsageFlagBits::eTransferSrc}, usage{usage} {
+      staging{instance, total_size, readback}, bucket_size{size} {}
+
+StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                           vk::BufferUsageFlagBits usage, std::span<const vk::Format> view_formats,
+                           bool readback)
+    : instance{instance}, scheduler{scheduler}, total_size{size * SCHEDULER_COMMAND_COUNT},
+      staging{instance, total_size, readback}, usage{usage}, bucket_size{size} {
 
     const vk::BufferCreateInfo buffer_info = {
         .size = total_size, .usage = usage | vk::BufferUsageFlagBits::eTransferDst};
@@ -97,7 +107,6 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
     }
 
     view_count = view_formats.size();
-    bucket_size = size;
 }
 
 StreamBuffer::~StreamBuffer() {
@@ -141,36 +150,60 @@ void StreamBuffer::Commit(u32 size) {
 void StreamBuffer::Flush() {
     const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
     const u32 flush_size = buckets[current_bucket].offset;
     ASSERT(flush_size <= bucket_size);
 
-    if (flush_size > 0) {
-        vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+    if (flush_size > 0) [[likely]] {
+        // Ensure all staging writes are visible to the host memory domain
         VmaAllocator allocator = instance.GetAllocator();
+        vmaFlushAllocation(allocator, staging.allocation, flush_start, flush_size);
 
-        const u32 flush_start = current_bucket * bucket_size;
-        const vk::BufferCopy copy_region = {
-            .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
+        // Make the data available to the GPU if possible
+        if (buffer) {
+            const vk::BufferCopy copy_region = {
+                .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size};
 
-        vmaFlushAllocation(allocator, allocation, flush_start, flush_size);
-        command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
+            vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
+            command_buffer.copyBuffer(staging.buffer, buffer, copy_region);
 
-        // Add pipeline barrier for the flushed region
-        auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
-        const vk::BufferMemoryBarrier buffer_barrier = {
-            .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
-            .dstAccessMask = access_mask,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = buffer,
-            .offset = flush_start,
-            .size = flush_size};
+            // Add pipeline barrier for the flushed region
+            auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
+            const vk::BufferMemoryBarrier buffer_barrier = {
+                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                .dstAccessMask = access_mask,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = buffer,
+                .offset = flush_start,
+                .size = flush_size};
 
-        command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
-                                       vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {});
+            command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
+                                           vk::DependencyFlagBits::eByRegion, {}, buffer_barrier,
+                                           {});
+        }
     }
 
-    // Reset the offset of the next bucket
+    SwitchBucket();
+}
+
+void StreamBuffer::Invalidate() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_start = current_bucket * bucket_size;
+    const u32 flush_size = buckets[current_bucket].offset;
+    ASSERT(flush_size <= bucket_size);
+
+    if (flush_size > 0) [[likely]] {
+        // Ensure the staging memory can be read by the host
+        VmaAllocator allocator = instance.GetAllocator();
+        vmaInvalidateAllocation(allocator, staging.allocation, flush_start, flush_size);
+    }
+
+    SwitchBucket();
+}
+
+void StreamBuffer::SwitchBucket() {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
     const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT;
     buckets[next_bucket].offset = 0;
     buckets[next_bucket].invalid = true;

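The Flush()/Invalidate() pair added above follows the standard rule for mapped, possibly non-coherent memory: flush after the CPU writes, invalidate before the CPU reads. A minimal sketch of that rule with VMA; the helper names below are illustrative, not part of this commit.

```cpp
#include <vk_mem_alloc.h>

// After the CPU has written a mapped range, flush it so the device sees the data.
void PublishCpuWrites(VmaAllocator allocator, VmaAllocation allocation,
                      VkDeviceSize offset, VkDeviceSize size) {
    vmaFlushAllocation(allocator, allocation, offset, size);
}

// Before the CPU reads a range the device has written, invalidate the cached copy.
void PrepareCpuReads(VmaAllocator allocator, VmaAllocation allocation,
                     VkDeviceSize offset, VkDeviceSize size) {
    vmaInvalidateAllocation(allocator, allocation, offset, size);
}
```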
View File

@@ -19,13 +19,8 @@ class TaskScheduler;
 constexpr u32 MAX_BUFFER_VIEWS = 3;
 
-struct LockedRegion {
-    u32 size = 0;
-    u64 fence_counter = 0;
-};
-
 struct StagingBuffer {
-    StagingBuffer(const Instance& instance, u32 size, vk::BufferUsageFlags usage);
+    StagingBuffer(const Instance& instance, u32 size, bool readback);
     ~StagingBuffer();
 
     const Instance& instance;
@@ -36,10 +31,19 @@ struct StagingBuffer {
 class StreamBuffer {
 public:
+    /// Staging only constructor
     StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
-                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
+                 bool readback = false);
+
+    /// Staging + GPU streaming constructor
+    StreamBuffer(const Instance& instance, TaskScheduler& scheduler, u32 size,
+                 vk::BufferUsageFlagBits usage, std::span<const vk::Format> views,
+                 bool readback = false);
     ~StreamBuffer();
 
+    StreamBuffer(const StreamBuffer&) = delete;
+    StreamBuffer& operator=(const StreamBuffer&) = delete;
+
+    /// Maps aligned staging memory of size bytes
     std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0);
 
     /// Commits size bytes from the currently mapped staging memory
@@ -48,24 +52,28 @@ public:
     /// Flushes staging memory to the GPU buffer
     void Flush();
 
-    /// Returns the Vulkan buffer handle
+    /// Invalidates staging memory for reading
+    void Invalidate();
+
+    /// Switches to the next available bucket
+    void SwitchBucket();
+
+    /// Returns the GPU buffer handle
     vk::Buffer GetHandle() const {
         return buffer;
     }
 
+    /// Returns the staging buffer handle
+    vk::Buffer GetStagingHandle() const {
+        return staging.buffer;
+    }
+
     /// Returns an immutable reference to the requested buffer view
     const vk::BufferView& GetView(u32 index = 0) const {
         ASSERT(index < view_count);
         return views[index];
     }
 
-private:
-    /// Invalidates the buffer offsets
-    void Invalidate();
-
-    /// Removes the lock on regions whose fence counter has been reached by the GPU
-    bool UnlockFreeRegions(u32 target_size);
-
 private:
     struct Bucket {
         bool invalid;

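For context, the intended call pattern of the reworked StreamBuffer API (as TextureRuntime uses it later in this commit) looks roughly like the sketch below. The surrounding function, the include path, and the assumption that the class lives in namespace Vulkan are illustrative, not taken from this diff.

```cpp
#include <cstdint>
#include <cstring>
#include <span>

#include "video_core/renderer_vulkan/vk_stream_buffer.h" // include path assumed

// Hypothetical helper showing the intended call pattern of the reworked API:
// Map/Commit for writing, then Flush (upload) or Invalidate (download).
void StreamBufferUsageSketch(Vulkan::StreamBuffer& upload_buffer,
                             Vulkan::StreamBuffer& download_buffer,
                             std::span<const std::byte> pixels) {
    // Upload path: map write-combined staging memory, fill it, commit the bytes.
    const auto size = static_cast<std::uint32_t>(pixels.size());
    auto [dst, offset, invalidate] = upload_buffer.Map(size, 4);
    if (invalidate) {
        // The bucket was recycled; any previously cached contents must be rewritten.
    }
    std::memcpy(dst, pixels.data(), pixels.size());
    upload_buffer.Commit(size);
    upload_buffer.Flush(); // flush host writes and copy to the GPU buffer if one exists

    // Download path: the GPU writes into the staging buffer (GetStagingHandle()),
    // then the host invalidates the mapped range before reading it back.
    download_buffer.Invalidate();
}
```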
View File

@@ -32,19 +32,14 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) {
     return vk::ImageAspectFlagBits::eColor;
 }
 
-constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
+constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
 
 TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler,
                                RenderpassCache& renderpass_cache)
-    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache}, blit_helper{
-          instance,
-          scheduler} {
-    for (auto& buffer : staging_buffers) {
-        buffer = std::make_unique<StagingBuffer>(instance, STAGING_BUFFER_SIZE,
-                                                 vk::BufferUsageFlagBits::eTransferSrc |
-                                                     vk::BufferUsageFlagBits::eTransferDst);
-    }
+    : instance{instance}, scheduler{scheduler}, renderpass_cache{renderpass_cache},
+      blit_helper{instance, scheduler}, upload_buffer{instance, scheduler, UPLOAD_BUFFER_SIZE},
+      download_buffer{instance, scheduler, DOWNLOAD_BUFFER_SIZE, true} {
 
     auto Register = [this](VideoCore::PixelFormat dest,
                            std::unique_ptr<FormatReinterpreterBase>&& obj) {
@@ -64,7 +59,9 @@ TextureRuntime::~TextureRuntime() {
     for (const auto& [key, alloc] : texture_recycler) {
         vmaDestroyImage(allocator, alloc.image, alloc.allocation);
         device.destroyImageView(alloc.image_view);
-        device.destroyImageView(alloc.base_view);
+        if (alloc.base_view) {
+            device.destroyImageView(alloc.base_view);
+        }
         if (alloc.depth_view) {
             device.destroyImageView(alloc.depth_view);
             device.destroyImageView(alloc.stencil_view);
@@ -82,29 +79,24 @@ TextureRuntime::~TextureRuntime() {
 }
 
 StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    u32& offset = staging_offsets[current_slot];
-
     // Depth uploads require 4 byte alignment, doesn't hurt to do it for everyone
-    offset = Common::AlignUp(offset, 4);
-
-    if (offset + size > STAGING_BUFFER_SIZE) {
-        LOG_CRITICAL(Render_Vulkan, "Staging buffer size exceeded!");
-        UNREACHABLE();
-    }
-
-    const auto& buffer = staging_buffers[current_slot];
-    return StagingData{.buffer = buffer->buffer,
+    auto& buffer = upload ? upload_buffer : download_buffer;
+    auto [data, offset, invalidate] = buffer.Map(size, 4);
+
+    return StagingData{.buffer = buffer.GetStagingHandle(),
                        .size = size,
-                       .mapped = buffer->mapped.subspan(offset, size),
+                       .mapped = std::span<std::byte>{reinterpret_cast<std::byte*>(data), size},
                        .buffer_offset = offset};
 }
 
-void TextureRuntime::Finish() {
-    scheduler.Submit(SubmitMode::Flush);
+void TextureRuntime::FlushBuffers() {
+    upload_buffer.Flush();
 }
 
-void TextureRuntime::OnSlotSwitch(u32 new_slot) {
-    staging_offsets[new_slot] = 0;
+void TextureRuntime::Finish() {
+    renderpass_cache.ExitRenderpass();
+    scheduler.Submit(SubmitMode::Flush);
+    download_buffer.Invalidate();
 }
 
 ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format,
@@ -112,6 +104,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelFormat format,
     const FormatTraits traits = instance.GetTraits(format);
     const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format));
 
+    // Depth buffers are not supposed to support blit by the spec so don't require it.
     const bool is_suitable = traits.transfer_support && traits.attachment_support &&
                              (traits.blit_support || aspect & vk::ImageAspectFlagBits::eDepth);
     const vk::Format vk_format = is_suitable ? traits.native : traits.fallback;
@@ -416,12 +409,14 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest,
                                           .baseArrayLayer = blit.dst_layer,
                                           .layerCount = 1},
                        .dstOffsets = dest_offsets};
 
     // Don't use linear filtering on depth attachments
-    const vk::Filter filtering =
-        blit_area.srcSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth ||
-                blit_area.dstSubresource.aspectMask & vk::ImageAspectFlagBits::eDepth
-            ? vk::Filter::eNearest
-            : vk::Filter::eLinear;
+    const VideoCore::PixelFormat format = source.pixel_format;
+    const vk::Filter filtering = format == VideoCore::PixelFormat::D24S8 ||
+                                 format == VideoCore::PixelFormat::D24 ||
+                                 format == VideoCore::PixelFormat::D16
+                                     ? vk::Filter::eNearest
+                                     : vk::Filter::eLinear;
 
     vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
     command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal,
@@ -682,8 +677,7 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingData& staging) {
     InvalidateAllWatcher();
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.upload_buffer.Commit(staging.size);
 }
 
 MICROPROFILE_DEFINE(Vulkan_Download, "VulkanSurface", "Texture Download", MP_RGB(128, 192, 64));
@@ -724,8 +718,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging) {
     }
 
     // Lock this data until the next scheduler switch
-    const u32 current_slot = scheduler.GetCurrentSlotIndex();
-    runtime.staging_offsets[current_slot] += staging.size;
+    runtime.download_buffer.Commit(staging.size);
 }
 
 u32 Surface::GetInternalBytesPerPixel() const {
@@ -811,8 +804,7 @@ void Surface::DepthStencilDownload(const VideoCore::BufferTextureCopy& download,
     // For depth downloads create an R32UI surface and use a compute shader for convert.
     // Then we blit and download that surface.
-    // NOTE: We don't need to set pixel format here since R32Uint automatically gives us
-    // a storage view. Also the D24S8 creates a unique cache key for it
+    // NOTE: We keep the pixel format to D24S8 to avoid linear filtering during scale
     SurfaceParams r32_params = *this;
     r32_params.width = scaled_rect.GetWidth();
     r32_params.stride = scaled_rect.GetWidth();

View File

@@ -104,6 +104,9 @@ public:
                        VideoCore::TextureType type, vk::Format format,
                        vk::ImageUsageFlags usage);
 
+    /// Flushes staging buffers
+    void FlushBuffers();
+
     /// Causes a GPU command flush
     void Finish();
@@ -138,9 +141,6 @@ public:
     /// Returns true if the provided pixel format needs convertion
     [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const;
 
-    /// Performs operations that need to be done on every scheduler slot switch
-    void OnSlotSwitch(u32 new_slot);
-
 private:
     /// Returns the current Vulkan instance
     const Instance& GetInstance() const {
@@ -157,9 +157,9 @@ private:
     TaskScheduler& scheduler;
     RenderpassCache& renderpass_cache;
     BlitHelper blit_helper;
+    StreamBuffer upload_buffer;
+    StreamBuffer download_buffer;
     std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
-    std::array<std::unique_ptr<StagingBuffer>, SCHEDULER_COMMAND_COUNT> staging_buffers;
-    std::array<u32, SCHEDULER_COMMAND_COUNT> staging_offsets{};
     std::unordered_multimap<HostTextureTag, ImageAlloc> texture_recycler;
     std::unordered_map<vk::ImageView, vk::Framebuffer> clear_framebuffers;
 };