renderer_opengl: Rewrite stream buffer

* New implementation is based on Dolphin's MapAndSync and BufferStorage buffers.
  Replacing orphaning with synchronization, which should make it much faster than before.

* Texture downloads don't use PBO anymore since it didn't offer any speed benefits. Additionally,
  a bug was fixed that only affected the glBufferData fallback path, which should fix Android/GLES.
This commit is contained in:
GPUCode
2023-02-04 15:47:58 +02:00
parent 79a53c8003
commit b31b6e35a2
9 changed files with 202 additions and 182 deletions

View File

@ -174,15 +174,13 @@ private:
SurfaceMap dirty_regions;
SurfaceSet remove_surfaces;
u16 resolution_scale_factor;
std::vector<std::function<void()>> download_queue;
std::unordered_map<TextureCubeConfig, Surface> texture_cube_cache;
};
template <class T>
RasterizerCache<T>::RasterizerCache(Memory::MemorySystem& memory_, TextureRuntime& runtime_)
: memory{memory_}, runtime{runtime_} {
resolution_scale_factor = VideoCore::GetResolutionScaleFactor();
}
: memory{memory_}, runtime{runtime_}, resolution_scale_factor{
VideoCore::GetResolutionScaleFactor()} {}
template <class T>
template <MatchFlags find_flags>
@ -597,13 +595,15 @@ template <class T>
auto RasterizerCache<T>::GetTextureCube(const TextureCubeConfig& config) -> const Surface& {
auto [it, new_surface] = texture_cube_cache.try_emplace(config);
if (new_surface) {
SurfaceParams cube_params = {.addr = config.px,
SurfaceParams cube_params = {
.addr = config.px,
.width = config.width,
.height = config.width,
.stride = config.width,
.texture_type = TextureType::CubeMap,
.pixel_format = PixelFormatFromTextureFormat(config.format),
.type = SurfaceType::Texture};
.type = SurfaceType::Texture,
};
it->second = CreateSurface(cube_params);
}
@ -915,6 +915,7 @@ void RasterizerCache<T>::UploadSurface(const Surface& surface, SurfaceInterval i
const auto staging = runtime.FindStaging(
load_info.width * load_info.height * surface->GetInternalBytesPerPixel(), true);
MemoryRef source_ptr = memory.GetPhysicalRef(load_info.addr);
if (!source_ptr) [[unlikely]] {
return;
@ -924,11 +925,12 @@ void RasterizerCache<T>::UploadSurface(const Surface& surface, SurfaceInterval i
DecodeTexture(load_info, load_info.addr, load_info.end, upload_data, staging.mapped,
runtime.NeedsConvertion(surface->pixel_format));
const BufferTextureCopy upload = {.buffer_offset = 0,
const BufferTextureCopy upload = {
.buffer_offset = 0,
.buffer_size = staging.size,
.texture_rect = surface->GetSubRect(load_info),
.texture_level = 0};
.texture_level = 0,
};
surface->Upload(upload, staging);
}
@ -942,25 +944,25 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
const auto staging = runtime.FindStaging(
flush_info.width * flush_info.height * surface->GetInternalBytesPerPixel(), false);
const BufferTextureCopy download = {.buffer_offset = 0,
const BufferTextureCopy download = {
.buffer_offset = 0,
.buffer_size = staging.size,
.texture_rect = surface->GetSubRect(flush_info),
.texture_level = 0};
.texture_level = 0,
};
surface->Download(download, staging);
runtime.Finish();
MemoryRef dest_ptr = memory.GetPhysicalRef(flush_start);
if (!dest_ptr) [[unlikely]] {
return;
}
const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
download_queue.push_back([this, surface, flush_start, flush_end, flush_info,
mapped = staging.mapped, download_dest]() {
EncodeTexture(flush_info, flush_start, flush_end, mapped, download_dest,
EncodeTexture(flush_info, flush_start, flush_end, staging.mapped, download_dest,
runtime.NeedsConvertion(surface->pixel_format));
});
}
template <class T>
@ -1122,17 +1124,6 @@ void RasterizerCache<T>::FlushRegion(PAddr addr, u32 size, Surface flush_surface
flushed_intervals += interval;
}
// Batch execute all requested downloads. This gives more time for them to complete
// before we issue the CPU to GPU flush and reduces scheduler slot switches in Vulkan
if (!download_queue.empty()) {
runtime.Finish();
for (const auto& download_func : download_queue) {
download_func();
}
download_queue.clear();
}
// Reset dirty regions
dirty_regions -= flushed_intervals;
}

View File

@ -74,16 +74,16 @@ Driver::Driver(bool gles, bool enable_debug) : is_gles{gles} {
if (!gladLoadGL()) {
return;
}
#endif
/*
* Qualcomm has some spammy info messages that are marked as errors but not important
* https://developer.qualcomm.com/comment/11845
*/
if (!gles) {
if (enable_debug) {
glEnable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(DebugHandler, nullptr);
}
#endif
ReportDriverInfo();
DeduceVendor();

View File

@ -15,7 +15,6 @@
#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/pica_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/video_core.h"
namespace OpenGL {
@ -66,7 +65,7 @@ RasterizerOpenGL::RasterizerOpenGL(Memory::MemorySystem& memory_, Frontend::EmuW
// Set vertex attributes for software shader path
state.draw.vertex_array = sw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
glVertexAttribPointer(ATTRIBUTE_POSITION, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex),
@ -111,16 +110,16 @@ RasterizerOpenGL::RasterizerOpenGL(Memory::MemorySystem& memory_, Frontend::EmuW
state.texture_buffer_lut_rgba.texture_buffer = texture_buffer_lut_rgba.handle;
state.Apply();
glActiveTexture(TextureUnits::TextureBufferLUT_LF.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_lf_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_lf_buffer.Handle());
glActiveTexture(TextureUnits::TextureBufferLUT_RG.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_buffer.Handle());
glActiveTexture(TextureUnits::TextureBufferLUT_RGBA.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, texture_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, texture_buffer.Handle());
// Bind index buffer for hardware shader path
state.draw.vertex_array = hw_vao.handle;
state.Apply();
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.GetHandle());
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.Handle());
glEnable(GL_BLEND);
@ -166,7 +165,7 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset,
PAddr base_address = vertex_attributes.GetPhysicalBaseAddress();
state.draw.vertex_array = hw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
std::array<bool, 16> enable_attributes{};
@ -305,7 +304,7 @@ bool RasterizerOpenGL::AccelerateDrawBatchInternal(bool is_indexed) {
return false;
}
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
u8* buffer_ptr;
@ -625,7 +624,7 @@ bool RasterizerOpenGL::Draw(bool accelerate, bool is_indexed) {
succeeded = AccelerateDrawBatchInternal(is_indexed);
} else {
state.draw.vertex_array = sw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
shader_program_manager.UseTrivialVertexShader();
shader_program_manager.UseTrivialGeometryShader();
shader_program_manager.ApplyTo(state);
@ -1184,7 +1183,7 @@ void RasterizerOpenGL::SyncAndUploadLUTsLF() {
GLintptr offset;
bool invalidate;
std::size_t bytes_used = 0;
glBindBuffer(GL_TEXTURE_BUFFER, texture_lf_buffer.GetHandle());
glBindBuffer(GL_TEXTURE_BUFFER, texture_lf_buffer.Handle());
std::tie(buffer, offset, invalidate) = texture_lf_buffer.Map(max_size, sizeof(Common::Vec4f));
// Sync the lighting luts
@ -1254,7 +1253,7 @@ void RasterizerOpenGL::SyncAndUploadLUTs() {
GLintptr offset;
bool invalidate;
std::size_t bytes_used = 0;
glBindBuffer(GL_TEXTURE_BUFFER, texture_buffer.GetHandle());
glBindBuffer(GL_TEXTURE_BUFFER, texture_buffer.Handle());
std::tie(buffer, offset, invalidate) = texture_buffer.Map(max_size, sizeof(Common::Vec4f));
// helper function for SyncProcTexNoiseLUT/ColorMap/AlphaMap
@ -1349,7 +1348,7 @@ void RasterizerOpenGL::SyncAndUploadLUTs() {
void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
// glBindBufferRange below also changes the generic buffer binding point, so we sync the state
// first
state.draw.uniform_buffer = uniform_buffer.GetHandle();
state.draw.uniform_buffer = uniform_buffer.Handle();
state.Apply();
bool sync_vs = accelerate_draw;
@ -1371,7 +1370,7 @@ void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
vs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.vs, Pica::g_state.vs);
std::memcpy(uniforms + used_bytes, &vs_uniforms, sizeof(vs_uniforms));
glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(Pica::Shader::UniformBindings::VS),
uniform_buffer.GetHandle(), offset + used_bytes, sizeof(vs_uniforms));
uniform_buffer.Handle(), offset + used_bytes, sizeof(vs_uniforms));
used_bytes += uniform_size_aligned_vs;
}
@ -1380,7 +1379,7 @@ void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
sizeof(Pica::Shader::UniformData));
glBindBufferRange(
GL_UNIFORM_BUFFER, static_cast<GLuint>(Pica::Shader::UniformBindings::Common),
uniform_buffer.GetHandle(), offset + used_bytes, sizeof(Pica::Shader::UniformData));
uniform_buffer.Handle(), offset + used_bytes, sizeof(Pica::Shader::UniformData));
uniform_block_data.dirty = false;
used_bytes += uniform_size_aligned_fs;
}

View File

@ -151,11 +151,11 @@ private:
OGLTexture default_texture;
std::array<SamplerInfo, 3> texture_samplers;
OGLStreamBuffer vertex_buffer;
OGLStreamBuffer uniform_buffer;
OGLStreamBuffer index_buffer;
OGLStreamBuffer texture_buffer;
OGLStreamBuffer texture_lf_buffer;
StreamBuffer vertex_buffer;
StreamBuffer uniform_buffer;
StreamBuffer index_buffer;
StreamBuffer texture_buffer;
StreamBuffer texture_lf_buffer;
OGLFramebuffer framebuffer;
GLint uniform_buffer_alignment;
std::size_t uniform_size_aligned_vs;

View File

@ -2,15 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <glad/glad.h>
#include "common/common_funcs.h"
#include "common/logging/log.h"
#include "common/common_types.h"
#include "video_core/renderer_opengl/gl_state.h"
#include "video_core/renderer_opengl/gl_vars.h"
namespace OpenGL {
OpenGLState OpenGLState::cur_state;
OpenGLState OpenGLState::cur_state{};
OpenGLState::OpenGLState() {
// These all match default OpenGL values

View File

@ -4,96 +4,107 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
MP_RGB(128, 128, 192));
namespace OpenGL {
OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback_,
bool prefer_coherent)
: gl_target(target), readback(readback_), buffer_size(size) {
StreamBuffer::StreamBuffer(GLenum target, size_t size_)
: gl_target{target}, buffer_size{size_}, slot_size{buffer_size / SYNC_POINTS},
buffer_storage{bool(GLAD_GL_ARB_buffer_storage)} {
for (int i = 0; i < SYNC_POINTS; i++) {
fences[i].Create();
}
gl_buffer.Create();
glBindBuffer(gl_target, gl_buffer.handle);
if (GLAD_GL_ARB_buffer_storage) {
persistent = true;
coherent = prefer_coherent;
GLbitfield flags = (readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT) | GL_MAP_PERSISTENT_BIT |
(coherent ? GL_MAP_COHERENT_BIT : 0);
glBufferStorage(gl_target, size, nullptr, flags);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, 0, buffer_size,
flags | (!coherent && !readback ? GL_MAP_FLUSH_EXPLICIT_BIT : 0)));
if (buffer_storage) {
glBufferStorage(gl_target, buffer_size, nullptr,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
mapped_ptr =
(u8*)glMapBufferRange(gl_target, 0, buffer_size,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
} else {
glBufferData(gl_target, size, nullptr, GL_STREAM_DRAW);
glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW);
}
}
OGLStreamBuffer::~OGLStreamBuffer() {
if (persistent) {
StreamBuffer::~StreamBuffer() {
if (buffer_storage) {
glBindBuffer(gl_target, gl_buffer.handle);
glUnmapBuffer(gl_target);
}
gl_buffer.Release();
}
GLuint OGLStreamBuffer::GetHandle() const {
return gl_buffer.handle;
}
GLsizeiptr OGLStreamBuffer::GetSize() const {
return buffer_size;
}
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
ASSERT(size <= buffer_size);
ASSERT(alignment <= buffer_size);
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
mapped_size = size;
if (alignment > 0) {
buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
iterator = Common::AlignUp(iterator, alignment);
}
// Insert waiting slots for used memory
for (u32 i = Slot(used_iterator); i < Slot(iterator); i++) {
fences[i].Create();
}
used_iterator = iterator;
// Wait for new slots to end of buffer
for (u32 i = Slot(free_iterator) + 1; i <= Slot(iterator + size) && i < SYNC_POINTS; i++) {
glClientWaitSync(fences[i].handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
fences[i].Release();
}
// If we allocate a large amount of memory (A), commit a smaller amount, then allocate memory
// smaller than allocation A, we will have already waited for these fences in A, but not used
// the space. In this case, don't set m_free_iterator to a position before that which we know
// is safe to use, which would result in waiting on the same fence(s) next time.
if ((iterator + size) > free_iterator) {
free_iterator = iterator + size;
}
// If buffer is full
bool invalidate = false;
if (buffer_pos + size > buffer_size) {
buffer_pos = 0;
if (iterator + size >= buffer_size) {
invalidate = true;
if (persistent) {
glUnmapBuffer(gl_target);
}
// Insert waiting slots in unused space at the end of the buffer
for (int i = Slot(used_iterator); i < SYNC_POINTS; i++) {
fences[i].Create();
}
if (invalidate || !persistent) {
MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
GLbitfield flags = (readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT) |
(persistent ? GL_MAP_PERSISTENT_BIT : 0) |
(coherent ? GL_MAP_COHERENT_BIT : 0) |
(!coherent && !readback ? GL_MAP_FLUSH_EXPLICIT_BIT : 0) |
(invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
mapped_offset = buffer_pos;
// Move to the start
used_iterator = iterator = 0; // offset 0 is always aligned
// Wait for space at the start
for (int i = 0; i <= Slot(iterator + size); i++) {
glClientWaitSync(fences[i].handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
fences[i].Release();
}
free_iterator = iterator + size;
}
return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
u8* pointer{};
if (buffer_storage) {
pointer = mapped_ptr + iterator;
} else {
pointer = (u8*)glMapBufferRange(gl_target, iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
GL_MAP_UNSYNCHRONIZED_BIT);
}
return std::make_tuple(pointer, iterator, invalidate);
}
void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size);
void StreamBuffer::Unmap(u64 used_size) {
ASSERT_MSG(used_size <= mapped_size, "Reserved size {} is too small compared to {}",
mapped_size, used_size);
if (!coherent && !readback && size > 0) {
glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
}
if (!persistent) {
if (!buffer_storage) {
glFlushMappedBufferRange(gl_target, 0, used_size);
glUnmapBuffer(gl_target);
}
buffer_pos += size;
iterator += used_size;
}
} // namespace OpenGL

View File

@ -8,40 +8,50 @@
namespace OpenGL {
class OGLStreamBuffer : private NonCopyable {
class StreamBuffer {
static constexpr std::size_t SYNC_POINTS = 16;
public:
explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback = false,
bool prefer_coherent = false);
~OGLStreamBuffer();
StreamBuffer(GLenum target, size_t size);
~StreamBuffer();
GLuint GetHandle() const;
GLsizeiptr GetSize() const;
[[nodiscard]] GLuint Handle() const noexcept {
return gl_buffer.handle;
}
/*
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
* and the optional alignment requirement.
* If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
* The return values are the pointer to the new chunk, the offset within the buffer,
* and the invalidation flag for previous chunks.
* The actual used size must be specified on unmapping the chunk.
[[nodiscard]] size_t Size() const noexcept {
return buffer_size;
}
/* This mapping function will return a pair of:
* - the pointer to the mapped buffer
* - the offset into the real GPU buffer (always multiple of stride)
* On mapping, the maximum of size for allocation has to be set.
* The size really pushed into this fifo only has to be known on Unmapping.
* Mapping invalidates the current buffer content,
* so it isn't allowed to access the old content any more.
*/
std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0);
void Unmap(GLsizeiptr size);
std::tuple<u8*, u64, bool> Map(u64 size, u64 alignment = 0);
void Unmap(u64 used_size);
private:
OGLBuffer gl_buffer;
[[nodiscard]] u64 Slot(u64 offset) noexcept {
return offset / slot_size;
}
GLenum gl_target;
size_t buffer_size;
size_t slot_size;
bool buffer_storage{};
u8* mapped_ptr{};
u64 mapped_size;
bool readback = false;
bool coherent = false;
bool persistent = false;
u64 iterator = 0;
u64 used_iterator = 0;
u64 free_iterator = 0;
GLintptr buffer_pos = 0;
GLsizeiptr buffer_size = 0;
GLintptr mapped_offset = 0;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
OGLBuffer gl_buffer;
std::array<OGLSync, SYNC_POINTS> fences{};
};
} // namespace OpenGL

View File

@ -53,16 +53,15 @@ static constexpr std::array COLOR_TUPLES_OES = {
return GL_COLOR_BUFFER_BIT;
}
constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr std::size_t UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr std::size_t DOWNLOAD_BUFFER_SIZE = 4 * 1024 * 1024;
TextureRuntime::TextureRuntime(Driver& driver)
: driver{driver}, filterer{Settings::values.texture_filter_name.GetValue(),
VideoCore::GetResolutionScaleFactor()},
upload_buffer{GL_PIXEL_UNPACK_BUFFER, UPLOAD_BUFFER_SIZE}, download_buffer{
GL_PIXEL_PACK_BUFFER,
DOWNLOAD_BUFFER_SIZE, true} {
upload_buffer{GL_PIXEL_UNPACK_BUFFER, UPLOAD_BUFFER_SIZE} {
download_buffer.resize(DOWNLOAD_BUFFER_SIZE);
read_fbo.Create();
draw_fbo.Create();
@ -77,13 +76,22 @@ TextureRuntime::TextureRuntime(Driver& driver)
}
StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
auto& buffer = upload ? upload_buffer : download_buffer;
auto [data, offset, invalidate] = buffer.Map(size, 4);
return StagingData{.buffer = buffer.GetHandle(),
if (!upload) {
ASSERT_MSG(download_buffer.size() <= size, "Download buffer to small");
return StagingData{
.size = size,
.mapped = std::span<u8>{data, size},
.buffer_offset = offset};
.mapped = std::span{download_buffer.data(), size},
.buffer_offset = 0,
};
}
auto [data, offset, invalidate] = upload_buffer.Map(size, 4);
return StagingData{
.buffer = upload_buffer.Handle(),
.size = size,
.mapped = std::span{data, size},
.buffer_offset = offset,
};
}
const FormatTuple& TextureRuntime::GetFormatTuple(VideoCore::PixelFormat pixel_format) {
@ -333,6 +341,9 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, staging.buffer);
// Unmap the buffer FindStaging mapped beforehand
runtime.upload_buffer.Unmap(staging.size);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture.handle);
@ -343,7 +354,6 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa
reinterpret_cast<void*>(staging.buffer_offset));
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
runtime.upload_buffer.Unmap(staging.size);
}
InvalidateAllWatcher();
@ -360,7 +370,6 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
SCOPE_EXIT({ prev_state.Apply(); });
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_PACK_BUFFER, staging.buffer);
const bool is_scaled = res_scale != 1;
if (is_scaled) {
@ -372,9 +381,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
const auto& tuple = runtime.GetFormatTuple(pixel_format);
glReadPixels(download.texture_rect.left, download.texture_rect.bottom,
download.texture_rect.GetWidth(), download.texture_rect.GetHeight(),
tuple.format, tuple.type, reinterpret_cast<void*>(staging.buffer_offset));
runtime.download_buffer.Unmap(staging.size);
tuple.format, tuple.type, staging.mapped.data());
}
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
@ -393,20 +400,24 @@ void Surface::ScaledUpload(const VideoCore::BufferTextureCopy& upload, const Sta
unscaled_params.res_scale = 1;
Surface unscaled_surface{unscaled_params, runtime};
const VideoCore::BufferTextureCopy unscaled_upload = {.buffer_offset = upload.buffer_offset,
const VideoCore::BufferTextureCopy unscaled_upload = {
.buffer_offset = upload.buffer_offset,
.buffer_size = upload.buffer_size,
.texture_rect = unscaled_rect};
.texture_rect = unscaled_rect,
};
unscaled_surface.Upload(unscaled_upload, staging);
const auto& filterer = runtime.GetFilterer();
if (!filterer.Filter(unscaled_surface.texture, unscaled_rect, texture, scaled_rect, type)) {
const VideoCore::TextureBlit blit = {.src_level = 0,
const VideoCore::TextureBlit blit = {
.src_level = 0,
.dst_level = upload.texture_level,
.src_layer = 0,
.dst_layer = 0,
.src_rect = unscaled_rect,
.dst_rect = scaled_rect};
.dst_rect = scaled_rect,
};
// If filtering fails, resort to normal blitting
runtime.BlitTextures(unscaled_surface, *this, blit);
@ -428,14 +439,15 @@ void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download,
unscaled_params.res_scale = 1;
Surface unscaled_surface{unscaled_params, runtime};
const VideoCore::TextureBlit blit = {.src_level = download.texture_level,
// Blit the scaled rectangle to the unscaled texture
const VideoCore::TextureBlit blit = {
.src_level = download.texture_level,
.dst_level = 0,
.src_layer = 0,
.dst_layer = 0,
.src_rect = scaled_rect,
.dst_rect = unscaled_rect};
// Blit the scaled rectangle to the unscaled texture
.dst_rect = unscaled_rect,
};
runtime.BlitTextures(*this, unscaled_surface, blit);
glActiveTexture(GL_TEXTURE0);
@ -446,13 +458,10 @@ void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download,
runtime.BindFramebuffer(GL_READ_FRAMEBUFFER, 0, GL_TEXTURE_2D, type,
unscaled_surface.texture);
glReadPixels(0, 0, rect_width, rect_height, tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
staging.mapped.data());
} else {
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, staging.mapped.data());
}
runtime.download_buffer.Unmap(staging.size);
}
} // namespace OpenGL

View File

@ -23,7 +23,7 @@ struct StagingData {
GLuint buffer;
u32 size = 0;
std::span<u8> mapped{};
GLintptr buffer_offset = 0;
u64 buffer_offset = 0;
};
class Driver;
@ -46,6 +46,7 @@ public:
/// Returns the OpenGL format tuple associated with the provided pixel format
const FormatTuple& GetFormatTuple(VideoCore::PixelFormat pixel_format);
/// Causes a GPU command flush
void Finish() const {}
/// Allocates an OpenGL texture with the specified dimensions and format
@ -92,7 +93,8 @@ private:
TextureFilterer filterer;
std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
std::unordered_multimap<VideoCore::HostTextureTag, OGLTexture> texture_recycler;
OGLStreamBuffer upload_buffer, download_buffer;
StreamBuffer upload_buffer;
std::vector<u8> download_buffer;
OGLFramebuffer read_fbo, draw_fbo;
};