renderer_opengl: Rewrite stream buffer

* New implementation is based on Dolphin's MapAndSync and BufferStorage buffers.
  Replacing orphaning with synchronization, which should make it much faster than before.

* Texture downloads don't use PBO anymore since it didn't offer any speed benefits. Additionally,
  a bug was fixed that only affected the glBufferData fallback path, which should fix Android/GLES.
This commit is contained in:
GPUCode
2023-02-04 15:47:58 +02:00
parent 79a53c8003
commit b31b6e35a2
9 changed files with 202 additions and 182 deletions

View File

@ -174,15 +174,13 @@ private:
SurfaceMap dirty_regions;
SurfaceSet remove_surfaces;
u16 resolution_scale_factor;
std::vector<std::function<void()>> download_queue;
std::unordered_map<TextureCubeConfig, Surface> texture_cube_cache;
};
template <class T>
RasterizerCache<T>::RasterizerCache(Memory::MemorySystem& memory_, TextureRuntime& runtime_)
: memory{memory_}, runtime{runtime_} {
resolution_scale_factor = VideoCore::GetResolutionScaleFactor();
}
: memory{memory_}, runtime{runtime_}, resolution_scale_factor{
VideoCore::GetResolutionScaleFactor()} {}
template <class T>
template <MatchFlags find_flags>
@ -597,13 +595,15 @@ template <class T>
auto RasterizerCache<T>::GetTextureCube(const TextureCubeConfig& config) -> const Surface& {
auto [it, new_surface] = texture_cube_cache.try_emplace(config);
if (new_surface) {
SurfaceParams cube_params = {.addr = config.px,
SurfaceParams cube_params = {
.addr = config.px,
.width = config.width,
.height = config.width,
.stride = config.width,
.texture_type = TextureType::CubeMap,
.pixel_format = PixelFormatFromTextureFormat(config.format),
.type = SurfaceType::Texture};
.type = SurfaceType::Texture,
};
it->second = CreateSurface(cube_params);
}
@ -915,6 +915,7 @@ void RasterizerCache<T>::UploadSurface(const Surface& surface, SurfaceInterval i
const auto staging = runtime.FindStaging(
load_info.width * load_info.height * surface->GetInternalBytesPerPixel(), true);
MemoryRef source_ptr = memory.GetPhysicalRef(load_info.addr);
if (!source_ptr) [[unlikely]] {
return;
@ -924,11 +925,12 @@ void RasterizerCache<T>::UploadSurface(const Surface& surface, SurfaceInterval i
DecodeTexture(load_info, load_info.addr, load_info.end, upload_data, staging.mapped,
runtime.NeedsConvertion(surface->pixel_format));
const BufferTextureCopy upload = {.buffer_offset = 0,
const BufferTextureCopy upload = {
.buffer_offset = 0,
.buffer_size = staging.size,
.texture_rect = surface->GetSubRect(load_info),
.texture_level = 0};
.texture_level = 0,
};
surface->Upload(upload, staging);
}
@ -942,25 +944,25 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
const auto staging = runtime.FindStaging(
flush_info.width * flush_info.height * surface->GetInternalBytesPerPixel(), false);
const BufferTextureCopy download = {.buffer_offset = 0,
const BufferTextureCopy download = {
.buffer_offset = 0,
.buffer_size = staging.size,
.texture_rect = surface->GetSubRect(flush_info),
.texture_level = 0};
.texture_level = 0,
};
surface->Download(download, staging);
runtime.Finish();
MemoryRef dest_ptr = memory.GetPhysicalRef(flush_start);
if (!dest_ptr) [[unlikely]] {
return;
}
const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start);
download_queue.push_back([this, surface, flush_start, flush_end, flush_info,
mapped = staging.mapped, download_dest]() {
EncodeTexture(flush_info, flush_start, flush_end, mapped, download_dest,
EncodeTexture(flush_info, flush_start, flush_end, staging.mapped, download_dest,
runtime.NeedsConvertion(surface->pixel_format));
});
}
template <class T>
@ -1122,17 +1124,6 @@ void RasterizerCache<T>::FlushRegion(PAddr addr, u32 size, Surface flush_surface
flushed_intervals += interval;
}
// Batch execute all requested downloads. This gives more time for them to complete
// before we issue the CPU to GPU flush and reduces scheduler slot switches in Vulkan
if (!download_queue.empty()) {
runtime.Finish();
for (const auto& download_func : download_queue) {
download_func();
}
download_queue.clear();
}
// Reset dirty regions
dirty_regions -= flushed_intervals;
}

View File

@ -74,16 +74,16 @@ Driver::Driver(bool gles, bool enable_debug) : is_gles{gles} {
if (!gladLoadGL()) {
return;
}
#endif
/*
* Qualcomm has some spammy info messages that are marked as errors but not important
* https://developer.qualcomm.com/comment/11845
*/
if (!gles) {
if (enable_debug) {
glEnable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(DebugHandler, nullptr);
}
#endif
ReportDriverInfo();
DeduceVendor();

View File

@ -15,7 +15,6 @@
#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/pica_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/video_core.h"
namespace OpenGL {
@ -66,7 +65,7 @@ RasterizerOpenGL::RasterizerOpenGL(Memory::MemorySystem& memory_, Frontend::EmuW
// Set vertex attributes for software shader path
state.draw.vertex_array = sw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
glVertexAttribPointer(ATTRIBUTE_POSITION, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex),
@ -111,16 +110,16 @@ RasterizerOpenGL::RasterizerOpenGL(Memory::MemorySystem& memory_, Frontend::EmuW
state.texture_buffer_lut_rgba.texture_buffer = texture_buffer_lut_rgba.handle;
state.Apply();
glActiveTexture(TextureUnits::TextureBufferLUT_LF.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_lf_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_lf_buffer.Handle());
glActiveTexture(TextureUnits::TextureBufferLUT_RG.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, texture_buffer.Handle());
glActiveTexture(TextureUnits::TextureBufferLUT_RGBA.Enum());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, texture_buffer.GetHandle());
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, texture_buffer.Handle());
// Bind index buffer for hardware shader path
state.draw.vertex_array = hw_vao.handle;
state.Apply();
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.GetHandle());
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.Handle());
glEnable(GL_BLEND);
@ -166,7 +165,7 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset,
PAddr base_address = vertex_attributes.GetPhysicalBaseAddress();
state.draw.vertex_array = hw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
std::array<bool, 16> enable_attributes{};
@ -305,7 +304,7 @@ bool RasterizerOpenGL::AccelerateDrawBatchInternal(bool is_indexed) {
return false;
}
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
state.Apply();
u8* buffer_ptr;
@ -625,7 +624,7 @@ bool RasterizerOpenGL::Draw(bool accelerate, bool is_indexed) {
succeeded = AccelerateDrawBatchInternal(is_indexed);
} else {
state.draw.vertex_array = sw_vao.handle;
state.draw.vertex_buffer = vertex_buffer.GetHandle();
state.draw.vertex_buffer = vertex_buffer.Handle();
shader_program_manager.UseTrivialVertexShader();
shader_program_manager.UseTrivialGeometryShader();
shader_program_manager.ApplyTo(state);
@ -1184,7 +1183,7 @@ void RasterizerOpenGL::SyncAndUploadLUTsLF() {
GLintptr offset;
bool invalidate;
std::size_t bytes_used = 0;
glBindBuffer(GL_TEXTURE_BUFFER, texture_lf_buffer.GetHandle());
glBindBuffer(GL_TEXTURE_BUFFER, texture_lf_buffer.Handle());
std::tie(buffer, offset, invalidate) = texture_lf_buffer.Map(max_size, sizeof(Common::Vec4f));
// Sync the lighting luts
@ -1254,7 +1253,7 @@ void RasterizerOpenGL::SyncAndUploadLUTs() {
GLintptr offset;
bool invalidate;
std::size_t bytes_used = 0;
glBindBuffer(GL_TEXTURE_BUFFER, texture_buffer.GetHandle());
glBindBuffer(GL_TEXTURE_BUFFER, texture_buffer.Handle());
std::tie(buffer, offset, invalidate) = texture_buffer.Map(max_size, sizeof(Common::Vec4f));
// helper function for SyncProcTexNoiseLUT/ColorMap/AlphaMap
@ -1349,7 +1348,7 @@ void RasterizerOpenGL::SyncAndUploadLUTs() {
void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
// glBindBufferRange below also changes the generic buffer binding point, so we sync the state
// first
state.draw.uniform_buffer = uniform_buffer.GetHandle();
state.draw.uniform_buffer = uniform_buffer.Handle();
state.Apply();
bool sync_vs = accelerate_draw;
@ -1371,7 +1370,7 @@ void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
vs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.vs, Pica::g_state.vs);
std::memcpy(uniforms + used_bytes, &vs_uniforms, sizeof(vs_uniforms));
glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(Pica::Shader::UniformBindings::VS),
uniform_buffer.GetHandle(), offset + used_bytes, sizeof(vs_uniforms));
uniform_buffer.Handle(), offset + used_bytes, sizeof(vs_uniforms));
used_bytes += uniform_size_aligned_vs;
}
@ -1380,7 +1379,7 @@ void RasterizerOpenGL::UploadUniforms(bool accelerate_draw) {
sizeof(Pica::Shader::UniformData));
glBindBufferRange(
GL_UNIFORM_BUFFER, static_cast<GLuint>(Pica::Shader::UniformBindings::Common),
uniform_buffer.GetHandle(), offset + used_bytes, sizeof(Pica::Shader::UniformData));
uniform_buffer.Handle(), offset + used_bytes, sizeof(Pica::Shader::UniformData));
uniform_block_data.dirty = false;
used_bytes += uniform_size_aligned_fs;
}

View File

@ -151,11 +151,11 @@ private:
OGLTexture default_texture;
std::array<SamplerInfo, 3> texture_samplers;
OGLStreamBuffer vertex_buffer;
OGLStreamBuffer uniform_buffer;
OGLStreamBuffer index_buffer;
OGLStreamBuffer texture_buffer;
OGLStreamBuffer texture_lf_buffer;
StreamBuffer vertex_buffer;
StreamBuffer uniform_buffer;
StreamBuffer index_buffer;
StreamBuffer texture_buffer;
StreamBuffer texture_lf_buffer;
OGLFramebuffer framebuffer;
GLint uniform_buffer_alignment;
std::size_t uniform_size_aligned_vs;

View File

@ -2,15 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <glad/glad.h>
#include "common/common_funcs.h"
#include "common/logging/log.h"
#include "common/common_types.h"
#include "video_core/renderer_opengl/gl_state.h"
#include "video_core/renderer_opengl/gl_vars.h"
namespace OpenGL {
OpenGLState OpenGLState::cur_state;
OpenGLState OpenGLState::cur_state{};
OpenGLState::OpenGLState() {
// These all match default OpenGL values

View File

@ -4,96 +4,107 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
MP_RGB(128, 128, 192));
namespace OpenGL {
OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback_,
bool prefer_coherent)
: gl_target(target), readback(readback_), buffer_size(size) {
StreamBuffer::StreamBuffer(GLenum target, size_t size_)
: gl_target{target}, buffer_size{size_}, slot_size{buffer_size / SYNC_POINTS},
buffer_storage{bool(GLAD_GL_ARB_buffer_storage)} {
for (int i = 0; i < SYNC_POINTS; i++) {
fences[i].Create();
}
gl_buffer.Create();
glBindBuffer(gl_target, gl_buffer.handle);
if (GLAD_GL_ARB_buffer_storage) {
persistent = true;
coherent = prefer_coherent;
GLbitfield flags = (readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT) | GL_MAP_PERSISTENT_BIT |
(coherent ? GL_MAP_COHERENT_BIT : 0);
glBufferStorage(gl_target, size, nullptr, flags);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, 0, buffer_size,
flags | (!coherent && !readback ? GL_MAP_FLUSH_EXPLICIT_BIT : 0)));
if (buffer_storage) {
glBufferStorage(gl_target, buffer_size, nullptr,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
mapped_ptr =
(u8*)glMapBufferRange(gl_target, 0, buffer_size,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
} else {
glBufferData(gl_target, size, nullptr, GL_STREAM_DRAW);
glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW);
}
}
OGLStreamBuffer::~OGLStreamBuffer() {
if (persistent) {
StreamBuffer::~StreamBuffer() {
if (buffer_storage) {
glBindBuffer(gl_target, gl_buffer.handle);
glUnmapBuffer(gl_target);
}
gl_buffer.Release();
}
GLuint OGLStreamBuffer::GetHandle() const {
return gl_buffer.handle;
}
GLsizeiptr OGLStreamBuffer::GetSize() const {
return buffer_size;
}
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
ASSERT(size <= buffer_size);
ASSERT(alignment <= buffer_size);
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
mapped_size = size;
if (alignment > 0) {
buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
iterator = Common::AlignUp(iterator, alignment);
}
// Insert waiting slots for used memory
for (u32 i = Slot(used_iterator); i < Slot(iterator); i++) {
fences[i].Create();
}
used_iterator = iterator;
// Wait for new slots to end of buffer
for (u32 i = Slot(free_iterator) + 1; i <= Slot(iterator + size) && i < SYNC_POINTS; i++) {
glClientWaitSync(fences[i].handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
fences[i].Release();
}
// If we allocate a large amount of memory (A), commit a smaller amount, then allocate memory
// smaller than allocation A, we will have already waited for these fences in A, but not used
// the space. In this case, don't set m_free_iterator to a position before that which we know
// is safe to use, which would result in waiting on the same fence(s) next time.
if ((iterator + size) > free_iterator) {
free_iterator = iterator + size;
}
// If buffer is full
bool invalidate = false;
if (buffer_pos + size > buffer_size) {
buffer_pos = 0;
if (iterator + size >= buffer_size) {
invalidate = true;
if (persistent) {
glUnmapBuffer(gl_target);
}
// Insert waiting slots in unused space at the end of the buffer
for (int i = Slot(used_iterator); i < SYNC_POINTS; i++) {
fences[i].Create();
}
if (invalidate || !persistent) {
MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
GLbitfield flags = (readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT) |
(persistent ? GL_MAP_PERSISTENT_BIT : 0) |
(coherent ? GL_MAP_COHERENT_BIT : 0) |
(!coherent && !readback ? GL_MAP_FLUSH_EXPLICIT_BIT : 0) |
(invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
mapped_offset = buffer_pos;
// Move to the start
used_iterator = iterator = 0; // offset 0 is always aligned
// Wait for space at the start
for (int i = 0; i <= Slot(iterator + size); i++) {
glClientWaitSync(fences[i].handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
fences[i].Release();
}
free_iterator = iterator + size;
}
return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
u8* pointer{};
if (buffer_storage) {
pointer = mapped_ptr + iterator;
} else {
pointer = (u8*)glMapBufferRange(gl_target, iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
GL_MAP_UNSYNCHRONIZED_BIT);
}
return std::make_tuple(pointer, iterator, invalidate);
}
void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size);
void StreamBuffer::Unmap(u64 used_size) {
ASSERT_MSG(used_size <= mapped_size, "Reserved size {} is too small compared to {}",
mapped_size, used_size);
if (!coherent && !readback && size > 0) {
glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
}
if (!persistent) {
if (!buffer_storage) {
glFlushMappedBufferRange(gl_target, 0, used_size);
glUnmapBuffer(gl_target);
}
buffer_pos += size;
iterator += used_size;
}
} // namespace OpenGL

View File

@ -8,40 +8,50 @@
namespace OpenGL {
class OGLStreamBuffer : private NonCopyable {
class StreamBuffer {
static constexpr std::size_t SYNC_POINTS = 16;
public:
explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback = false,
bool prefer_coherent = false);
~OGLStreamBuffer();
StreamBuffer(GLenum target, size_t size);
~StreamBuffer();
GLuint GetHandle() const;
GLsizeiptr GetSize() const;
[[nodiscard]] GLuint Handle() const noexcept {
return gl_buffer.handle;
}
/*
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
* and the optional alignment requirement.
* If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
* The return values are the pointer to the new chunk, the offset within the buffer,
* and the invalidation flag for previous chunks.
* The actual used size must be specified on unmapping the chunk.
[[nodiscard]] size_t Size() const noexcept {
return buffer_size;
}
/* This mapping function will return a pair of:
* - the pointer to the mapped buffer
* - the offset into the real GPU buffer (always multiple of stride)
* On mapping, the maximum of size for allocation has to be set.
* The size really pushed into this fifo only has to be known on Unmapping.
* Mapping invalidates the current buffer content,
* so it isn't allowed to access the old content any more.
*/
std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0);
void Unmap(GLsizeiptr size);
std::tuple<u8*, u64, bool> Map(u64 size, u64 alignment = 0);
void Unmap(u64 used_size);
private:
OGLBuffer gl_buffer;
[[nodiscard]] u64 Slot(u64 offset) noexcept {
return offset / slot_size;
}
GLenum gl_target;
size_t buffer_size;
size_t slot_size;
bool buffer_storage{};
u8* mapped_ptr{};
u64 mapped_size;
bool readback = false;
bool coherent = false;
bool persistent = false;
u64 iterator = 0;
u64 used_iterator = 0;
u64 free_iterator = 0;
GLintptr buffer_pos = 0;
GLsizeiptr buffer_size = 0;
GLintptr mapped_offset = 0;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
OGLBuffer gl_buffer;
std::array<OGLSync, SYNC_POINTS> fences{};
};
} // namespace OpenGL

View File

@ -53,16 +53,15 @@ static constexpr std::array COLOR_TUPLES_OES = {
return GL_COLOR_BUFFER_BIT;
}
constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr std::size_t UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr std::size_t DOWNLOAD_BUFFER_SIZE = 4 * 1024 * 1024;
TextureRuntime::TextureRuntime(Driver& driver)
: driver{driver}, filterer{Settings::values.texture_filter_name.GetValue(),
VideoCore::GetResolutionScaleFactor()},
upload_buffer{GL_PIXEL_UNPACK_BUFFER, UPLOAD_BUFFER_SIZE}, download_buffer{
GL_PIXEL_PACK_BUFFER,
DOWNLOAD_BUFFER_SIZE, true} {
upload_buffer{GL_PIXEL_UNPACK_BUFFER, UPLOAD_BUFFER_SIZE} {
download_buffer.resize(DOWNLOAD_BUFFER_SIZE);
read_fbo.Create();
draw_fbo.Create();
@ -77,13 +76,22 @@ TextureRuntime::TextureRuntime(Driver& driver)
}
StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
auto& buffer = upload ? upload_buffer : download_buffer;
auto [data, offset, invalidate] = buffer.Map(size, 4);
return StagingData{.buffer = buffer.GetHandle(),
if (!upload) {
ASSERT_MSG(download_buffer.size() <= size, "Download buffer to small");
return StagingData{
.size = size,
.mapped = std::span<u8>{data, size},
.buffer_offset = offset};
.mapped = std::span{download_buffer.data(), size},
.buffer_offset = 0,
};
}
auto [data, offset, invalidate] = upload_buffer.Map(size, 4);
return StagingData{
.buffer = upload_buffer.Handle(),
.size = size,
.mapped = std::span{data, size},
.buffer_offset = offset,
};
}
const FormatTuple& TextureRuntime::GetFormatTuple(VideoCore::PixelFormat pixel_format) {
@ -333,6 +341,9 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, staging.buffer);
// Unmap the buffer FindStaging mapped beforehand
runtime.upload_buffer.Unmap(staging.size);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture.handle);
@ -343,7 +354,6 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa
reinterpret_cast<void*>(staging.buffer_offset));
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
runtime.upload_buffer.Unmap(staging.size);
}
InvalidateAllWatcher();
@ -360,7 +370,6 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
SCOPE_EXIT({ prev_state.Apply(); });
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_PACK_BUFFER, staging.buffer);
const bool is_scaled = res_scale != 1;
if (is_scaled) {
@ -372,9 +381,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
const auto& tuple = runtime.GetFormatTuple(pixel_format);
glReadPixels(download.texture_rect.left, download.texture_rect.bottom,
download.texture_rect.GetWidth(), download.texture_rect.GetHeight(),
tuple.format, tuple.type, reinterpret_cast<void*>(staging.buffer_offset));
runtime.download_buffer.Unmap(staging.size);
tuple.format, tuple.type, staging.mapped.data());
}
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
@ -393,20 +400,24 @@ void Surface::ScaledUpload(const VideoCore::BufferTextureCopy& upload, const Sta
unscaled_params.res_scale = 1;
Surface unscaled_surface{unscaled_params, runtime};
const VideoCore::BufferTextureCopy unscaled_upload = {.buffer_offset = upload.buffer_offset,
const VideoCore::BufferTextureCopy unscaled_upload = {
.buffer_offset = upload.buffer_offset,
.buffer_size = upload.buffer_size,
.texture_rect = unscaled_rect};
.texture_rect = unscaled_rect,
};
unscaled_surface.Upload(unscaled_upload, staging);
const auto& filterer = runtime.GetFilterer();
if (!filterer.Filter(unscaled_surface.texture, unscaled_rect, texture, scaled_rect, type)) {
const VideoCore::TextureBlit blit = {.src_level = 0,
const VideoCore::TextureBlit blit = {
.src_level = 0,
.dst_level = upload.texture_level,
.src_layer = 0,
.dst_layer = 0,
.src_rect = unscaled_rect,
.dst_rect = scaled_rect};
.dst_rect = scaled_rect,
};
// If filtering fails, resort to normal blitting
runtime.BlitTextures(unscaled_surface, *this, blit);
@ -428,14 +439,15 @@ void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download,
unscaled_params.res_scale = 1;
Surface unscaled_surface{unscaled_params, runtime};
const VideoCore::TextureBlit blit = {.src_level = download.texture_level,
// Blit the scaled rectangle to the unscaled texture
const VideoCore::TextureBlit blit = {
.src_level = download.texture_level,
.dst_level = 0,
.src_layer = 0,
.dst_layer = 0,
.src_rect = scaled_rect,
.dst_rect = unscaled_rect};
// Blit the scaled rectangle to the unscaled texture
.dst_rect = unscaled_rect,
};
runtime.BlitTextures(*this, unscaled_surface, blit);
glActiveTexture(GL_TEXTURE0);
@ -446,13 +458,10 @@ void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download,
runtime.BindFramebuffer(GL_READ_FRAMEBUFFER, 0, GL_TEXTURE_2D, type,
unscaled_surface.texture);
glReadPixels(0, 0, rect_width, rect_height, tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
staging.mapped.data());
} else {
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, staging.mapped.data());
}
runtime.download_buffer.Unmap(staging.size);
}
} // namespace OpenGL

View File

@ -23,7 +23,7 @@ struct StagingData {
GLuint buffer;
u32 size = 0;
std::span<u8> mapped{};
GLintptr buffer_offset = 0;
u64 buffer_offset = 0;
};
class Driver;
@ -46,6 +46,7 @@ public:
/// Returns the OpenGL format tuple associated with the provided pixel format
const FormatTuple& GetFormatTuple(VideoCore::PixelFormat pixel_format);
/// Causes a GPU command flush
void Finish() const {}
/// Allocates an OpenGL texture with the specified dimensions and format
@ -92,7 +93,8 @@ private:
TextureFilterer filterer;
std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
std::unordered_multimap<VideoCore::HostTextureTag, OGLTexture> texture_recycler;
OGLStreamBuffer upload_buffer, download_buffer;
StreamBuffer upload_buffer;
std::vector<u8> download_buffer;
OGLFramebuffer read_fbo, draw_fbo;
};