gl_texture_runtime: Use OGLStreamBuffer for uploads/downloads

* Much better than the previous implementation, which kept a multiset of individually allocated, persistently mapped staging buffers
This commit is contained in:
GPUCode
2022-10-30 18:41:47 +02:00
parent 5fe910b18f
commit 8946c1a7de
6 changed files with 72 additions and 140 deletions

View File

@ -908,7 +908,7 @@ void RasterizerCache<T>::UploadSurface(const Surface& surface, SurfaceInterval i
MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad);
const auto& staging = runtime.FindStaging(
const auto staging = runtime.FindStaging(
load_info.width * load_info.height * surface->GetInternalBytesPerPixel(), true);
MemoryRef source_ptr = VideoCore::g_memory->GetPhysicalRef(load_info.addr);
if (!source_ptr) [[unlikely]] {
@ -939,7 +939,7 @@ void RasterizerCache<T>::DownloadSurface(const Surface& surface, SurfaceInterval
const u32 flush_end = boost::icl::last_next(interval);
ASSERT(flush_start >= surface->addr && flush_end <= surface->end);
const auto& staging = runtime.FindStaging(
const auto staging = runtime.FindStaging(
flush_info.width * flush_info.height * surface->GetInternalBytesPerPixel(), false);
const BufferTextureCopy download = {.buffer_offset = 0,
.buffer_size = staging.size,

View File

@ -33,11 +33,11 @@ static bool IsVendorIntel() {
RasterizerOpenGL::RasterizerOpenGL(Frontend::EmuWindow& emu_window, Driver& driver)
: driver{driver}, runtime{driver}, res_cache{*this, runtime}, is_amd(IsVendorAmd()),
vertex_buffer(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE, is_amd),
uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE, false),
index_buffer(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE, false),
texture_buffer(GL_TEXTURE_BUFFER, TEXTURE_BUFFER_SIZE, false),
texture_lf_buffer(GL_TEXTURE_BUFFER, TEXTURE_BUFFER_SIZE, false) {
vertex_buffer(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE),
uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE),
index_buffer(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE),
texture_buffer(GL_TEXTURE_BUFFER, TEXTURE_BUFFER_SIZE),
texture_lf_buffer(GL_TEXTURE_BUFFER, TEXTURE_BUFFER_SIZE) {
// Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
state.clip_distance[0] = true;

View File

@ -12,32 +12,21 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
namespace OpenGL {
/**
 * Creates the buffer object, binds it to `target` and allocates `size` bytes of storage.
 *
 * @param target          Buffer binding point the buffer is created for and bound to.
 * @param size            Size of the allocation in bytes.
 * @param readback        When true the buffer is mapped with GL_MAP_READ_BIT instead of
 *                        GL_MAP_WRITE_BIT.
 * @param prefer_coherent Requests a coherent mapping; only taken into account when
 *                        ARB_buffer_storage is available.
 */
OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback,
                                 bool prefer_coherent)
    // Store the readback mode in the member: Map() and Unmap() consult the member, which would
    // otherwise keep its default of false and treat a readback buffer as a write buffer.
    : gl_target(target), readback(readback), buffer_size(size) {
    gl_buffer.Create();
    glBindBuffer(gl_target, gl_buffer.handle);

    if (GLAD_GL_ARB_buffer_storage) {
        persistent = true;
        coherent = prefer_coherent;

        const GLbitfield access = readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT;
        const GLbitfield flags =
            access | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
        glBufferStorage(gl_target, size, nullptr, flags);

        // Explicit flushing is only requested for write mappings that are not coherent
        const GLbitfield flush = (!coherent && !readback) ? GL_MAP_FLUSH_EXPLICIT_BIT : 0;
        mapped_ptr =
            static_cast<u8*>(glMapBufferRange(gl_target, 0, buffer_size, flags | flush));
    } else {
        // Without buffer storage the buffer is mapped on demand; pick the usage hint that
        // matches the direction of the transfer.
        glBufferData(gl_target, size, nullptr, readback ? GL_STREAM_READ : GL_STREAM_DRAW);
    }
}
@ -78,8 +67,8 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
if (invalidate || !persistent) {
MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
(coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
GLbitfield flags = (readback ? GL_MAP_READ_BIT : GL_MAP_WRITE_BIT) | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
(coherent ? GL_MAP_COHERENT_BIT : 0) | (!coherent && !readback ? GL_MAP_FLUSH_EXPLICIT_BIT : 0) |
(invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
@ -92,7 +81,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size);
if (!coherent && size > 0) {
if (!coherent && !readback && size > 0) {
glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
}

View File

@ -10,8 +10,7 @@ namespace OpenGL {
class OGLStreamBuffer : private NonCopyable {
public:
explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool array_buffer_for_amd,
bool prefer_coherent = false);
explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool readback = false, bool prefer_coherent = false);
~OGLStreamBuffer();
GLuint GetHandle() const;
@ -33,6 +32,7 @@ private:
OGLBuffer gl_buffer;
GLenum gl_target;
bool readback = false;
bool coherent = false;
bool persistent = false;

View File

@ -36,7 +36,7 @@ static constexpr std::array COLOR_TUPLES_OES = {
FormatTuple{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
};
GLbitfield MakeBufferMask(VideoCore::SurfaceType type) {
[[nodiscard]] GLbitfield MakeBufferMask(VideoCore::SurfaceType type) {
switch (type) {
case VideoCore::SurfaceType::Color:
case VideoCore::SurfaceType::Texture:
@ -53,9 +53,13 @@ GLbitfield MakeBufferMask(VideoCore::SurfaceType type) {
return GL_COLOR_BUFFER_BIT;
}
// Sizes in bytes (32 MiB each) passed to the OGLStreamBuffer constructors of the
// texture runtime's upload and download buffers.
constexpr u32 UPLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
constexpr u32 DOWNLOAD_BUFFER_SIZE = 32 * 1024 * 1024;
TextureRuntime::TextureRuntime(Driver& driver)
: driver{driver}, downloader_es{false}, filterer{Settings::values.texture_filter_name,
VideoCore::GetResolutionScaleFactor()} {
: driver{driver}, filterer{Settings::values.texture_filter_name, VideoCore::GetResolutionScaleFactor()},
downloader_es{false}, upload_buffer{GL_PIXEL_UNPACK_BUFFER, UPLOAD_BUFFER_SIZE},
download_buffer{GL_PIXEL_PACK_BUFFER, DOWNLOAD_BUFFER_SIZE, true} {
read_fbo.Create();
draw_fbo.Create();
@ -70,51 +74,14 @@ TextureRuntime::TextureRuntime(Driver& driver)
Register(VideoCore::PixelFormat::RGB5A1, std::make_unique<RGBA4toRGB5A1>());
}
const StagingBuffer& TextureRuntime::FindStaging(u32 size, bool upload) {
const GLenum target = upload ? GL_PIXEL_UNPACK_BUFFER : GL_PIXEL_PACK_BUFFER;
const GLbitfield access = upload ? GL_MAP_WRITE_BIT : GL_MAP_READ_BIT;
auto& search = upload ? upload_buffers : download_buffers;
StagingData TextureRuntime::FindStaging(u32 size, bool upload) {
auto& buffer = upload ? upload_buffer : download_buffer;
auto [data, offset, invalidate] = buffer.Map(size, 4);
// Attempt to find a free buffer that fits the requested data
for (auto it = search.lower_bound({.size = size}); it != search.end(); it++) {
if (!upload || it->IsFree()) {
it->mapped = std::span{it->mapped.data(), size};
return *it;
}
}
OGLBuffer buffer{};
buffer.Create();
glBindBuffer(target, buffer.handle);
// Allocate a new buffer and map the data to the host
std::byte* data = nullptr;
if (driver.IsOpenGLES() && driver.HasExtBufferStorage()) {
const GLbitfield storage =
upload ? GL_MAP_WRITE_BIT : GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT_EXT;
glBufferStorageEXT(target, size, nullptr,
storage | GL_MAP_PERSISTENT_BIT_EXT | GL_MAP_COHERENT_BIT_EXT);
data = reinterpret_cast<std::byte*>(glMapBufferRange(
target, 0, size, access | GL_MAP_PERSISTENT_BIT_EXT | GL_MAP_COHERENT_BIT_EXT));
} else if (driver.HasArbBufferStorage()) {
const GLbitfield storage =
upload ? GL_MAP_WRITE_BIT : GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT;
glBufferStorage(target, size, nullptr,
storage | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
data = reinterpret_cast<std::byte*>(glMapBufferRange(
target, 0, size, access | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT));
} else {
UNIMPLEMENTED();
}
glBindBuffer(target, 0);
StagingBuffer staging = {
.buffer = std::move(buffer), .mapped = std::span{data, size}, .size = size};
const auto& it = search.emplace(std::move(staging));
return *it;
return StagingData{.buffer = buffer.GetHandle(),
.size = size,
.mapped = std::span<std::byte>{reinterpret_cast<std::byte*>(data), size},
.buffer_offset = offset};
}
const FormatTuple& TextureRuntime::GetFormatTuple(VideoCore::PixelFormat pixel_format) {
@ -371,41 +338,40 @@ Surface::~Surface() {
}
MICROPROFILE_DEFINE(OpenGL_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 64));
void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingBuffer& staging) {
void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingData& staging) {
MICROPROFILE_SCOPE(OpenGL_Upload);
// Ensure no bad interactions with GL_UNPACK_ALIGNMENT
ASSERT(stride * GetBytesPerPixel(pixel_format) % 4 == 0);
OpenGLState prev_state = OpenGLState::GetCurState();
SCOPE_EXIT({ prev_state.Apply(); });
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, staging.buffer.handle);
const bool is_scaled = res_scale != 1;
if (is_scaled) {
ScaledUpload(upload, staging);
} else {
OpenGLState prev_state = OpenGLState::GetCurState();
SCOPE_EXIT({ prev_state.Apply(); });
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, staging.buffer);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture.handle);
const auto& tuple = runtime.GetFormatTuple(pixel_format);
glTexSubImage2D(GL_TEXTURE_2D, upload.texture_level, upload.texture_rect.left,
upload.texture_rect.bottom, upload.texture_rect.GetWidth(),
upload.texture_rect.GetHeight(), tuple.format, tuple.type, 0);
upload.texture_rect.GetHeight(), tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
runtime.upload_buffer.Unmap(staging.size);
}
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
// Lock the staging buffer until glTexSubImage completes
staging.Lock();
InvalidateAllWatcher();
}
MICROPROFILE_DEFINE(OpenGL_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 64));
void Surface::Download(const VideoCore::BufferTextureCopy& download, const StagingBuffer& staging) {
void Surface::Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging) {
MICROPROFILE_SCOPE(OpenGL_Download);
// Ensure no bad interactions with GL_PACK_ALIGNMENT
@ -415,11 +381,11 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
SCOPE_EXIT({ prev_state.Apply(); });
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(stride));
glBindBuffer(GL_PIXEL_PACK_BUFFER, staging.buffer.handle);
glBindBuffer(GL_PIXEL_PACK_BUFFER, staging.buffer);
const bool is_scaled = res_scale != 1;
if (is_scaled) {
ScaledDownload(download);
ScaledDownload(download, staging);
} else {
runtime.BindFramebuffer(GL_READ_FRAMEBUFFER, download.texture_level, GL_TEXTURE_2D, type,
texture);
@ -427,15 +393,17 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi
const auto& tuple = runtime.GetFormatTuple(pixel_format);
glReadPixels(download.texture_rect.left, download.texture_rect.bottom,
download.texture_rect.GetWidth(), download.texture_rect.GetHeight(),
tuple.format, tuple.type, 0);
tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
runtime.download_buffer.Unmap(staging.size);
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
}
void Surface::ScaledUpload(const VideoCore::BufferTextureCopy& upload,
const StagingBuffer& staging) {
const StagingData& staging) {
const u32 rect_width = upload.texture_rect.GetWidth();
const u32 rect_height = upload.texture_rect.GetHeight();
const auto scaled_rect = upload.texture_rect * res_scale;
@ -468,7 +436,7 @@ void Surface::ScaledUpload(const VideoCore::BufferTextureCopy& upload,
}
}
void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download) {
void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download, const StagingData& staging) {
const u32 rect_width = download.texture_rect.GetWidth();
const u32 rect_height = download.texture_rect.GetHeight();
const VideoCore::Rect2D scaled_rect = download.texture_rect * res_scale;
@ -498,11 +466,14 @@ void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download) {
const auto& tuple = runtime.GetFormatTuple(pixel_format);
if (driver.IsOpenGLES()) {
const auto& downloader_es = runtime.GetDownloaderES();
downloader_es.GetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, rect_height,
rect_width, 0);
downloader_es.GetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, rect_height, rect_width,
reinterpret_cast<void*>(staging.buffer_offset));
} else {
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, 0);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type,
reinterpret_cast<void*>(staging.buffer_offset));
}
runtime.download_buffer.Unmap(staging.size);
}
} // namespace OpenGL

View File

@ -8,6 +8,7 @@
#include "video_core/rasterizer_cache/rasterizer_cache.h"
#include "video_core/rasterizer_cache/surface_base.h"
#include "video_core/renderer_opengl/gl_format_reinterpreter.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
#include "video_core/renderer_opengl/texture_downloader_es.h"
#include "video_core/renderer_opengl/texture_filters/texture_filterer.h"
@ -19,35 +20,11 @@ struct FormatTuple {
GLenum type;
};
struct StagingBuffer {
OGLBuffer buffer{};
mutable OGLSync buffer_lock{};
mutable std::span<std::byte> mapped{};
u32 size{};
bool operator<(const StagingBuffer& other) const {
return size < other.size;
}
/// Returns true if the buffer does not take part in pending transfer operations
bool IsFree() const {
if (buffer_lock) {
GLint status;
glGetSynciv(buffer_lock.handle, GL_SYNC_STATUS, 1, nullptr, &status);
return status == GL_SIGNALED;
}
return true;
}
/// Prevents the runtime from reusing the buffer until the transfer operation is complete
void Lock() const {
if (buffer_lock) {
buffer_lock.Release();
}
buffer_lock.Create();
}
/// Describes a region of a stream buffer mapped for a single pixel upload or download
struct StagingData {
    /// Handle of the buffer object that holds the region. Defaults to 0 like the other
    /// members so that a default-initialized instance never carries an indeterminate handle.
    GLuint buffer = 0;
    /// Size of the region in bytes
    u32 size = 0;
    /// Host-visible view of the mapped region
    std::span<std::byte> mapped{};
    /// Offset of the region inside the buffer; passed to GL as the pixel data offset
    GLintptr buffer_offset = 0;
};
class Driver;
@ -65,7 +42,7 @@ public:
~TextureRuntime() = default;
/// Maps an internal staging buffer of the provided size of pixel uploads/downloads
const StagingBuffer& FindStaging(u32 size, bool upload);
StagingData FindStaging(u32 size, bool upload);
/// Returns the OpenGL format tuple associated with the provided pixel format
const FormatTuple& GetFormatTuple(VideoCore::PixelFormat pixel_format);
@ -122,17 +99,12 @@ private:
private:
Driver& driver;
TextureDownloaderES downloader_es;
TextureFilterer filterer;
TextureDownloaderES downloader_es;
std::array<ReinterpreterList, VideoCore::PIXEL_FORMAT_COUNT> reinterpreters;
// Staging buffers stored in increasing size
std::multiset<StagingBuffer> upload_buffers;
std::multiset<StagingBuffer> download_buffers;
OGLFramebuffer read_fbo, draw_fbo;
// Recycled textures to reduce driver allocation overhead
std::unordered_multimap<VideoCore::HostTextureTag, OGLTexture> texture_recycler;
OGLStreamBuffer upload_buffer, download_buffer;
OGLFramebuffer read_fbo, draw_fbo;
};
class Surface : public VideoCore::SurfaceBase<Surface> {
@ -141,10 +113,10 @@ public:
~Surface() override;
/// Uploads pixel data in staging to a rectangle region of the surface texture
void Upload(const VideoCore::BufferTextureCopy& upload, const StagingBuffer& staging);
void Upload(const VideoCore::BufferTextureCopy& upload, const StagingData& staging);
/// Downloads pixel data to staging from a rectangle region of the surface texture
void Download(const VideoCore::BufferTextureCopy& download, const StagingBuffer& staging);
void Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging);
/// Returns the bpp of the internal surface format
u32 GetInternalBytesPerPixel() const {
@ -153,10 +125,10 @@ public:
private:
/// Uploads pixel data to scaled texture
void ScaledUpload(const VideoCore::BufferTextureCopy& upload, const StagingBuffer& staging);
void ScaledUpload(const VideoCore::BufferTextureCopy& upload, const StagingData& staging);
/// Downloads scaled image by downscaling the requested rectangle
void ScaledDownload(const VideoCore::BufferTextureCopy& download);
void ScaledDownload(const VideoCore::BufferTextureCopy& download, const StagingData& staging);
private:
TextureRuntime& runtime;