diff --git a/src/video_core/rasterizer_cache/cached_surface.cpp b/src/video_core/rasterizer_cache/cached_surface.cpp index 98d1a336d..a8ee5349a 100644 --- a/src/video_core/rasterizer_cache/cached_surface.cpp +++ b/src/video_core/rasterizer_cache/cached_surface.cpp @@ -35,19 +35,11 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) { const bool need_swap = GLES && (pixel_format == PixelFormat::RGBA8 || pixel_format == PixelFormat::RGB8); - const u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr); + u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr); if (texture_ptr == nullptr) { return; } - const u32 byte_size = width * height * GetBytesPerPixel(pixel_format); - const auto texture_data = std::span{reinterpret_cast(texture_ptr), - byte_size}; - - if (gl_buffer.empty()) { - gl_buffer.resize(byte_size); - } - // TODO: Should probably be done in ::Memory:: and check for other regions too if (load_start < Memory::VRAM_VADDR_END && load_end > Memory::VRAM_VADDR_END) load_end = Memory::VRAM_VADDR_END; @@ -55,10 +47,16 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) { if (load_start < Memory::VRAM_VADDR && load_end > Memory::VRAM_VADDR) load_start = Memory::VRAM_VADDR; - MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); - ASSERT(load_start >= addr && load_end <= end); + const u32 start_offset = load_start - addr; + const u32 byte_size = width * height * GetBytesPerPixel(pixel_format); + + if (gl_buffer.empty()) { + gl_buffer.resize(byte_size); + } + + MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); if (!is_tiled) { ASSERT(type == SurfaceType::Color); @@ -67,33 +65,31 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) { // cannot fully test this if (pixel_format == PixelFormat::RGBA8) { for (std::size_t i = start_offset; i < load_end - addr; i += 4) { - gl_buffer[i] = texture_ptr[i + 3]; - gl_buffer[i + 1] = texture_ptr[i + 2]; - gl_buffer[i + 2] = texture_ptr[i + 1]; - gl_buffer[i + 3] = texture_ptr[i]; + gl_buffer[i] = (std::byte)texture_ptr[i + 3]; + gl_buffer[i + 1] = (std::byte)texture_ptr[i + 2]; + gl_buffer[i + 2] = (std::byte)texture_ptr[i + 1]; + gl_buffer[i + 3] = (std::byte)texture_ptr[i]; } } else if (pixel_format == PixelFormat::RGB8) { for (std::size_t i = start_offset; i < load_end - addr; i += 3) { - gl_buffer[i] = texture_ptr[i + 2]; - gl_buffer[i + 1] = texture_ptr[i + 1]; - gl_buffer[i + 2] = texture_ptr[i]; + gl_buffer[i] = (std::byte)texture_ptr[i + 2]; + gl_buffer[i + 1] = (std::byte)texture_ptr[i + 1]; + gl_buffer[i + 2] = (std::byte)texture_ptr[i]; } } } else { - std::memcpy(&gl_buffer[start_offset], texture_ptr + start_offset, - load_end - load_start); + std::memcpy(gl_buffer.data() + start_offset, texture_ptr + start_offset, load_end - load_start); } } else { - const auto dest_data = std::span{reinterpret_cast(gl_buffer.data()), - byte_size}; - UnswizzleTexture(*this, load_start, load_end, texture_data, dest_data); + std::span texture_data{(std::byte*)texture_ptr, byte_size}; + UnswizzleTexture(*this, load_start, load_end, texture_data, gl_buffer); } } MICROPROFILE_DEFINE(RasterizerCache_SurfaceFlush, "RasterizerCache", "Surface Flush", MP_RGB(128, 192, 64)); void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) { - u8* const dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); + u8* dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); if (dst_buffer == nullptr) { return; } @@ -134,25 +130,24 @@ void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) { ASSERT(type == SurfaceType::Color); if (pixel_format == PixelFormat::RGBA8 && GLES) { for (std::size_t i = start_offset; i < flush_end - addr; i += 4) { - dst_buffer[i] = gl_buffer[i + 3]; - dst_buffer[i + 1] = gl_buffer[i + 2]; - dst_buffer[i + 2] = gl_buffer[i + 1]; - dst_buffer[i + 3] = gl_buffer[i]; + dst_buffer[i] = (u8)gl_buffer[i + 3]; + dst_buffer[i + 1] = (u8)gl_buffer[i + 2]; + dst_buffer[i + 2] = (u8)gl_buffer[i + 1]; + dst_buffer[i + 3] = (u8)gl_buffer[i]; } } else if (pixel_format == PixelFormat::RGB8 && GLES) { for (std::size_t i = start_offset; i < flush_end - addr; i += 3) { - dst_buffer[i] = gl_buffer[i + 2]; - dst_buffer[i + 1] = gl_buffer[i + 1]; - dst_buffer[i + 2] = gl_buffer[i]; + dst_buffer[i] = (u8)gl_buffer[i + 2]; + dst_buffer[i + 1] = (u8)gl_buffer[i + 1]; + dst_buffer[i + 2] = (u8)gl_buffer[i]; } } else { std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset], flush_end - flush_start); } } else { - const auto source_data = std::span{reinterpret_cast(gl_buffer.data()), - byte_size}; - SwizzleTexture(*this, flush_start, flush_end, source_data, {}); + std::span texture_data{(std::byte*)dst_buffer + start_offset, byte_size}; + SwizzleTexture(*this, flush_start, flush_end, gl_buffer, texture_data); } } @@ -425,7 +420,7 @@ void CachedSurface::DownloadGLTexture(const Common::Rectangle& rect) { .region = rect }; - runtime.ReadTexture(texture, subresource, tuple, gl_buffer.data()); + runtime.ReadTexture(texture, subresource, tuple, (u8*)gl_buffer.data()); } glPixelStorei(GL_PACK_ROW_LENGTH, 0); diff --git a/src/video_core/rasterizer_cache/cached_surface.h b/src/video_core/rasterizer_cache/cached_surface.h index dd689e88e..9a52f97c5 100644 --- a/src/video_core/rasterizer_cache/cached_surface.h +++ b/src/video_core/rasterizer_cache/cached_surface.h @@ -104,7 +104,7 @@ public: public: bool registered = false; SurfaceRegions invalid_regions; - std::vector gl_buffer; + std::vector gl_buffer; // Number of bytes to read from fill_data u32 fill_size = 0; diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h index 7e9200743..372d1f823 100644 --- a/src/video_core/rasterizer_cache/morton_swizzle.h +++ b/src/video_core/rasterizer_cache/morton_swizzle.h @@ -3,6 +3,8 @@ // Refer to the license.txt file included. #pragma once +#include +#include #include "common/alignment.h" #include "core/memory.h" #include "video_core/rasterizer_cache/pixel_format.h" @@ -12,50 +14,54 @@ namespace OpenGL { +inline u32 MakeInt(std::span bytes) { + u32 integer{}; + std::memcpy(&integer, bytes.data(), sizeof(u32)); + + return integer; +} + template -static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) { +inline void MortonCopyTile(u32 stride, std::span tile_buffer, std::span linear_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; - constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format); + constexpr u32 linear_bytes_per_pixel = GetBytesPerPixel(format); for (u32 y = 0; y < 8; y++) { for (u32 x = 0; x < 8; x++) { - u8* tile_ptr = tile_buffer + VideoCore::MortonInterleave(x, y) * bytes_per_pixel; - u8* linear_ptr = linear_buffer + ((7 - y) * stride + x) * aligned_bytes_per_pixel; + const u32 tile_offset = VideoCore::MortonInterleave(x, y) * bytes_per_pixel; + const u32 linear_offset = ((7 - y) * stride + x) * linear_bytes_per_pixel; + auto tile_pixel = tile_buffer.subspan(tile_offset, bytes_per_pixel); + auto linear_pixel = linear_buffer.subspan(linear_offset, linear_bytes_per_pixel); + if constexpr (morton_to_linear) { if constexpr (format == PixelFormat::D24S8) { - linear_ptr[0] = tile_ptr[3]; - std::memcpy(linear_ptr + 1, tile_ptr, 3); + const u32 s8d24 = MakeInt(tile_pixel); + const u32 d24s8 = std::rotl(s8d24, 8); + std::memcpy(linear_pixel.data(), &d24s8, sizeof(u32)); } else if (format == PixelFormat::RGBA8 && GLES) { - // because GLES does not have ABGR format - // so we will do byteswapping here - linear_ptr[0] = tile_ptr[3]; - linear_ptr[1] = tile_ptr[2]; - linear_ptr[2] = tile_ptr[1]; - linear_ptr[3] = tile_ptr[0]; + const u32 abgr = MakeInt(tile_pixel); + const u32 rgba = std::byteswap(abgr); + std::memcpy(linear_pixel.data(), &rgba, sizeof(u32)); } else if (format == PixelFormat::RGB8 && GLES) { - linear_ptr[0] = tile_ptr[2]; - linear_ptr[1] = tile_ptr[1]; - linear_ptr[2] = tile_ptr[0]; + std::memcpy(linear_pixel.data(), tile_pixel.data(), 3); + std::swap(linear_pixel[0], linear_pixel[2]); } else { - std::memcpy(linear_ptr, tile_ptr, bytes_per_pixel); + std::memcpy(linear_pixel.data(), tile_pixel.data(), bytes_per_pixel); } } else { if constexpr (format == PixelFormat::D24S8) { - std::memcpy(tile_ptr, linear_ptr + 1, 3); - tile_ptr[3] = linear_ptr[0]; + const u32 d24s8 = MakeInt(linear_pixel); + const u32 s8d24 = std::rotr(d24s8, 8); + std::memcpy(tile_pixel.data(), &s8d24, sizeof(u32)); } else if (format == PixelFormat::RGBA8 && GLES) { - // because GLES does not have ABGR format - // so we will do byteswapping here - tile_ptr[0] = linear_ptr[3]; - tile_ptr[1] = linear_ptr[2]; - tile_ptr[2] = linear_ptr[1]; - tile_ptr[3] = linear_ptr[0]; + const u32 rgba = MakeInt(linear_pixel); + const u32 abgr = std::byteswap(rgba); + std::memcpy(tile_pixel.data(), &abgr, sizeof(u32)); } else if (format == PixelFormat::RGB8 && GLES) { - tile_ptr[0] = linear_ptr[2]; - tile_ptr[1] = linear_ptr[1]; - tile_ptr[2] = linear_ptr[0]; + std::memcpy(tile_pixel.data(), linear_pixel.data(), 3); + std::swap(tile_pixel[0], tile_pixel[2]); } else { - std::memcpy(tile_ptr, linear_ptr, bytes_per_pixel); + std::memcpy(tile_pixel.data(), linear_pixel.data(), bytes_per_pixel); } } } @@ -63,13 +69,20 @@ static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) { } template -static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PAddr start, PAddr end) { - constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; - constexpr u32 tile_size = bytes_per_pixel * 64; +static void MortonCopy(u32 stride, u32 height, + std::span linear_buffer, std::span tiled_buffer, + PAddr base, PAddr start, PAddr end) { + constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format); static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, ""); - linear_buffer += aligned_bytes_per_pixel - bytes_per_pixel; + + constexpr u32 tile_size = bytes_per_pixel * 64; + const u32 linear_tile_size = (7 * stride + 8) * aligned_bytes_per_pixel; + + // This only applies for D24 format, by shifting the span one byte all pixels + // are written properly without byteswap + u32 linear_offset = aligned_bytes_per_pixel - bytes_per_pixel; const PAddr aligned_down_start = base + Common::AlignDown(start - base, tile_size); const PAddr aligned_start = base + Common::AlignUp(start - base, tile_size); @@ -84,18 +97,19 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA // In OpenGL the texture origin is in the bottom left corner as opposed to other // APIs that have it at the top left. To avoid flipping texture coordinates in // the shader we read/write the linear buffer backwards - linear_buffer += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel; + //linear_buffer += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel; + linear_offset += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel; auto linear_next_tile = [&] { x = (x + 8) % stride; - linear_buffer += 8 * aligned_bytes_per_pixel; + linear_offset += 8 * aligned_bytes_per_pixel; if (!x) { y = (y + 8) % height; if (!y) { return; } - linear_buffer -= stride * 9 * aligned_bytes_per_pixel; + linear_offset -= stride * 9 * aligned_bytes_per_pixel; } }; @@ -104,8 +118,10 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA // If during a texture download the start coordinate is inside a tile, swizzle // the tile to a temporary buffer and copy the part we are interested in if (start < aligned_start && !morton_to_linear) { - std::array tmp_buf; - MortonCopyTile(stride, tmp_buf.data(), linear_buffer); + std::array tmp_buf; + std::span linear_data = linear_buffer.last(linear_buffer.size() - linear_offset); + + MortonCopyTile(stride, tmp_buf, linear_data); std::memcpy(tile_buffer, tmp_buf.data() + start - aligned_down_start, std::min(aligned_start, end) - start); @@ -124,19 +140,23 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA const u8* buffer_end = tile_buffer + aligned_end - aligned_start; while (tile_buffer < buffer_end) { - MortonCopyTile(stride, tile_buffer, linear_buffer); + std::span linear_data = linear_buffer.last(linear_buffer.size() - linear_offset); + auto tiled_data = std::span{(std::byte*)tile_buffer, tile_size}; + + MortonCopyTile(stride, tiled_data, linear_data); tile_buffer += tile_size; linear_next_tile(); } if (end > std::max(aligned_start, aligned_end) && !morton_to_linear) { - std::array tmp_buf; - MortonCopyTile(stride, tmp_buf.data(), linear_buffer); + std::array tmp_buf; + std::span linear_data = linear_buffer.last(linear_buffer.size() - linear_offset); + MortonCopyTile(stride, tmp_buf, linear_data); std::memcpy(tile_buffer, tmp_buf.data(), end - aligned_end); } } -using MortonFunc = void (*)(u32, u32, u8*, PAddr, PAddr, PAddr); +using MortonFunc = void (*)(u32, u32, std::span, std::span, PAddr, PAddr, PAddr); static constexpr std::array UNSWIZZLE_TABLE = { MortonCopy, // 0 diff --git a/src/video_core/rasterizer_cache/utils.cpp b/src/video_core/rasterizer_cache/utils.cpp index a819100fd..7f67b65a2 100644 --- a/src/video_core/rasterizer_cache/utils.cpp +++ b/src/video_core/rasterizer_cache/utils.cpp @@ -58,17 +58,16 @@ const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { } void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end, - std::span source, std::span dest) { + std::span source_linear, std::span dest_tiled) { const u32 func_index = static_cast(params.pixel_format); - const MortonFunc swizzle = SWIZZLE_TABLE[func_index]; - u8* source_data = reinterpret_cast(source.data()); + const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index]; // TODO: Move memory access out of the morton function - swizzle(params.stride, params.height, source_data, params.addr, flush_start, flush_end); + SwizzleImpl(params.stride, params.height, source_linear, dest_tiled, params.addr, flush_start, flush_end); } void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end, - std::span source, std::span dest) { + std::span source_tiled, std::span dest_linear) { // TODO: Integrate this to UNSWIZZLE_TABLE if (params.type == SurfaceType::Texture) { Pica::Texture::TextureInfo tex_info{}; @@ -82,21 +81,19 @@ void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end, const auto rect = params.GetSubRect(params.FromInterval(load_interval)); DEBUG_ASSERT(params.FromInterval(load_interval).GetInterval() == load_interval); - const u8* source_data = reinterpret_cast(source.data()); + const u8* source_data = reinterpret_cast(source_tiled.data()); for (u32 y = rect.bottom; y < rect.top; y++) { for (u32 x = rect.left; x < rect.right; x++) { auto vec4 = Pica::Texture::LookupTexture(source_data, x, params.height - 1 - y, tex_info); const std::size_t offset = (x + (params.width * y)) * 4; - std::memcpy(dest.data() + offset, vec4.AsArray(), 4); + std::memcpy(dest_linear.data() + offset, vec4.AsArray(), 4); } } } else { const u32 func_index = static_cast(params.pixel_format); - const MortonFunc deswizzle = UNSWIZZLE_TABLE[func_index]; - u8* dest_data = reinterpret_cast(dest.data()); - - deswizzle(params.stride, params.height, dest_data, params.addr, load_start, load_end); + const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index]; + UnswizzleImpl(params.stride, params.height, dest_linear, source_tiled, params.addr, load_start, load_end); } } diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h index 79cd228be..660655c04 100644 --- a/src/video_core/rasterizer_cache/utils.h +++ b/src/video_core/rasterizer_cache/utils.h @@ -51,10 +51,10 @@ struct TextureCubeConfig { class SurfaceParams; void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end, - std::span source, std::span dest); + std::span source_linear, std::span dest_tiled); void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end, - std::span source, std::span dest); + std::span source_tiled, std::span dest_linear); [[nodiscard]] ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data);