morton_swizzle: Optimize and use std::span

2022-09-08 15:37:27 +03:00
parent 725afe33ef
commit 307154a06f
5 changed files with 102 additions and 90 deletions
--- a/src/video_core/rasterizer_cache/cached_surface.cpp
+++ b/src/video_core/rasterizer_cache/cached_surface.cpp
@ -34,19 +34,11 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
    const bool need_swap =
        GLES && (pixel_format == PixelFormat::RGBA8 || pixel_format == PixelFormat::RGB8);
-    const u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr);
+    u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr);
    if (texture_ptr == nullptr) {
        return;
    }
    const u32 byte_size = width * height * GetBytesPerPixel(pixel_format);
    const auto texture_data = std::span<const std::byte>{reinterpret_cast<const std::byte*>(texture_ptr),
                                                         byte_size};
    if (gl_buffer.empty()) {
        gl_buffer.resize(byte_size);
    }
    // TODO: Should probably be done in ::Memory:: and check for other regions too
    if (load_start < Memory::VRAM_VADDR_END && load_end > Memory::VRAM_VADDR_END)
        load_end = Memory::VRAM_VADDR_END;
@ -54,10 +46,16 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
    if (load_start < Memory::VRAM_VADDR && load_end > Memory::VRAM_VADDR)
        load_start = Memory::VRAM_VADDR;
    MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad);
    ASSERT(load_start >= addr && load_end <= end);
    const u32 start_offset = load_start - addr;
    const u32 byte_size = width * height * GetBytesPerPixel(pixel_format);
    if (gl_buffer.empty()) {
        gl_buffer.resize(byte_size);
    }
    MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad);
    if (!is_tiled) {
        ASSERT(type == SurfaceType::Color);
@ -66,32 +64,30 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
            // cannot fully test this
            if (pixel_format == PixelFormat::RGBA8) {
                for (std::size_t i = start_offset; i < load_end - addr; i += 4) {
-                    gl_buffer[i] = texture_ptr[i + 3];
+                    gl_buffer[i] = (std::byte)texture_ptr[i + 3];
-                    gl_buffer[i + 1] = texture_ptr[i + 2];
+                    gl_buffer[i + 1] = (std::byte)texture_ptr[i + 2];
-                    gl_buffer[i + 2] = texture_ptr[i + 1];
+                    gl_buffer[i + 2] = (std::byte)texture_ptr[i + 1];
-                    gl_buffer[i + 3] = texture_ptr[i];
+                    gl_buffer[i + 3] = (std::byte)texture_ptr[i];
                }
            } else if (pixel_format == PixelFormat::RGB8) {
                for (std::size_t i = start_offset; i < load_end - addr; i += 3) {
-                    gl_buffer[i] = texture_ptr[i + 2];
+                    gl_buffer[i] = (std::byte)texture_ptr[i + 2];
-                    gl_buffer[i + 1] = texture_ptr[i + 1];
+                    gl_buffer[i + 1] = (std::byte)texture_ptr[i + 1];
-                    gl_buffer[i + 2] = texture_ptr[i];
+                    gl_buffer[i + 2] = (std::byte)texture_ptr[i];
                }
            }
        } else {
-            std::memcpy(&gl_buffer[start_offset], texture_ptr + start_offset,
+            std::memcpy(gl_buffer.data() + start_offset, texture_ptr + start_offset, load_end - load_start);
                        load_end - load_start);
        }
    } else {
-        const auto dest_data = std::span<std::byte>{reinterpret_cast<std::byte*>(gl_buffer.data()),
+        std::span<std::byte> texture_data{(std::byte*)texture_ptr, byte_size};
-                                                    byte_size};
+        UnswizzleTexture(*this, load_start, load_end, texture_data, gl_buffer);
        UnswizzleTexture(*this, load_start, load_end, texture_data, dest_data);
    }
 }
 MICROPROFILE_DEFINE(RasterizerCache_SurfaceFlush, "RasterizerCache", "Surface Flush", MP_RGB(128, 192, 64));
 void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) {
-    u8* const dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr);
+    u8* dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr);
    if (dst_buffer == nullptr) {
        return;
    }
@ -132,25 +128,24 @@ void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) {
        ASSERT(type == SurfaceType::Color);
        if (pixel_format == PixelFormat::RGBA8 && GLES) {
            for (std::size_t i = start_offset; i < flush_end - addr; i += 4) {
-                dst_buffer[i] = gl_buffer[i + 3];
+                dst_buffer[i] = (u8)gl_buffer[i + 3];
-                dst_buffer[i + 1] = gl_buffer[i + 2];
+                dst_buffer[i + 1] = (u8)gl_buffer[i + 2];
-                dst_buffer[i + 2] = gl_buffer[i + 1];
+                dst_buffer[i + 2] = (u8)gl_buffer[i + 1];
-                dst_buffer[i + 3] = gl_buffer[i];
+                dst_buffer[i + 3] = (u8)gl_buffer[i];
            }
        } else if (pixel_format == PixelFormat::RGB8 && GLES) {
            for (std::size_t i = start_offset; i < flush_end - addr; i += 3) {
-                dst_buffer[i] = gl_buffer[i + 2];
+                dst_buffer[i] = (u8)gl_buffer[i + 2];
-                dst_buffer[i + 1] = gl_buffer[i + 1];
+                dst_buffer[i + 1] = (u8)gl_buffer[i + 1];
-                dst_buffer[i + 2] = gl_buffer[i];
+                dst_buffer[i + 2] = (u8)gl_buffer[i];
            }
        } else {
            std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset],
                        flush_end - flush_start);
        }
    } else {
-        const auto source_data = std::span<std::byte>{reinterpret_cast<std::byte*>(gl_buffer.data()),
+        std::span<std::byte> texture_data{(std::byte*)dst_buffer + start_offset, byte_size};
-                                                      byte_size};
+        SwizzleTexture(*this, flush_start, flush_end, gl_buffer, texture_data);
        SwizzleTexture(*this, flush_start, flush_end, source_data, {});
    }
 }
@ -421,7 +416,7 @@ void CachedSurface::DownloadGLTexture(const Common::Rectangle<u32>& rect) {
            .region = rect
        };
-        runtime.ReadTexture(texture, subresource, tuple, gl_buffer.data());
+        runtime.ReadTexture(texture, subresource, tuple, (u8*)gl_buffer.data());
    }
    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
--- a/src/video_core/rasterizer_cache/cached_surface.h
+++ b/src/video_core/rasterizer_cache/cached_surface.h
@ -104,7 +104,7 @@ public:
 public:
    bool registered = false;
    SurfaceRegions invalid_regions;
-    std::vector<u8> gl_buffer;
+    std::vector<std::byte> gl_buffer;
    // Number of bytes to read from fill_data
    u32 fill_size = 0;
--- a/src/video_core/rasterizer_cache/morton_swizzle.h
+++ b/src/video_core/rasterizer_cache/morton_swizzle.h
@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 #pragma once
 #include <span>
 #include <bit>
 #include "common/alignment.h"
 #include "core/memory.h"
 #include "video_core/rasterizer_cache/pixel_format.h"
@ -12,50 +14,54 @@
 namespace OpenGL {
 // Reads the first sizeof(u32) bytes of `bytes` into a u32 using the host's native byte order.
 // The span is taken as read-only so that views over const data can be passed as well;
 // a std::span<std::byte> argument still converts implicitly.
 // Precondition: bytes.size() >= sizeof(u32) -- the size is not checked here.
 inline u32 MakeInt(std::span<const std::byte> bytes) {
    u32 integer{};
    // memcpy rather than a pointer cast: no alignment or aliasing requirements on the source
    std::memcpy(&integer, bytes.data(), sizeof(u32));
    return integer;
 }
 template <bool morton_to_linear, PixelFormat format>
-static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) {
+inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer, std::span<std::byte> linear_buffer) {
    constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
-    constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format);
+    constexpr u32 linear_bytes_per_pixel = GetBytesPerPixel(format);
    for (u32 y = 0; y < 8; y++) {
        for (u32 x = 0; x < 8; x++) {
-            u8* tile_ptr = tile_buffer + VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
+            const u32 tile_offset = VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
-            u8* linear_ptr = linear_buffer + ((7 - y) * stride + x) * aligned_bytes_per_pixel;
+            const u32 linear_offset = ((7 - y) * stride + x) * linear_bytes_per_pixel;
            auto tile_pixel = tile_buffer.subspan(tile_offset, bytes_per_pixel);
            auto linear_pixel = linear_buffer.subspan(linear_offset, linear_bytes_per_pixel);
            if constexpr (morton_to_linear) {
                if constexpr (format == PixelFormat::D24S8) {
-                    linear_ptr[0] = tile_ptr[3];
+                    const u32 s8d24 = MakeInt(tile_pixel);
-                    std::memcpy(linear_ptr + 1, tile_ptr, 3);
+                    const u32 d24s8 = std::rotl(s8d24, 8);
                    std::memcpy(linear_pixel.data(), &d24s8, sizeof(u32));
                } else if (format == PixelFormat::RGBA8 && GLES) {
-                    // because GLES does not have ABGR format
+                    const u32 abgr = MakeInt(tile_pixel);
-                    // so we will do byteswapping here
+                    const u32 rgba = std::byteswap(abgr);
-                    linear_ptr[0] = tile_ptr[3];
+                    std::memcpy(linear_pixel.data(), &rgba, sizeof(u32));
                    linear_ptr[1] = tile_ptr[2];
                    linear_ptr[2] = tile_ptr[1];
                    linear_ptr[3] = tile_ptr[0];
                } else if (format == PixelFormat::RGB8 && GLES) {
-                    linear_ptr[0] = tile_ptr[2];
+                    std::memcpy(linear_pixel.data(), tile_pixel.data(), 3);
-                    linear_ptr[1] = tile_ptr[1];
+                    std::swap(linear_pixel[0], linear_pixel[2]);
                    linear_ptr[2] = tile_ptr[0];
                } else {
-                    std::memcpy(linear_ptr, tile_ptr, bytes_per_pixel);
+                    std::memcpy(linear_pixel.data(), tile_pixel.data(), bytes_per_pixel);
                }
            } else {
                if constexpr (format == PixelFormat::D24S8) {
-                    std::memcpy(tile_ptr, linear_ptr + 1, 3);
+                    const u32 d24s8 = MakeInt(linear_pixel);
-                    tile_ptr[3] = linear_ptr[0];
+                    const u32 s8d24 = std::rotr(d24s8, 8);
                    std::memcpy(tile_pixel.data(), &s8d24, sizeof(u32));
                } else if (format == PixelFormat::RGBA8 && GLES) {
-                    // because GLES does not have ABGR format
+                    const u32 rgba = MakeInt(linear_pixel);
-                    // so we will do byteswapping here
+                    const u32 abgr = std::byteswap(rgba);
-                    tile_ptr[0] = linear_ptr[3];
+                    std::memcpy(tile_pixel.data(), &abgr, sizeof(u32));
                    tile_ptr[1] = linear_ptr[2];
                    tile_ptr[2] = linear_ptr[1];
                    tile_ptr[3] = linear_ptr[0];
                } else if (format == PixelFormat::RGB8 && GLES) {
-                    tile_ptr[0] = linear_ptr[2];
+                    std::memcpy(tile_pixel.data(), linear_pixel.data(), 3);
-                    tile_ptr[1] = linear_ptr[1];
+                    std::swap(tile_pixel[0], tile_pixel[2]);
                    tile_ptr[2] = linear_ptr[0];
                } else {
-                    std::memcpy(tile_ptr, linear_ptr, bytes_per_pixel);
+                    std::memcpy(tile_pixel.data(), linear_pixel.data(), bytes_per_pixel);
                }
            }
        }
@ -63,13 +69,20 @@ static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) {
 }
 template <bool morton_to_linear, PixelFormat format>
-static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PAddr start, PAddr end) {
+static void MortonCopy(u32 stride, u32 height,
-    constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
+                       std::span<std::byte> linear_buffer, std::span<std::byte> tiled_buffer,
-    constexpr u32 tile_size = bytes_per_pixel * 64;
+                       PAddr base, PAddr start, PAddr end) {
    constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
    constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format);
    static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, "");
-    linear_buffer += aligned_bytes_per_pixel - bytes_per_pixel;
+
    constexpr u32 tile_size = bytes_per_pixel * 64;
    const u32 linear_tile_size = (7 * stride + 8) * aligned_bytes_per_pixel;
    // This only applies to the D24 format: by shifting the span by one byte, all pixels
    // are written properly without a byteswap
    u32 linear_offset = aligned_bytes_per_pixel - bytes_per_pixel;
    const PAddr aligned_down_start = base + Common::AlignDown(start - base, tile_size);
    const PAddr aligned_start = base + Common::AlignUp(start - base, tile_size);
@ -84,18 +97,19 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
    // In OpenGL the texture origin is in the bottom left corner as opposed to other
    // APIs that have it at the top left. To avoid flipping texture coordinates in
    // the shader we read/write the linear buffer backwards
-    linear_buffer += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel;
+    //linear_buffer += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel;
    linear_offset += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel;
    auto linear_next_tile = [&] {
        x = (x + 8) % stride;
-        linear_buffer += 8 * aligned_bytes_per_pixel;
+        linear_offset += 8 * aligned_bytes_per_pixel;
        if (!x) {
            y  = (y + 8) % height;
            if (!y) {
                return;
            }
-            linear_buffer -= stride * 9 * aligned_bytes_per_pixel;
+            linear_offset -= stride * 9 * aligned_bytes_per_pixel;
        }
    };
@ -104,8 +118,10 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
    // If during a texture download the start coordinate is inside a tile, swizzle
    // the tile to a temporary buffer and copy the part we are interested in
    if (start < aligned_start && !morton_to_linear) {
-        std::array<u8, tile_size> tmp_buf;
+        std::array<std::byte, tile_size> tmp_buf;
-        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf.data(), linear_buffer);
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf, linear_data);
        std::memcpy(tile_buffer, tmp_buf.data() + start - aligned_down_start,
                    std::min(aligned_start, end) - start);
@ -124,19 +140,23 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
    const u8* buffer_end = tile_buffer + aligned_end - aligned_start;
    while (tile_buffer < buffer_end) {
-        MortonCopyTile<morton_to_linear, format>(stride, tile_buffer, linear_buffer);
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
        auto tiled_data = std::span<std::byte>{(std::byte*)tile_buffer, tile_size};
        MortonCopyTile<morton_to_linear, format>(stride, tiled_data, linear_data);
        tile_buffer += tile_size;
        linear_next_tile();
    }
    if (end > std::max(aligned_start, aligned_end) && !morton_to_linear) {
-        std::array<u8, tile_size> tmp_buf;
+        std::array<std::byte, tile_size> tmp_buf;
-        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf.data(), linear_buffer);
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf, linear_data);
        std::memcpy(tile_buffer, tmp_buf.data(), end - aligned_end);
    }
 }
-using MortonFunc = void (*)(u32, u32, u8*, PAddr, PAddr, PAddr);
+using MortonFunc = void (*)(u32, u32, std::span<std::byte>, std::span<std::byte>, PAddr, PAddr, PAddr);
 static constexpr std::array<MortonFunc, 18> UNSWIZZLE_TABLE = {
    MortonCopy<true, PixelFormat::RGBA8>,  // 0
--- a/src/video_core/rasterizer_cache/utils.cpp
+++ b/src/video_core/rasterizer_cache/utils.cpp
@ -58,17 +58,16 @@ const FormatTuple& GetFormatTuple(PixelFormat pixel_format) {
 }
 void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end,
-                    std::span<std::byte> source, std::span<std::byte> dest) {
+                    std::span<std::byte> source_linear, std::span<std::byte> dest_tiled) {
    const u32 func_index = static_cast<u32>(params.pixel_format);
-    const MortonFunc swizzle = SWIZZLE_TABLE[func_index];
+    const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index];
    u8* source_data = reinterpret_cast<u8*>(source.data());
    // TODO: Move memory access out of the morton function
-    swizzle(params.stride, params.height, source_data, params.addr, flush_start, flush_end);
+    SwizzleImpl(params.stride, params.height, source_linear, dest_tiled, params.addr, flush_start, flush_end);
 }
 void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
-                      std::span<const std::byte> source, std::span<std::byte> dest) {
+                      std::span<std::byte> source_tiled, std::span<std::byte> dest_linear) {
    // TODO: Integrate this to UNSWIZZLE_TABLE
    if (params.type == SurfaceType::Texture) {
        Pica::Texture::TextureInfo tex_info{};
@ -82,21 +81,19 @@ void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
        const auto rect = params.GetSubRect(params.FromInterval(load_interval));
        DEBUG_ASSERT(params.FromInterval(load_interval).GetInterval() == load_interval);
-        const u8* source_data = reinterpret_cast<const u8*>(source.data());
+        const u8* source_data = reinterpret_cast<const u8*>(source_tiled.data());
        for (u32 y = rect.bottom; y < rect.top; y++) {
            for (u32 x = rect.left; x < rect.right; x++) {
                auto vec4 =
                    Pica::Texture::LookupTexture(source_data, x, params.height - 1 - y, tex_info);
                const std::size_t offset = (x + (params.width * y)) * 4;
-                std::memcpy(dest.data() + offset, vec4.AsArray(), 4);
+                std::memcpy(dest_linear.data() + offset, vec4.AsArray(), 4);
            }
        }
    } else {
        const u32 func_index = static_cast<u32>(params.pixel_format);
-        const MortonFunc deswizzle = UNSWIZZLE_TABLE[func_index];
+        const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index];
-        u8* dest_data = reinterpret_cast<u8*>(dest.data());
+        UnswizzleImpl(params.stride, params.height, dest_linear, source_tiled, params.addr, load_start, load_end);
        deswizzle(params.stride, params.height, dest_data, params.addr, load_start, load_end);
    }
 }
--- a/src/video_core/rasterizer_cache/utils.h
+++ b/src/video_core/rasterizer_cache/utils.h
@ -51,10 +51,10 @@ struct TextureCubeConfig {
 class SurfaceParams;
 void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end,
-                    std::span<std::byte> source, std::span<std::byte> dest);
+                    std::span<std::byte> source_linear, std::span<std::byte> dest_tiled);
 void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
-                      std::span<const std::byte> source, std::span<std::byte> dest);
+                      std::span<std::byte> source_tiled, std::span<std::byte> dest_linear);
 [[nodiscard]] ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data);