diff --git a/src/video_core/rasterizer_cache/cached_surface.cpp b/src/video_core/rasterizer_cache/cached_surface.cpp
index 98d1a336d..a8ee5349a 100644
--- a/src/video_core/rasterizer_cache/cached_surface.cpp
+++ b/src/video_core/rasterizer_cache/cached_surface.cpp
@@ -35,19 +35,11 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
     const bool need_swap =
         GLES && (pixel_format == PixelFormat::RGBA8 || pixel_format == PixelFormat::RGB8);
 
-    const u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr);
+    u8* texture_ptr = VideoCore::g_memory->GetPhysicalPointer(addr);
     if (texture_ptr == nullptr) {
         return;
     }
 
-    const u32 byte_size = width * height * GetBytesPerPixel(pixel_format);
-    const auto texture_data = std::span<const std::byte>{reinterpret_cast<const std::byte*>(texture_ptr),
-                                                         byte_size};
-
-    if (gl_buffer.empty()) {
-        gl_buffer.resize(byte_size);
-    }
-
     // TODO: Should probably be done in ::Memory:: and check for other regions too
     if (load_start < Memory::VRAM_VADDR_END && load_end > Memory::VRAM_VADDR_END)
         load_end = Memory::VRAM_VADDR_END;
@@ -55,10 +47,16 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
     if (load_start < Memory::VRAM_VADDR && load_end > Memory::VRAM_VADDR)
         load_start = Memory::VRAM_VADDR;
 
-    MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad);
-
     ASSERT(load_start >= addr && load_end <= end);
+
     const u32 start_offset = load_start - addr;
+    const u32 byte_size = width * height * GetBytesPerPixel(pixel_format);
+
+    if (gl_buffer.empty()) {
+        gl_buffer.resize(byte_size);
+    }
+
+    MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad);
 
     if (!is_tiled) {
         ASSERT(type == SurfaceType::Color);
@@ -67,33 +65,31 @@ void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) {
             // cannot fully test this
             if (pixel_format == PixelFormat::RGBA8) {
                 for (std::size_t i = start_offset; i < load_end - addr; i += 4) {
-                    gl_buffer[i] = texture_ptr[i + 3];
-                    gl_buffer[i + 1] = texture_ptr[i + 2];
-                    gl_buffer[i + 2] = texture_ptr[i + 1];
-                    gl_buffer[i + 3] = texture_ptr[i];
+                    gl_buffer[i] = (std::byte)texture_ptr[i + 3];
+                    gl_buffer[i + 1] = (std::byte)texture_ptr[i + 2];
+                    gl_buffer[i + 2] = (std::byte)texture_ptr[i + 1];
+                    gl_buffer[i + 3] = (std::byte)texture_ptr[i];
                 }
             } else if (pixel_format == PixelFormat::RGB8) {
                 for (std::size_t i = start_offset; i < load_end - addr; i += 3) {
-                    gl_buffer[i] = texture_ptr[i + 2];
-                    gl_buffer[i + 1] = texture_ptr[i + 1];
-                    gl_buffer[i + 2] = texture_ptr[i];
+                    gl_buffer[i] = (std::byte)texture_ptr[i + 2];
+                    gl_buffer[i + 1] = (std::byte)texture_ptr[i + 1];
+                    gl_buffer[i + 2] = (std::byte)texture_ptr[i];
                 }
             }
         } else {
-            std::memcpy(&gl_buffer[start_offset], texture_ptr + start_offset,
-                        load_end - load_start);
+            std::memcpy(gl_buffer.data() + start_offset, texture_ptr + start_offset, load_end - load_start);
         }
     } else {
-        const auto dest_data = std::span<std::byte>{reinterpret_cast<std::byte*>(gl_buffer.data()),
-                                                    byte_size};
-        UnswizzleTexture(*this, load_start, load_end, texture_data, dest_data);
+        std::span<std::byte> texture_data{(std::byte*)texture_ptr, byte_size};
+        UnswizzleTexture(*this, load_start, load_end, texture_data, gl_buffer);
     }
 }
 
 MICROPROFILE_DEFINE(RasterizerCache_SurfaceFlush, "RasterizerCache", "Surface Flush",
                     MP_RGB(128, 192, 64));
 void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) {
-    u8* const dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr);
+    u8* dst_buffer = VideoCore::g_memory->GetPhysicalPointer(addr);
     if (dst_buffer == nullptr) {
         return;
     }
@@ -134,25 +130,24 @@ void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) {
         ASSERT(type == SurfaceType::Color);
         if (pixel_format == PixelFormat::RGBA8 && GLES) {
             for (std::size_t i = start_offset; i < flush_end - addr; i += 4) {
-                dst_buffer[i] = gl_buffer[i + 3];
-                dst_buffer[i + 1] = gl_buffer[i + 2];
-                dst_buffer[i + 2] = gl_buffer[i + 1];
-                dst_buffer[i + 3] = gl_buffer[i];
+                dst_buffer[i] = (u8)gl_buffer[i + 3];
+                dst_buffer[i + 1] = (u8)gl_buffer[i + 2];
+                dst_buffer[i + 2] = (u8)gl_buffer[i + 1];
+                dst_buffer[i + 3] = (u8)gl_buffer[i];
             }
         } else if (pixel_format == PixelFormat::RGB8 && GLES) {
             for (std::size_t i = start_offset; i < flush_end - addr; i += 3) {
-                dst_buffer[i] = gl_buffer[i + 2];
-                dst_buffer[i + 1] = gl_buffer[i + 1];
-                dst_buffer[i + 2] = gl_buffer[i];
+                dst_buffer[i] = (u8)gl_buffer[i + 2];
+                dst_buffer[i + 1] = (u8)gl_buffer[i + 1];
+                dst_buffer[i + 2] = (u8)gl_buffer[i];
             }
         } else {
             std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset],
                         flush_end - flush_start);
         }
     } else {
-        const auto source_data = std::span<std::byte>{reinterpret_cast<std::byte*>(gl_buffer.data()),
-                                                      byte_size};
-        SwizzleTexture(*this, flush_start, flush_end, source_data, {});
+        std::span<std::byte> texture_data{(std::byte*)dst_buffer + start_offset, byte_size};
+        SwizzleTexture(*this, flush_start, flush_end, gl_buffer, texture_data);
     }
 }
 
@@ -425,7 +420,7 @@ void CachedSurface::DownloadGLTexture(const Common::Rectangle<u32>& rect) {
             .region = rect
         };
 
-        runtime.ReadTexture(texture, subresource, tuple, gl_buffer.data());
+        runtime.ReadTexture(texture, subresource, tuple, (u8*)gl_buffer.data());
     }
 
     glPixelStorei(GL_PACK_ROW_LENGTH, 0);
diff --git a/src/video_core/rasterizer_cache/cached_surface.h b/src/video_core/rasterizer_cache/cached_surface.h
index dd689e88e..9a52f97c5 100644
--- a/src/video_core/rasterizer_cache/cached_surface.h
+++ b/src/video_core/rasterizer_cache/cached_surface.h
@@ -104,7 +104,7 @@ public:
 public:
     bool registered = false;
     SurfaceRegions invalid_regions;
-    std::vector<u8> gl_buffer;
+    std::vector<std::byte> gl_buffer;
 
     // Number of bytes to read from fill_data
     u32 fill_size = 0;
diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h
index 7e9200743..372d1f823 100644
--- a/src/video_core/rasterizer_cache/morton_swizzle.h
+++ b/src/video_core/rasterizer_cache/morton_swizzle.h
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #pragma once
+#include <span>
+#include <bit>
 #include "common/alignment.h"
 #include "core/memory.h"
 #include "video_core/rasterizer_cache/pixel_format.h"
@@ -12,50 +14,54 @@
 
 namespace OpenGL {
 
+/// Copies the first sizeof(u32) bytes of `bytes` into a u32 (size is not checked here).
+inline u32 MakeInt(std::span<const std::byte> bytes) {
+    u32 integer{};
+    std::memcpy(&integer, bytes.data(), sizeof(u32));
+    return integer;
+}
+
 template <bool morton_to_linear, PixelFormat format>
-static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) {
+inline void MortonCopyTile(u32 stride, std::span<std::byte> tile_buffer, std::span<std::byte> linear_buffer) {
     constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
-    constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format);
+    constexpr u32 linear_bytes_per_pixel = GetBytesPerPixel(format);
 
     for (u32 y = 0; y < 8; y++) {
         for (u32 x = 0; x < 8; x++) {
-            u8* tile_ptr = tile_buffer + VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
-            u8* linear_ptr = linear_buffer + ((7 - y) * stride + x) * aligned_bytes_per_pixel;
+            const u32 tile_offset = VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
+            const u32 linear_offset = ((7 - y) * stride + x) * linear_bytes_per_pixel;
+            auto tile_pixel = tile_buffer.subspan(tile_offset, bytes_per_pixel);
+            // Same width as tile_pixel: the padded width can overrun a byte-shifted (D24) span
+            auto linear_pixel = linear_buffer.subspan(linear_offset, bytes_per_pixel);
             if constexpr (morton_to_linear) {
                 if constexpr (format == PixelFormat::D24S8) {
-                    linear_ptr[0] = tile_ptr[3];
-                    std::memcpy(linear_ptr + 1, tile_ptr, 3);
+                    const u32 s8d24 = MakeInt(tile_pixel);
+                    const u32 d24s8 = std::rotl(s8d24, 8);
+                    std::memcpy(linear_pixel.data(), &d24s8, sizeof(u32));
                 } else if (format == PixelFormat::RGBA8 && GLES) {
-                    // because GLES does not have ABGR format
-                    // so we will do byteswapping here
-                    linear_ptr[0] = tile_ptr[3];
-                    linear_ptr[1] = tile_ptr[2];
-                    linear_ptr[2] = tile_ptr[1];
-                    linear_ptr[3] = tile_ptr[0];
+                    const u32 abgr = MakeInt(tile_pixel);
+                    const u32 rgba = std::byteswap(abgr);
+                    std::memcpy(linear_pixel.data(), &rgba, sizeof(u32));
                 } else if (format == PixelFormat::RGB8 && GLES) {
-                    linear_ptr[0] = tile_ptr[2];
-                    linear_ptr[1] = tile_ptr[1];
-                    linear_ptr[2] = tile_ptr[0];
+                    std::memcpy(linear_pixel.data(), tile_pixel.data(), 3);
+                    std::swap(linear_pixel[0], linear_pixel[2]);
                 } else {
-                    std::memcpy(linear_ptr, tile_ptr, bytes_per_pixel);
+                    std::memcpy(linear_pixel.data(), tile_pixel.data(), bytes_per_pixel);
                 }
             } else {
                 if constexpr (format == PixelFormat::D24S8) {
-                    std::memcpy(tile_ptr, linear_ptr + 1, 3);
-                    tile_ptr[3] = linear_ptr[0];
+                    const u32 d24s8 = MakeInt(linear_pixel);
+                    const u32 s8d24 = std::rotr(d24s8, 8);
+                    std::memcpy(tile_pixel.data(), &s8d24, sizeof(u32));
                 } else if (format == PixelFormat::RGBA8 && GLES) {
-                    // because GLES does not have ABGR format
-                    // so we will do byteswapping here
-                    tile_ptr[0] = linear_ptr[3];
-                    tile_ptr[1] = linear_ptr[2];
-                    tile_ptr[2] = linear_ptr[1];
-                    tile_ptr[3] = linear_ptr[0];
+                    const u32 rgba = MakeInt(linear_pixel);
+                    const u32 abgr = std::byteswap(rgba);
+                    std::memcpy(tile_pixel.data(), &abgr, sizeof(u32));
                 } else if (format == PixelFormat::RGB8 && GLES) {
-                    tile_ptr[0] = linear_ptr[2];
-                    tile_ptr[1] = linear_ptr[1];
-                    tile_ptr[2] = linear_ptr[0];
+                    std::memcpy(tile_pixel.data(), linear_pixel.data(), 3);
+                    std::swap(tile_pixel[0], tile_pixel[2]);
                 } else {
-                    std::memcpy(tile_ptr, linear_ptr, bytes_per_pixel);
+                    std::memcpy(tile_pixel.data(), linear_pixel.data(), bytes_per_pixel);
                 }
             }
         }
@@ -63,13 +69,20 @@ static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* linear_buffer) {
 }
 
 template <bool morton_to_linear, PixelFormat format>
-static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PAddr start, PAddr end) {
-    constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
-    constexpr u32 tile_size = bytes_per_pixel * 64;
+static void MortonCopy(u32 stride, u32 height,
+                       std::span<std::byte> linear_buffer, std::span<std::byte> tiled_buffer,
+                       PAddr base, PAddr start, PAddr end) {
 
+    constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8;
     constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format);
     static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, "");
-    linear_buffer += aligned_bytes_per_pixel - bytes_per_pixel;
+
+    constexpr u32 tile_size = bytes_per_pixel * 64;
+    const u32 linear_tile_size = (7 * stride + 8) * aligned_bytes_per_pixel;
+
+    // Non-zero only for D24: starting one byte into the linear buffer lets every
+    // pixel be written in place without a byteswap
+    u32 linear_offset = aligned_bytes_per_pixel - bytes_per_pixel;
 
     const PAddr aligned_down_start = base + Common::AlignDown(start - base, tile_size);
     const PAddr aligned_start = base + Common::AlignUp(start - base, tile_size);
@@ -84,18 +97,19 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
     // In OpenGL the texture origin is in the bottom left corner as opposed to other
     // APIs that have it at the top left. To avoid flipping texture coordinates in
     // the shader we read/write the linear buffer backwards
-    linear_buffer += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel;
+    // The tile position is kept in linear_offset rather than by advancing a raw pointer
+    linear_offset += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel;
 
     auto linear_next_tile = [&] {
         x = (x + 8) % stride;
-        linear_buffer += 8 * aligned_bytes_per_pixel;
+        linear_offset += 8 * aligned_bytes_per_pixel;
         if (!x) {
             y  = (y + 8) % height;
             if (!y) {
                 return;
             }
 
-            linear_buffer -= stride * 9 * aligned_bytes_per_pixel;
+            linear_offset -= stride * 9 * aligned_bytes_per_pixel;
         }
     };
 
@@ -104,8 +118,10 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
     // If during a texture download the start coordinate is inside a tile, swizzle
     // the tile to a temporary buffer and copy the part we are interested in
     if (start < aligned_start && !morton_to_linear) {
-        std::array<u8, tile_size> tmp_buf;
-        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf.data(), linear_buffer);
+        std::array<std::byte, tile_size> tmp_buf;
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
+
+        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf, linear_data);
         std::memcpy(tile_buffer, tmp_buf.data() + start - aligned_down_start,
                     std::min(aligned_start, end) - start);
 
@@ -124,19 +140,23 @@ static void MortonCopy(u32 stride, u32 height, u8* linear_buffer, PAddr base, PA
 
     const u8* buffer_end = tile_buffer + aligned_end - aligned_start;
     while (tile_buffer < buffer_end) {
-        MortonCopyTile<morton_to_linear, format>(stride, tile_buffer, linear_buffer);
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
+        auto tiled_data = std::span<std::byte>{(std::byte*)tile_buffer, tile_size};
+
+        MortonCopyTile<morton_to_linear, format>(stride, tiled_data, linear_data);
         tile_buffer += tile_size;
         linear_next_tile();
     }
 
     if (end > std::max(aligned_start, aligned_end) && !morton_to_linear) {
-        std::array<u8, tile_size> tmp_buf;
-        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf.data(), linear_buffer);
+        std::array<std::byte, tile_size> tmp_buf;
+        std::span<std::byte> linear_data = linear_buffer.last(linear_buffer.size() - linear_offset);
+        MortonCopyTile<morton_to_linear, format>(stride, tmp_buf, linear_data);
         std::memcpy(tile_buffer, tmp_buf.data(), end - aligned_end);
     }
 }
 
-using MortonFunc = void (*)(u32, u32, u8*, PAddr, PAddr, PAddr);
+using MortonFunc = void (*)(u32, u32, std::span<std::byte>, std::span<std::byte>, PAddr, PAddr, PAddr);
 
 static constexpr std::array<MortonFunc, 18> UNSWIZZLE_TABLE = {
     MortonCopy<true, PixelFormat::RGBA8>,  // 0
diff --git a/src/video_core/rasterizer_cache/utils.cpp b/src/video_core/rasterizer_cache/utils.cpp
index a819100fd..7f67b65a2 100644
--- a/src/video_core/rasterizer_cache/utils.cpp
+++ b/src/video_core/rasterizer_cache/utils.cpp
@@ -58,17 +58,16 @@ const FormatTuple& GetFormatTuple(PixelFormat pixel_format) {
 }
 
 void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end,
-                    std::span<std::byte> source, std::span<std::byte> dest) {
+                    std::span<std::byte> source_linear, std::span<std::byte> dest_tiled) {
     const u32 func_index = static_cast<u32>(params.pixel_format);
-    const MortonFunc swizzle = SWIZZLE_TABLE[func_index];
-    u8* source_data = reinterpret_cast<u8*>(source.data());
+    const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index];
 
     // TODO: Move memory access out of the morton function
-    swizzle(params.stride, params.height, source_data, params.addr, flush_start, flush_end);
+    SwizzleImpl(params.stride, params.height, source_linear, dest_tiled, params.addr, flush_start, flush_end);
 }
 
 void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
-                      std::span<const std::byte> source, std::span<std::byte> dest) {
+                      std::span<std::byte> source_tiled, std::span<std::byte> dest_linear) {
     // TODO: Integrate this to UNSWIZZLE_TABLE
     if (params.type == SurfaceType::Texture) {
         Pica::Texture::TextureInfo tex_info{};
@@ -82,21 +81,19 @@ void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
         const auto rect = params.GetSubRect(params.FromInterval(load_interval));
         DEBUG_ASSERT(params.FromInterval(load_interval).GetInterval() == load_interval);
 
-        const u8* source_data = reinterpret_cast<const u8*>(source.data());
+        const u8* source_data = reinterpret_cast<const u8*>(source_tiled.data());
         for (u32 y = rect.bottom; y < rect.top; y++) {
             for (u32 x = rect.left; x < rect.right; x++) {
                 auto vec4 =
                     Pica::Texture::LookupTexture(source_data, x, params.height - 1 - y, tex_info);
                 const std::size_t offset = (x + (params.width * y)) * 4;
-                std::memcpy(dest.data() + offset, vec4.AsArray(), 4);
+                std::memcpy(dest_linear.data() + offset, vec4.AsArray(), 4);
             }
         }
     } else {
         const u32 func_index = static_cast<u32>(params.pixel_format);
-        const MortonFunc deswizzle = UNSWIZZLE_TABLE[func_index];
-        u8* dest_data = reinterpret_cast<u8*>(dest.data());
-
-        deswizzle(params.stride, params.height, dest_data, params.addr, load_start, load_end);
+        const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index];
+        UnswizzleImpl(params.stride, params.height, dest_linear, source_tiled, params.addr, load_start, load_end);
     }
 }
 
diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h
index 79cd228be..660655c04 100644
--- a/src/video_core/rasterizer_cache/utils.h
+++ b/src/video_core/rasterizer_cache/utils.h
@@ -51,10 +51,10 @@ struct TextureCubeConfig {
 class SurfaceParams;
 
 void SwizzleTexture(const SurfaceParams& params, u32 flush_start, u32 flush_end,
-                    std::span<std::byte> source, std::span<std::byte> dest);
+                    std::span<std::byte> source_linear, std::span<std::byte> dest_tiled);
 
 void UnswizzleTexture(const SurfaceParams& params, u32 load_start, u32 load_end,
-                      std::span<const std::byte> source, std::span<std::byte> dest);
+                      std::span<std::byte> source_tiled, std::span<std::byte> dest_linear);
 
 [[nodiscard]] ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data);