From 9991b9b12be40d148b3f2e3e601dd2323b493df2 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 10 Oct 2022 14:08:34 +0300 Subject: [PATCH] renderer_vulkan: Optimize tiled format conversion + fix vertex buffer alignment * Integrate format conversion into the morton copy function, removing the need for an intermediate copy and conversion pass. This should be beneficial for performance, especially since most games use tiled textures * Also bump vertex buffer size to avoid crashes with hardware shaders and provide correct offset on normal draws, which fixes glitches in Pokemon Y * Reduce the local group size to 8 in the D24S8 compute shader, which fixes graphical issues in the aforementioned Pokemon games at native resolution * Set LOD to 0 instead of 0.25 to fix another glitch in Pokemon Y --- .../configuration/configure_graphics.cpp | 1 + .../rasterizer_cache/morton_swizzle.h | 77 +++++++++-- .../rasterizer_cache/rasterizer_cache.h | 12 +- src/video_core/rasterizer_cache/utils.cpp | 11 +- src/video_core/rasterizer_cache/utils.h | 6 +- .../gl_format_reinterpreter.cpp | 4 +- .../renderer_opengl/gl_texture_runtime.cpp | 5 + .../renderer_opengl/gl_texture_runtime.h | 3 + .../vk_format_reinterpreter.cpp | 4 +- .../renderer_vulkan/vk_rasterizer.cpp | 121 +++++++++++++----- .../renderer_vulkan/vk_rasterizer.h | 8 +- .../renderer_vulkan/vk_shader_gen.cpp | 11 +- .../renderer_vulkan/vk_swapchain.cpp | 27 ++-- src/video_core/renderer_vulkan/vk_swapchain.h | 1 - .../renderer_vulkan/vk_texture_runtime.cpp | 66 +++++----- .../renderer_vulkan/vk_texture_runtime.h | 8 +- src/video_core/texture/texture_decode.cpp | 21 +++ src/video_core/texture/texture_decode.h | 4 + 18 files changed, 277 insertions(+), 113 deletions(-) diff --git a/src/citra_qt/configuration/configure_graphics.cpp b/src/citra_qt/configuration/configure_graphics.cpp index d357c7024..48abc4a15 100644 --- a/src/citra_qt/configuration/configure_graphics.cpp +++ b/src/citra_qt/configuration/configure_graphics.cpp @@ -26,6 
+26,7 @@ ConfigureGraphics::ConfigureGraphics(QWidget* parent) ui->graphics_api_combo->setEnabled(not_running); ui->toggle_shader_jit->setEnabled(not_running); ui->toggle_disk_shader_cache->setEnabled(hw_renderer_enabled && not_running); + ui->physical_device_combo->setEnabled(not_running); SetPhysicalDeviceComboVisibility(ui->graphics_api_combo->currentIndex()); connect(ui->graphics_api_combo, qOverload(&QComboBox::currentIndexChanged), this, diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h index c3bf55c44..3cc3099dd 100644 --- a/src/video_core/rasterizer_cache/morton_swizzle.h +++ b/src/video_core/rasterizer_cache/morton_swizzle.h @@ -22,13 +22,31 @@ inline T MakeInt(const std::byte* bytes) { return integer; } -template +template inline void DecodePixel(const std::byte* source, std::byte* dest) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; if constexpr (format == PixelFormat::D24S8) { const u32 d24s8 = std::rotl(MakeInt(source), 8); std::memcpy(dest, &d24s8, sizeof(u32)); + } else if constexpr (format == PixelFormat::RGBA8 && converted) { + const u32 rgba = MakeInt(source); + const u32 abgr = Common::swap32(rgba); + std::memcpy(dest, &abgr, 4); + } else if constexpr (format == PixelFormat::RGB8 && converted) { + u32 rgb{}; + std::memcpy(&rgb, source, 3); + const u32 abgr = Common::swap32(rgb << 8) | 0xFF000000; + std::memcpy(dest, &abgr, 4); + } else if constexpr (format == PixelFormat::RGB565 && converted) { + const auto abgr = Color::DecodeRGB565(reinterpret_cast(source)); + std::memcpy(dest, abgr.AsArray(), 4); + } else if constexpr (format == PixelFormat::RGB5A1 && converted) { + const auto abgr = Color::DecodeRGB5A1(reinterpret_cast(source)); + std::memcpy(dest, abgr.AsArray(), 4); + } else if constexpr (format == PixelFormat::RGBA4 && converted) { + const auto abgr = Color::DecodeRGBA4(reinterpret_cast(source)); + std::memcpy(dest, abgr.AsArray(), 4); } else if constexpr (format 
== PixelFormat::IA8) { std::memset(dest, static_cast(source[1]), 3); dest[3] = source[0]; @@ -95,23 +113,43 @@ inline void DecodePixelETC1(u32 x, u32 y, const std::byte* source_tile, std::byt dest_pixel[3] = std::byte{alpha}; } -template +template inline void EncodePixel(const std::byte* source, std::byte* dest) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; if constexpr (format == PixelFormat::D24S8) { const u32 s8d24 = std::rotr(MakeInt(source), 8); std::memcpy(dest, &s8d24, sizeof(u32)); + } else if constexpr (format == PixelFormat::RGBA8 && converted) { + const u32 abgr = MakeInt(source); + const u32 rgba = Common::swap32(abgr); + std::memcpy(dest, &rgba, 4); + } else if constexpr (format == PixelFormat::RGB8 && converted) { + const u32 abgr = MakeInt(source); + const u32 rgb = Common::swap32(abgr << 8); + std::memcpy(dest, &rgb, 3); + } else if constexpr (format == PixelFormat::RGB565 && converted) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source, 4); + Color::EncodeRGB565(rgba, reinterpret_cast(dest)); + } else if constexpr (format == PixelFormat::RGB5A1 && converted) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source, 4); + Color::EncodeRGB5A1(rgba, reinterpret_cast(dest)); + } else if constexpr (format == PixelFormat::RGBA4 && converted) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source, 4); + Color::EncodeRGBA4(rgba, reinterpret_cast(dest)); } else { std::memcpy(dest, source, bytes_per_pixel); } } -template +template inline void MortonCopyTile(u32 stride, std::span tile_buffer, std::span linear_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; - constexpr u32 linear_bytes_per_pixel = GetBytesPerPixel(format); + constexpr u32 linear_bytes_per_pixel = converted ? 
4 : GetBytesPerPixel(format); constexpr bool is_compressed = format == PixelFormat::ETC1 || format == PixelFormat::ETC1A4; constexpr bool is_4bit = format == PixelFormat::I4 || format == PixelFormat::A4; @@ -127,10 +165,10 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, } else if constexpr (is_4bit) { DecodePixel4(x, y, tile_buffer.data(), linear_pixel.data()); } else { - DecodePixel(tiled_pixel.data(), linear_pixel.data()); + DecodePixel(tiled_pixel.data(), linear_pixel.data()); } } else { - EncodePixel(linear_pixel.data(), tiled_pixel.data()); + EncodePixel(linear_pixel.data(), tiled_pixel.data()); } } } @@ -138,6 +176,7 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, /** * @brief Performs morton to/from linear convertions on the provided pixel data + * @param converted If true performs RGBA8 to/from convertion to all color formats * @param width, height The dimentions of the rectangular region of pixels in linear_buffer * @param start_offset The number of bytes from the start of the first tile to the start of * tiled_buffer @@ -160,11 +199,11 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, * start_offset/end_offset are useful here as they tell us exactly where the data should be placed * in the linear_buffer. */ -template +template static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, std::span linear_buffer, std::span tiled_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; - constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format); + constexpr u32 aligned_bytes_per_pixel = converted ? 
4 : GetBytesPerPixel(format); constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, ""); @@ -202,7 +241,7 @@ static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, if (start_offset < aligned_start_offset && !morton_to_linear) { std::array tmp_buf; auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); - MortonCopyTile(width, tmp_buf, linear_data); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data(), tmp_buf.data() + start_offset - aligned_down_start_offset, std::min(aligned_start_offset, end_offset) - start_offset); @@ -215,7 +254,7 @@ static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, while (tiled_offset < buffer_end) { auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); auto tiled_data = tiled_buffer.subspan(tiled_offset, tile_size); - MortonCopyTile(width, tiled_data, linear_data); + MortonCopyTile(width, tiled_data, linear_data); tiled_offset += tile_size; LinearNextTile(); } @@ -225,7 +264,7 @@ static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, if (end_offset > std::max(aligned_start_offset, aligned_end_offset) && !morton_to_linear) { std::array tmp_buf; auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); - MortonCopyTile(width, tmp_buf, linear_data); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data() + tiled_offset, tmp_buf.data(), end_offset - aligned_end_offset); } @@ -254,6 +293,14 @@ static constexpr std::array UNSWIZZLE_TABLE = { MortonCopy // 17 }; +static constexpr std::array UNSWIZZLE_TABLE_CONVERTED = { + MortonCopy, // 0 + MortonCopy, // 1 + MortonCopy, // 2 + MortonCopy, // 3 + MortonCopy // 4 +}; + static constexpr std::array SWIZZLE_TABLE = { MortonCopy, // 0 MortonCopy, // 1 @@ -275,4 +322,12 @@ static constexpr std::array SWIZZLE_TABLE = { MortonCopy // 17 }; +static constexpr 
std::array SWIZZLE_TABLE_CONVERTED = { + MortonCopy, // 0 + MortonCopy, // 1 + MortonCopy, // 2 + MortonCopy, // 3 + MortonCopy // 4 +}; + } // namespace VideoCore diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h index 8c5a38c75..56155e7e4 100644 --- a/src/video_core/rasterizer_cache/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache/rasterizer_cache.h @@ -916,10 +916,8 @@ void RasterizerCache::UploadSurface(const Surface& surface, SurfaceInterval i const auto upload_data = source_ptr.GetWriteBytes(load_info.end - load_info.addr); if (surface->is_tiled) { - std::vector unswizzled_data(load_info.width * load_info.height * - GetBytesPerPixel(load_info.pixel_format)); - UnswizzleTexture(load_info, load_info.addr, load_info.end, upload_data, unswizzled_data); - runtime.FormatConvert(*surface, true, unswizzled_data, staging.mapped); + UnswizzleTexture(load_info, load_info.addr, load_info.end, upload_data, staging.mapped, + runtime.NeedsConvertion(surface->pixel_format)); } else { runtime.FormatConvert(*surface, true, upload_data, staging.mapped); } @@ -959,10 +957,8 @@ void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start); if (surface->is_tiled) { - std::vector temp_data(flush_info.width * flush_info.height * - GetBytesPerPixel(flush_info.pixel_format)); - runtime.FormatConvert(*surface, false, mapped, temp_data); - SwizzleTexture(flush_info, flush_start, flush_end, temp_data, download_dest); + SwizzleTexture(flush_info, flush_start, flush_end, mapped, download_dest, + runtime.NeedsConvertion(surface->pixel_format)); } else { runtime.FormatConvert(*surface, false, mapped, download_dest); } diff --git a/src/video_core/rasterizer_cache/utils.cpp b/src/video_core/rasterizer_cache/utils.cpp index 52363d21c..6655575d3 100644 --- a/src/video_core/rasterizer_cache/utils.cpp +++ 
b/src/video_core/rasterizer_cache/utils.cpp @@ -48,17 +48,20 @@ ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_d } void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, - std::span source_linear, std::span dest_tiled) { + std::span source_linear, std::span dest_tiled, + bool convert) { const u32 func_index = static_cast(swizzle_info.pixel_format); - const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index]; + const MortonFunc SwizzleImpl = (convert ? SWIZZLE_TABLE_CONVERTED : SWIZZLE_TABLE)[func_index]; SwizzleImpl(swizzle_info.width, swizzle_info.height, start_addr - swizzle_info.addr, end_addr - swizzle_info.addr, source_linear, dest_tiled); } void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, - std::span source_tiled, std::span dest_linear) { + std::span source_tiled, std::span dest_linear, + bool convert) { const u32 func_index = static_cast(unswizzle_info.pixel_format); - const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index]; + const MortonFunc UnswizzleImpl = + (convert ? UNSWIZZLE_TABLE_CONVERTED : UNSWIZZLE_TABLE)[func_index]; UnswizzleImpl(unswizzle_info.width, unswizzle_info.height, start_addr - unswizzle_info.addr, end_addr - unswizzle_info.addr, dest_linear, source_tiled); } diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h index 58bab1eee..91da22f84 100644 --- a/src/video_core/rasterizer_cache/utils.h +++ b/src/video_core/rasterizer_cache/utils.h @@ -116,7 +116,8 @@ struct TextureCubeConfig { * @param dest_linear The output buffer where the generated linear data will be written to. */ void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, - std::span source_tiled, std::span dest_linear); + std::span source_tiled, std::span dest_linear, + bool convert = false); /** * Swizzles a linear texture according to the morton code. 
@@ -128,7 +129,8 @@ void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAd * @param dest_linear The output buffer where the generated linear data will be written to. */ void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, - std::span source_linear, std::span dest_tiled); + std::span source_linear, std::span dest_tiled, + bool convert = false); } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp b/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp index 2577714e5..f6ed56d18 100644 --- a/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp +++ b/src/video_core/renderer_opengl/gl_format_reinterpreter.cpp @@ -11,7 +11,7 @@ namespace OpenGL { D24S8toRGBA8::D24S8toRGBA8(bool use_texture_view) : use_texture_view{use_texture_view} { constexpr std::string_view cs_source = R"( -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; layout(binding = 0) uniform highp sampler2D depth; layout(binding = 1) uniform lowp usampler2D stencil; layout(rgba8, binding = 2) uniform highp writeonly image2D color; @@ -77,7 +77,7 @@ void D24S8toRGBA8::Reinterpret(const Surface& source, VideoCore::Rect2D src_rect glTexParameteri(GL_TEXTURE_2D, GL_DEPTH_STENCIL_TEXTURE_MODE, GL_STENCIL_INDEX); glUniform2i(src_offset_loc, src_rect.left, src_rect.bottom); - glDispatchCompute(src_rect.GetWidth() / 32, src_rect.GetHeight() / 32, 1); + glDispatchCompute(src_rect.GetWidth() / 8, src_rect.GetHeight() / 8, 1); if (use_texture_view) { temp_tex.Release(); diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.cpp b/src/video_core/renderer_opengl/gl_texture_runtime.cpp index 716f162bb..5bd5136a9 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.cpp +++ b/src/video_core/renderer_opengl/gl_texture_runtime.cpp @@ -319,6 +319,11 @@ const ReinterpreterList& 
TextureRuntime::GetPossibleReinterpretations( return reinterpreters[static_cast(dest_format)]; } +bool TextureRuntime::NeedsConvertion(VideoCore::PixelFormat format) const { + return driver.IsOpenGLES() && + (format == VideoCore::PixelFormat::RGB8 || format == VideoCore::PixelFormat::RGBA8); +} + void TextureRuntime::BindFramebuffer(GLenum target, GLint level, GLenum textarget, VideoCore::SurfaceType type, OGLTexture& texture) const { const GLint framebuffer = target == GL_DRAW_FRAMEBUFFER ? draw_fbo.handle : read_fbo.handle; diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.h b/src/video_core/renderer_opengl/gl_texture_runtime.h index 01d7085b7..7add29201 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.h +++ b/src/video_core/renderer_opengl/gl_texture_runtime.h @@ -97,6 +97,9 @@ public: [[nodiscard]] const ReinterpreterList& GetPossibleReinterpretations( VideoCore::PixelFormat dest_format) const; + /// Returns true if the provided pixel format needs convertion + [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const; + private: /// Returns the framebuffer used for texture downloads void BindFramebuffer(GLenum target, GLint level, GLenum textarget, VideoCore::SurfaceType type, diff --git a/src/video_core/renderer_vulkan/vk_format_reinterpreter.cpp b/src/video_core/renderer_vulkan/vk_format_reinterpreter.cpp index 5119ac2d1..dc4eae00c 100644 --- a/src/video_core/renderer_vulkan/vk_format_reinterpreter.cpp +++ b/src/video_core/renderer_vulkan/vk_format_reinterpreter.cpp @@ -14,7 +14,7 @@ D24S8toRGBA8::D24S8toRGBA8(const Instance& instance, TaskScheduler& scheduler, constexpr std::string_view cs_source = R"( #version 450 core #extension GL_EXT_samplerless_texture_functions : require -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; layout(set = 0, binding = 0) uniform highp texture2D depth; layout(set = 0, binding = 1) uniform lowp 
utexture2D stencil; layout(set = 0, binding = 2, rgba8) uniform highp writeonly image2D color; @@ -154,7 +154,7 @@ void D24S8toRGBA8::Reinterpret(Surface& source, VideoCore::Rect2D src_rect, Surf command_buffer.pushConstants(compute_pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(Common::Vec2i), src_offset.AsArray()); - command_buffer.dispatch(src_rect.GetWidth() / 32, src_rect.GetHeight() / 32, 1); + command_buffer.dispatch(src_rect.GetWidth() / 8, src_rect.GetHeight() / 8, 1); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 3e396e473..e31f7c3bf 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -16,6 +16,8 @@ #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include "video_core/video_core.h" +#include + namespace Vulkan { MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(255, 128, 0)); @@ -93,7 +95,7 @@ constexpr VertexLayout RasterizerVulkan::HardwareVertex::GetVertexLayout() { return layout; } -constexpr u32 VERTEX_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr u32 VERTEX_BUFFER_SIZE = 256 * 1024 * 1024; constexpr u32 INDEX_BUFFER_SIZE = 8 * 1024 * 1024; constexpr u32 UNIFORM_BUFFER_SIZE = 16 * 1024 * 1024; constexpr u32 TEXTURE_BUFFER_SIZE = 16 * 1024 * 1024; @@ -121,10 +123,7 @@ RasterizerVulkan::RasterizerVulkan(Frontend::EmuWindow& emu_window, const Instan vk::BufferUsageFlagBits::eUniformTexelBuffer, TEXTURE_BUFFER_LF_FORMATS} { // Create a 1x1 clear texture to use in the NULL case, - default_texture = - runtime.Allocate(1, 1, VideoCore::PixelFormat::RGBA8, VideoCore::TextureType::Texture2D); - runtime.Transition(scheduler.GetUploadCommandBuffer(), default_texture, - vk::ImageLayout::eShaderReadOnlyOptimal, 0, 1); + CreateDefaultTextures(); uniform_block_data.lighting_lut_dirty.fill(true); @@ -162,7 +161,7 @@ 
RasterizerVulkan::RasterizerVulkan(Frontend::EmuWindow& emu_window, const Instan } for (u32 i = 0; i < 7; i++) { - pipeline_cache.BindStorageImage(i, default_texture.image_view); + pipeline_cache.BindStorageImage(i, default_storage_texture.image_view); } // Explicitly call the derived version to avoid warnings about calling virtual @@ -174,6 +173,7 @@ RasterizerVulkan::~RasterizerVulkan() { renderpass_cache.ExitRenderpass(); scheduler.Submit(SubmitMode::Flush | SubmitMode::Shutdown); + VmaAllocator allocator = instance.GetAllocator(); vk::Device device = instance.GetDevice(); for (auto& [key, sampler] : samplers) { @@ -184,10 +184,10 @@ RasterizerVulkan::~RasterizerVulkan() { device.destroyFramebuffer(framebuffer); } - const VideoCore::HostTextureTag tag = { - .format = VideoCore::PixelFormat::RGBA8, .width = 1, .height = 1}; - - runtime.Recycle(tag, std::move(default_texture)); + vmaDestroyImage(allocator, default_texture.image, default_texture.allocation); + vmaDestroyImage(allocator, default_storage_texture.image, default_storage_texture.allocation); + device.destroyImageView(default_texture.image_view); + device.destroyImageView(default_storage_texture.image_view); device.destroySampler(default_sampler); } @@ -374,7 +374,6 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi enable_attributes[input_reg] = true; offset += vertex_attributes.GetStride(attribute_index); } - } else { // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings // respectively @@ -675,7 +674,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { if (surface != nullptr) { pipeline_cache.BindStorageImage(binding, surface->alloc.image_view); } else { - pipeline_cache.BindStorageImage(binding, default_texture.image_view); + pipeline_cache.BindStorageImage(binding, default_storage_texture.image_view); } }; @@ -687,16 +686,15 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { // filters GL_LINEAR/GL_NEAREST so emulate 
them by setting minLod = 0, and maxLod = 0.25, // and using minFilter = VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST const bool skip_mipmap = config.type == Pica::TexturingRegs::TextureConfig::TextureCube; - info = - SamplerInfo{.mag_filter = config.mag_filter, - .min_filter = config.min_filter, - .mip_filter = config.mip_filter, - .wrap_s = config.wrap_s, - .wrap_t = config.wrap_t, - .border_color = config.border_color.raw, - .lod_min = skip_mipmap ? 0.f : static_cast(config.lod.min_level), - .lod_max = skip_mipmap ? 0.25f : static_cast(config.lod.max_level), - .lod_bias = static_cast(config.lod.bias)}; + info = SamplerInfo{.mag_filter = config.mag_filter, + .min_filter = config.min_filter, + .mip_filter = config.mip_filter, + .wrap_s = config.wrap_s, + .wrap_t = config.wrap_t, + .border_color = config.border_color.raw, + .lod_min = skip_mipmap ? 0.f : static_cast(config.lod.min_level), + .lod_max = skip_mipmap ? 0.f : static_cast(config.lod.max_level), + .lod_bias = static_cast(config.lod.bias)}; // Search the cache and bind the appropriate sampler if (auto it = samplers.find(info); it != samplers.end()) { @@ -708,8 +706,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } }; - renderpass_cache.ExitRenderpass(); - // Sync and bind the texture surfaces const auto pica_textures = regs.texturing.GetTextures(); for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { @@ -722,9 +718,9 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { case TextureType::Shadow2D: { auto surface = res_cache.GetTextureSurface(texture); if (surface != nullptr) { - pipeline_cache.BindStorageImage(0, surface->alloc.image_view); + pipeline_cache.BindStorageImage(0, surface->GetImageView()); } else { - pipeline_cache.BindStorageImage(0, default_texture.image_view); + pipeline_cache.BindStorageImage(0, default_storage_texture.image_view); } continue; } @@ -874,13 +870,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool 
is_indexed) { pipeline_cache.UseTrivialGeometryShader(); pipeline_cache.BindPipeline(pipeline_info); - // Bind the vertex buffer at the current mapped offset. This effectively means - // that when base_vertex is zero the GPU will start drawing from the current mapped - // offset not the start of the buffer. - vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); - command_buffer.bindVertexBuffers(0, vertex_buffer.GetHandle(), - vertex_buffer.GetBufferOffset()); - const u32 max_vertices = VERTEX_BUFFER_SIZE / sizeof(HardwareVertex); const u32 batch_size = static_cast(vertex_batch.size()); for (u32 base_vertex = 0; base_vertex < batch_size; base_vertex += max_vertices) { @@ -892,6 +881,12 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { std::memcpy(array_ptr, vertex_batch.data() + base_vertex, vertex_size); vertex_buffer.Commit(vertex_size); + // Bind the vertex buffer at the current mapped offset. This effectively means + // that when base_vertex is zero the GPU will start drawing from the current mapped + // offset not the start of the buffer. 
+ vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); + command_buffer.bindVertexBuffers(0, vertex_buffer.GetHandle(), offset); + command_buffer.draw(vertices, 1, base_vertex, 0); } } @@ -1642,6 +1637,66 @@ vk::Framebuffer RasterizerVulkan::CreateFramebuffer(const FramebufferInfo& info) return device.createFramebuffer(framebuffer_info); } +void RasterizerVulkan::CreateDefaultTextures() { + default_texture.format = vk::Format::eR8G8B8A8Unorm; + default_storage_texture.format = vk::Format::eR8G8B8A8Uint; + + vk::ImageCreateInfo image_info = {.imageType = vk::ImageType::e2D, + .format = default_texture.format, + .extent = {1, 1, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .samples = vk::SampleCountFlagBits::e1, + .usage = GetImageUsage(vk::ImageAspectFlagBits::eColor)}; + + const VmaAllocationCreateInfo alloc_info = {.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE}; + + VkImage unsafe_image{}; + VkImageCreateInfo& unsafe_image_info = static_cast(image_info); + + VkResult result = vmaCreateImage(instance.GetAllocator(), &unsafe_image_info, &alloc_info, + &unsafe_image, &default_texture.allocation, nullptr); + if (result != VK_SUCCESS) { + LOG_CRITICAL(Render_Vulkan, "Failed allocating default texture with error {}", result); + UNREACHABLE(); + } + + default_texture.image = vk::Image{unsafe_image}; + vk::ImageViewCreateInfo view_info = { + .image = default_texture.image, + .viewType = vk::ImageViewType::e2D, + .format = default_texture.format, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1}}; + + vk::Device device = instance.GetDevice(); + default_texture.image_view = device.createImageView(view_info); + + // Define the default texture for storage descriptors + image_info.format = default_storage_texture.format; + result = vmaCreateImage(instance.GetAllocator(), &unsafe_image_info, &alloc_info, &unsafe_image, + &default_storage_texture.allocation, 
nullptr); + if (result != VK_SUCCESS) { + LOG_CRITICAL(Render_Vulkan, "Failed allocating default storage texture with error {}", + result); + UNREACHABLE(); + } + + default_storage_texture.image = vk::Image{unsafe_image}; + + view_info.format = default_storage_texture.format; + view_info.image = default_storage_texture.image; + default_storage_texture.image_view = device.createImageView(view_info); + + runtime.Transition(scheduler.GetUploadCommandBuffer(), default_texture, + vk::ImageLayout::eShaderReadOnlyOptimal, 0, 1); + runtime.Transition(scheduler.GetUploadCommandBuffer(), default_storage_texture, + vk::ImageLayout::eGeneral, 0, 1); +} + void RasterizerVulkan::FlushBuffers() { vertex_buffer.Flush(); uniform_buffer.Flush(); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 14ed6c95b..e71254774 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -222,6 +222,9 @@ private: /// Internal implementation for AccelerateDrawBatch bool AccelerateDrawBatchInternal(bool is_indexed); + /// Copies vertex data performing needed convertions and casts + void PaddedVertexCopy(u32 stride, u32 vertex_num, u8* data); + struct VertexArrayInfo { u32 vs_input_index_min; u32 vs_input_index_max; @@ -246,6 +249,8 @@ private: /// Creates a new Vulkan framebuffer object vk::Framebuffer CreateFramebuffer(const FramebufferInfo& info); + void CreateDefaultTextures(); + private: const Instance& instance; TaskScheduler& scheduler; @@ -274,8 +279,9 @@ private: std::vector vertex_batch; std::array binding_offsets{}; - ImageAlloc default_texture; vk::Sampler default_sampler; + ImageAlloc default_texture; + ImageAlloc default_storage_texture; struct { Pica::Shader::UniformData data{}; diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.cpp b/src/video_core/renderer_vulkan/vk_shader_gen.cpp index 79c9ced63..77c1bb349 100644 --- 
a/src/video_core/renderer_vulkan/vk_shader_gen.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_gen.cpp @@ -1642,7 +1642,16 @@ layout (set = 0, binding = 0, std140) uniform vs_config { UNREACHABLE(); } - out += fmt::format("layout(location = {0}) in {1}vec4 vs_in_reg{0};\n", i, prefix); + out += + fmt::format("layout(location = {0}) in {1}vec4 vs_in_typed_reg{0};\n", i, prefix); + } + } + out += '\n'; + + // cast input registers to float to avoid computational errors + for (std::size_t i = 0; i < used_regs.size(); ++i) { + if (used_regs[i]) { + out += fmt::format("vec4 vs_in_reg{0} = vec4(vs_in_typed_reg{0});\n", i); } } out += '\n'; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index a766023e2..0c3dc3a8b 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -33,6 +33,15 @@ void Swapchain::Create(u32 width, u32 height) { is_outdated = false; is_suboptimal = false; + // Destroy the previous image views + vk::Device device = instance.GetDevice(); + for (auto& image : swapchain_images) { + if (image.image) { + device.destroyImageView(image.image_view); + device.destroyFramebuffer(image.framebuffer); + } + } + // Fetch information about the provided surface Configure(width, height); @@ -61,7 +70,6 @@ void Swapchain::Create(u32 width, u32 height) { .clipped = true, .oldSwapchain = swapchain}; - vk::Device device = instance.GetDevice(); vk::SwapchainKHR new_swapchain = device.createSwapchainKHR(swapchain_info); // If an old swapchain exists, destroy it and move the new one to its place. 
@@ -71,12 +79,6 @@ void Swapchain::Create(u32 width, u32 height) { auto images = device.getSwapchainImagesKHR(swapchain); - // Destroy the previous images - for (auto& image : swapchain_images) { - device.destroyImageView(image.image_view); - device.destroyFramebuffer(image.framebuffer); - } - swapchain_images.clear(); swapchain_images.resize(images.size()); @@ -113,6 +115,10 @@ void Swapchain::Create(u32 width, u32 height) { constexpr u64 ACQUIRE_TIMEOUT = 1000000000; void Swapchain::AcquireNextImage(vk::Semaphore signal_acquired) { + if (NeedsRecreation()) [[unlikely]] { + return; + } + vk::Device device = instance.GetDevice(); vk::Result result = device.acquireNextImageKHR(swapchain, ACQUIRE_TIMEOUT, signal_acquired, VK_NULL_HANDLE, ¤t_image); @@ -132,6 +138,10 @@ void Swapchain::AcquireNextImage(vk::Semaphore signal_acquired) { } void Swapchain::Present(vk::Semaphore wait_for_present) { + if (NeedsRecreation()) [[unlikely]] { + return; + } + const vk::PresentInfoKHR present_info = {.waitSemaphoreCount = 1, .pWaitSemaphores = &wait_for_present, .swapchainCount = 1, @@ -145,6 +155,7 @@ void Swapchain::Present(vk::Semaphore wait_for_present) { case vk::Result::eSuccess: break; case vk::Result::eSuboptimalKHR: + is_suboptimal = true; LOG_DEBUG(Render_Vulkan, "Suboptimal swapchain"); break; case vk::Result::eErrorOutOfDateKHR: @@ -154,8 +165,6 @@ void Swapchain::Present(vk::Semaphore wait_for_present) { LOG_CRITICAL(Render_Vulkan, "Swapchain presentation failed"); break; } - - current_frame = (current_frame + 1) % swapchain_images.size(); } void Swapchain::Configure(u32 width, u32 height) { diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index 972042b65..d7b30280d 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -82,7 +82,6 @@ private: // Swapchain state std::vector swapchain_images; u32 current_image = 0; - u32 current_frame = 0; bool 
is_outdated = true; bool is_suboptimal = true; }; diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index f4a2aea78..ebf846caa 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -210,50 +210,28 @@ void TextureRuntime::Recycle(const VideoCore::HostTextureTag tag, ImageAlloc&& a void TextureRuntime::FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest) { - if (!surface.NeedsConvert()) { + if (!NeedsConvertion(surface.pixel_format)) { std::memcpy(dest.data(), source.data(), source.size()); return; } - // Since this is the most common case handle it separately - if (surface.pixel_format == VideoCore::PixelFormat::RGBA8) { - return Pica::Texture::ConvertABGRToRGBA(source, dest); - } - - // Handle simple D24S8 interleave case - if (surface.GetInternalFormat() == vk::Format::eD24UnormS8Uint) { - if (!upload) { - return Pica::Texture::InterleaveD24S8(source, dest); - } else { - UNREACHABLE(); - } - } - if (upload) { switch (surface.pixel_format) { + case VideoCore::PixelFormat::RGBA8: + return Pica::Texture::ConvertABGRToRGBA(source, dest); case VideoCore::PixelFormat::RGB8: return Pica::Texture::ConvertBGRToRGBA(source, dest); - case VideoCore::PixelFormat::RGBA4: - return Pica::Texture::ConvertRGBA4ToRGBA8(source, dest); - case VideoCore::PixelFormat::RGB5A1: - return Pica::Texture::ConvertRGB5A1ToRGBA8(source, dest); default: break; } } else { switch (surface.pixel_format) { - case VideoCore::PixelFormat::D24S8: - return Pica::Texture::ConvertD32S8ToD24S8(source, dest); case VideoCore::PixelFormat::RGBA4: return Pica::Texture::ConvertRGBA8ToRGBA4(source, dest); - case VideoCore::PixelFormat::RGB8: - return Pica::Texture::ConvertRGBAToBGR(source, dest); - default: - break; } } - LOG_WARNING(Render_Vulkan, "Missing format convertion: {} {} {}", + LOG_WARNING(Render_Vulkan, "Missing linear 
format convertion: {} {} {}", vk::to_string(surface.traits.native), upload ? "->" : "<-", vk::to_string(surface.alloc.format)); } @@ -457,6 +435,14 @@ const ReinterpreterList& TextureRuntime::GetPossibleReinterpretations( return reinterpreters[static_cast(dest_format)]; } +bool TextureRuntime::NeedsConvertion(VideoCore::PixelFormat format) const { + const FormatTraits traits = instance.GetTraits(format); + const VideoCore::SurfaceType type = VideoCore::GetFormatType(format); + return type == VideoCore::SurfaceType::Color && + (format == VideoCore::PixelFormat::RGBA8 || !traits.blit_support || + !traits.attachment_support); +} + void TextureRuntime::Transition(vk::CommandBuffer command_buffer, ImageAlloc& alloc, vk::ImageLayout new_layout, u32 level, u32 level_count, u32 layer, u32 layer_count) { @@ -588,9 +574,12 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa if (is_scaled) { ScaledUpload(upload); } else { + u32 region_count = 0; + std::array copy_regions; + vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); const VideoCore::Rect2D rect = upload.texture_rect; - const vk::BufferImageCopy copy_region = { + vk::BufferImageCopy copy_region = { .bufferOffset = staging.buffer_offset, .bufferRowLength = rect.GetWidth(), .bufferImageHeight = rect.GetHeight(), @@ -601,11 +590,25 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa .imageOffset = {static_cast(rect.left), static_cast(rect.bottom), 0}, .imageExtent = {rect.GetWidth(), rect.GetHeight(), 1}}; + if (alloc.aspect & vk::ImageAspectFlagBits::eColor) { + copy_regions[region_count++] = copy_region; + } else if (alloc.aspect & vk::ImageAspectFlagBits::eDepth) { + copy_region.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eDepth; + copy_regions[region_count++] = copy_region; + + if (alloc.aspect & vk::ImageAspectFlagBits::eStencil) { + copy_region.bufferOffset += 4 * staging.size / 5; + copy_region.imageSubresource.aspectMask 
= vk::ImageAspectFlagBits::eStencil; + copy_regions[region_count++] = copy_region; + } + } + runtime.Transition(command_buffer, alloc, vk::ImageLayout::eTransferDstOptimal, 0, alloc.levels, 0, texture_type == VideoCore::TextureType::CubeMap ? 6 : 1); command_buffer.copyBufferToImage(staging.buffer, alloc.image, - vk::ImageLayout::eTransferDstOptimal, copy_region); + vk::ImageLayout::eTransferDstOptimal, region_count, + copy_regions.data()); } InvalidateAllWatcher(); @@ -667,13 +670,6 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi runtime.staging_offsets[current_slot] += staging.size; } -bool Surface::NeedsConvert() const { - // RGBA8 needs a byteswap since R8G8B8A8UnormPack32 does not exist - // D24S8 always needs an interleave pass even if natively supported - return alloc.format != traits.native || pixel_format == VideoCore::PixelFormat::RGBA8 || - pixel_format == VideoCore::PixelFormat::D24S8; -} - u32 Surface::GetInternalBytesPerPixel() const { return vk::blockSize(alloc.format); } diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.h b/src/video_core/renderer_vulkan/vk_texture_runtime.h index 64f7ba776..d8f992d05 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.h +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.h @@ -33,7 +33,7 @@ struct ImageAlloc { vk::ImageUsageFlags usage; vk::Format format; vk::ImageLayout layout = vk::ImageLayout::eUndefined; - vk::ImageAspectFlags aspect = vk::ImageAspectFlagBits::eNone; + vk::ImageAspectFlags aspect = vk::ImageAspectFlagBits::eColor; u32 levels = 1; u32 layers = 1; }; @@ -92,6 +92,9 @@ public: [[nodiscard]] const ReinterpreterList& GetPossibleReinterpretations( VideoCore::PixelFormat dest_format) const; + /// Returns true if the provided pixel format needs convertion + [[nodiscard]] bool NeedsConvertion(VideoCore::PixelFormat format) const; + /// Performs operations that need to be done on every scheduler slot switch void OnSlotSwitch(u32 new_slot); 
@@ -131,9 +134,6 @@ public: /// Downloads pixel data to staging from a rectangle region of the surface texture void Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging); - /// Returns true if the surface requires pixel data convertion - bool NeedsConvert() const; - /// Returns the bpp of the internal surface format u32 GetInternalBytesPerPixel() const; diff --git a/src/video_core/texture/texture_decode.cpp b/src/video_core/texture/texture_decode.cpp index 82c0faf34..cbc177ae7 100644 --- a/src/video_core/texture/texture_decode.cpp +++ b/src/video_core/texture/texture_decode.cpp @@ -289,6 +289,16 @@ void ConvertRGB5A1ToRGBA8(std::span source, std::span source, std::span dest) { + u32 j = 0; + for (std::size_t i = 0; i < dest.size(); i += 2) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source.data() + j, sizeof(rgba)); + Color::EncodeRGB5A1(rgba, reinterpret_cast(dest.data() + i)); + j += 4; + } +} + void ConvertD32S8ToD24S8(std::span source, std::span dest) { std::size_t depth_offset = 0; std::size_t stencil_offset = 4 * source.size() / 5; @@ -316,4 +326,15 @@ void InterleaveD24S8(std::span source, std::span des } } +void DeinterleaveD24S8(std::span source, std::span dest) { + std::size_t depth_offset = 0; + std::size_t stencil_offset = 3 * source.size() / 4; + for (std::size_t i = 0; i < dest.size(); i += 4) { + dest[stencil_offset] = source[i]; + std::memcpy(dest.data() + depth_offset, source.data() + i + 1, 3); + depth_offset += 3; + stencil_offset += 1; + } +} + } // namespace Pica::Texture diff --git a/src/video_core/texture/texture_decode.h b/src/video_core/texture/texture_decode.h index a7158e351..bc27aaaf2 100644 --- a/src/video_core/texture/texture_decode.h +++ b/src/video_core/texture/texture_decode.h @@ -87,8 +87,12 @@ void ConvertRGBA8ToRGBA4(std::span source, std::span void ConvertRGB5A1ToRGBA8(std::span source, std::span dest); +void ConvertRGBA8ToRGB5A1(std::span source, std::span dest); + void 
ConvertD32S8ToD24S8(std::span source, std::span dest); void InterleaveD24S8(std::span source, std::span dest); +void DeinterleaveD24S8(std::span source, std::span dest); + } // namespace Pica::Texture