From 891b4bff1825bd5b461871eabd4fe9d09dc3f5af Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 1 Oct 2022 18:16:57 +0300 Subject: [PATCH] renderer_vulkan: Rework format handling * This is a pretty large commit that aims to solve some issues with the current format system * The instance now builds at application initialization an array of format traits for each pixel format that includes information such as blit/attachment/storage support and fallback formats * The runtime doesn't ask the instance for formats but receives these traits and can decide on its own what to build. For now we do the same as before, we require both blit and attachment support * Morton swizzling also sees many bug fixes. The previous code was very hacky and didn't work for partial texture updates. It was also inconsistent, as it would take a tiled_buffer and write to the middle of linear * Now the functions have been greatly simplified and adjusted to work better with std::span. This fixes out of bounds errors and texture glitches (like the display in Mario Kart 7) --- src/core/hle/service/cfg/cfg.h | 1 + .../rasterizer_cache/morton_swizzle.h | 79 +++++---- .../rasterizer_cache/rasterizer_cache.h | 91 ++++++---- src/video_core/rasterizer_cache/types.h | 2 + src/video_core/rasterizer_cache/utils.cpp | 16 +- src/video_core/rasterizer_cache/utils.h | 24 ++- .../renderer_opengl/gl_texture_runtime.cpp | 3 +- .../renderer_opengl/gl_texture_runtime.h | 9 +- .../renderer_vulkan/renderer_vulkan.cpp | 19 +- .../renderer_vulkan/renderer_vulkan.h | 2 + src/video_core/renderer_vulkan/vk_common.h | 2 +- .../renderer_vulkan/vk_instance.cpp | 153 ++++++++++++---- src/video_core/renderer_vulkan/vk_instance.h | 29 ++- .../renderer_vulkan/vk_pipeline_cache.cpp | 8 - .../renderer_vulkan/vk_pipeline_cache.h | 1 - .../renderer_vulkan/vk_rasterizer.cpp | 43 +++-- .../renderer_vulkan/vk_renderpass_cache.cpp | 43 ++--- .../renderer_vulkan/vk_task_scheduler.cpp | 15 +- .../renderer_vulkan/vk_task_scheduler.h | 4 +- 
.../renderer_vulkan/vk_texture_runtime.cpp | 167 ++++++++++-------- .../renderer_vulkan/vk_texture_runtime.h | 34 +++- src/video_core/texture/texture_decode.cpp | 60 ++++++- src/video_core/texture/texture_decode.h | 8 + 23 files changed, 532 insertions(+), 281 deletions(-) diff --git a/src/core/hle/service/cfg/cfg.h b/src/core/hle/service/cfg/cfg.h index e887a9645..0310df3aa 100644 --- a/src/core/hle/service/cfg/cfg.h +++ b/src/core/hle/service/cfg/cfg.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include "common/common_types.h" diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h index 108c95924..9fd372176 100644 --- a/src/video_core/rasterizer_cache/morton_swizzle.h +++ b/src/video_core/rasterizer_cache/morton_swizzle.h @@ -135,42 +135,53 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, std::sp } } +/** + * @brief Performs morton to/from linear conversions on the provided pixel data + * @param width, height The dimensions of the rectangular region of pixels in linear_buffer + * @param start_offset The number of bytes from the start of the first tile to the start of tiled_buffer + * @param end_offset The number of bytes from the start of the first tile to the end of tiled_buffer + * @param linear_buffer The linear pixel data + * @param tiled_buffer The tiled pixel data + * + * The MortonCopy is at the heart of the PICA texture implementation, as it's responsible for converting between + * linear and morton tiled layouts. The function handles both conversions but there are slightly different + * paths and inputs for each: + * + * Morton to Linear: + * During uploads, tiled_buffer is always aligned to the tile or scanline boundary depending on whether the linear rectangle + * spans multiple vertical tiles. linear_buffer does not reference the entire texture area, but rather the + * specific rectangle affected by the upload. 
+ * + * Linear to Morton: + * This is similar to the other conversion but with some differences. In this case tiled_buffer is not required + * to be aligned to any specific boundary, which requires special care. start_offset/end_offset are useful + * here as they tell us exactly where the data should be placed in the linear_buffer. + */ template -static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, - std::span linear_buffer, - std::span tiled_buffer) { - +static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, + std::span linear_buffer, std::span tiled_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format); + constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, ""); - // We could use bytes_per_pixel here but it should be avoided because it - // becomes zero for 4-bit textures! - constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; - const u32 linear_tile_size = (7 * stride + 8) * aligned_bytes_per_pixel; - - // Does this line have any significance? 
- //u32 linear_offset = aligned_bytes_per_pixel - bytes_per_pixel; - u32 linear_offset = 0; - u32 tiled_offset = 0; - - const PAddr aligned_down_start_offset = Common::AlignDown(start_offset, tile_size); - const PAddr aligned_start_offset = Common::AlignUp(start_offset, tile_size); - PAddr aligned_end_offset = Common::AlignDown(end_offset, tile_size); + const u32 linear_tile_stride = (7 * width + 8) * aligned_bytes_per_pixel; + const u32 aligned_down_start_offset = Common::AlignDown(start_offset, tile_size); + const u32 aligned_start_offset = Common::AlignUp(start_offset, tile_size); + const u32 aligned_end_offset = Common::AlignDown(end_offset, tile_size); ASSERT(!morton_to_linear || (aligned_start_offset == start_offset && aligned_end_offset == end_offset)); - const u32 begin_pixel_index = aligned_down_start_offset * 8 / GetFormatBpp(format); - u32 x = (begin_pixel_index % (stride * 8)) / 8; - u32 y = (begin_pixel_index / (stride * 8)) * 8; - // In OpenGL the texture origin is in the bottom left corner as opposed to other // APIs that have it at the top left. 
To avoid flipping texture coordinates in - // the shader we read/write the linear buffer backwards - linear_offset += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel; + // the shader we read/write the linear buffer from the bottom up + u32 linear_offset = ((height - 8) * width) * aligned_bytes_per_pixel; + u32 tiled_offset = 0; + u32 x = 0; + u32 y = 0; - auto linear_next_tile = [&] { - x = (x + 8) % stride; + const auto LinearNextTile = [&] { + x = (x + 8) % width; linear_offset += 8 * aligned_bytes_per_pixel; if (!x) { y = (y + 8) % height; @@ -178,7 +189,7 @@ static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, return; } - linear_offset -= stride * 9 * aligned_bytes_per_pixel; + linear_offset -= width * 9 * aligned_bytes_per_pixel; } }; @@ -186,31 +197,31 @@ static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, // the tile affected to a temporary buffer and copy the part we are interested in if (start_offset < aligned_start_offset && !morton_to_linear) { std::array tmp_buf; - auto linear_data = linear_buffer.last(linear_buffer.size_bytes() - linear_offset); - MortonCopyTile(stride, tmp_buf, linear_data); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data(), tmp_buf.data() + start_offset - aligned_down_start_offset, std::min(aligned_start_offset, end_offset) - start_offset); tiled_offset += aligned_start_offset - start_offset; - linear_next_tile(); + LinearNextTile(); } const u32 buffer_end = tiled_offset + aligned_end_offset - aligned_start_offset; while (tiled_offset < buffer_end) { - auto linear_data = linear_buffer.last(linear_buffer.size_bytes() - linear_offset); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); auto tiled_data = tiled_buffer.subspan(tiled_offset, tile_size); - MortonCopyTile(stride, tiled_data, linear_data); + MortonCopyTile(width, tiled_data, 
linear_data); tiled_offset += tile_size; - linear_next_tile(); + LinearNextTile(); } // If during a texture download the end coordinate is not tile aligned, swizzle // the tile affected to a temporary buffer and copy the part we are interested in if (end_offset > std::max(aligned_start_offset, aligned_end_offset) && !morton_to_linear) { std::array tmp_buf; - auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_size); - MortonCopyTile(stride, tmp_buf, linear_data); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data() + tiled_offset, tmp_buf.data(), end_offset - aligned_end_offset); } } diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h index 7509a0f6e..1a20892fc 100644 --- a/src/video_core/rasterizer_cache/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache/rasterizer_cache.h @@ -163,7 +163,8 @@ private: SurfaceMap dirty_regions; SurfaceSet remove_surfaces; u16 resolution_scale_factor; - + std::vector> download_queue; + std::vector staging_buffer; std::unordered_map texture_cube_cache; std::recursive_mutex mutex; }; @@ -269,9 +270,26 @@ bool RasterizerCache::BlitSurfaces(const Surface& src_surface, Common::Rectan const Surface& dst_surface, Common::Rectangle dst_rect) { MICROPROFILE_SCOPE(RasterizerCache_BlitSurface); - if (CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format)) { - dst_surface->InvalidateAllWatcher(); + if (!CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format)) [[unlikely]] { + return false; + } + dst_surface->InvalidateAllWatcher(); + + // Prefer texture copy over blit if possible + if (src_rect.GetWidth() == dst_rect.GetWidth() && src_rect.bottom < src_rect.top) { + const TextureCopy texture_copy = { + .src_level = 0, + .dst_level = 0, + .src_layer = 0, + .dst_layer = 0, + .src_offset = {src_rect.left, 
src_rect.bottom}, + .dst_offset = {dst_rect.left, dst_rect.bottom}, + .extent = {src_rect.GetWidth(), src_rect.GetHeight()} + }; + + return runtime.CopyTextures(*src_surface, *dst_surface, texture_copy); + } else { const TextureBlit texture_blit = { .src_level = 0, .dst_level = 0, @@ -283,8 +301,6 @@ bool RasterizerCache::BlitSurfaces(const Surface& src_surface, Common::Rectan return runtime.BlitTextures(*src_surface, *dst_surface, texture_blit); } - - return false; } MICROPROFILE_DECLARE(RasterizerCache_CopySurface); @@ -888,35 +904,31 @@ void RasterizerCache::ValidateSurface(const Surface& surface, PAddr addr, u32 MICROPROFILE_DECLARE(RasterizerCache_SurfaceLoad); template void RasterizerCache::UploadSurface(const Surface& surface, SurfaceInterval interval) { - const SurfaceParams info = surface->FromInterval(interval); - const u32 load_start = info.addr; - const u32 load_end = info.end; - ASSERT(load_start >= surface->addr && load_end <= surface->end); + const SurfaceParams load_info = surface->FromInterval(interval); + ASSERT(load_info.addr >= surface->addr && load_info.end <= surface->end); + + MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); const auto& staging = runtime.FindStaging( - surface->width * surface->height * 4, true); - MemoryRef source_ptr = VideoCore::g_memory->GetPhysicalRef(info.addr); + load_info.width * load_info.height * surface->GetInternalBytesPerPixel(), true); + MemoryRef source_ptr = VideoCore::g_memory->GetPhysicalRef(load_info.addr); if (!source_ptr) [[unlikely]] { return; } - const auto upload_data = source_ptr.GetWriteBytes(load_end - load_start); - - MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); - + const auto upload_data = source_ptr.GetWriteBytes(load_info.end - load_info.addr); if (surface->is_tiled) { - std::vector unswizzled_data(staging.size); - UnswizzleTexture(*surface, load_start - surface->addr, load_end - surface->addr, - upload_data, unswizzled_data); - runtime.FormatConvert(surface->pixel_format, true, 
unswizzled_data, staging.mapped); + std::vector unswizzled_data(load_info.width * load_info.height * GetBytesPerPixel(load_info.pixel_format)); + UnswizzleTexture(load_info, load_info.addr, load_info.end, upload_data, unswizzled_data); + runtime.FormatConvert(*surface, true, unswizzled_data, staging.mapped); } else { - runtime.FormatConvert(surface->pixel_format, true, upload_data, staging.mapped); + runtime.FormatConvert(*surface, true, upload_data, staging.mapped); } const BufferTextureCopy upload = { .buffer_offset = 0, .buffer_size = staging.size, - .texture_rect = surface->GetSubRect(info), + .texture_rect = surface->GetSubRect(load_info), .texture_level = 0 }; @@ -926,17 +938,17 @@ void RasterizerCache::UploadSurface(const Surface& surface, SurfaceInterval i MICROPROFILE_DECLARE(RasterizerCache_SurfaceFlush); template void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval interval) { + const SurfaceParams flush_info = surface->FromInterval(interval); const u32 flush_start = boost::icl::first(interval); const u32 flush_end = boost::icl::last_next(interval); ASSERT(flush_start >= surface->addr && flush_end <= surface->end); const auto& staging = runtime.FindStaging( - surface->width * surface->height * 4, false); - const SurfaceParams params = surface->FromInterval(interval); + flush_info.width * flush_info.height * surface->GetInternalBytesPerPixel(), false); const BufferTextureCopy download = { .buffer_offset = 0, .buffer_size = staging.size, - .texture_rect = surface->GetSubRect(params), + .texture_rect = surface->GetSubRect(flush_info), .texture_level = 0 }; @@ -948,17 +960,15 @@ void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval } const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start); - - MICROPROFILE_SCOPE(RasterizerCache_SurfaceFlush); - - if (surface->is_tiled) { - std::vector swizzled_data(staging.size); - runtime.FormatConvert(surface->pixel_format, false, swizzled_data, 
swizzled_data); - SwizzleTexture(*surface, flush_start - surface->addr, flush_end - surface->addr, - staging.mapped, download_dest); - } else { - runtime.FormatConvert(surface->pixel_format, false, staging.mapped, download_dest); - } + download_queue.push_back([this, surface, download_dest, flush_start, flush_end, flush_info, mapped = staging.mapped]() { + if (surface->is_tiled) { + std::vector temp_data(flush_info.width * flush_info.height * GetBytesPerPixel(flush_info.pixel_format)); + runtime.FormatConvert(*surface, false, mapped, temp_data); + SwizzleTexture(flush_info, flush_start, flush_end, temp_data, download_dest); + } else { + runtime.FormatConvert(*surface, false, mapped, download_dest); + } + }); } template @@ -1135,6 +1145,17 @@ void RasterizerCache::FlushRegion(PAddr addr, u32 size, Surface flush_surface flushed_intervals += interval; } + // Batch execute all requested downloads. This gives more time for them to complete + // before we issue the CPU to GPU flush and reduces scheduler slot switches in Vulkan + if (!download_queue.empty()) { + runtime.Finish(); + for (auto& download_func : download_queue) { + download_func(); + } + + download_queue.clear(); + } + // Reset dirty regions dirty_regions -= flushed_intervals; } diff --git a/src/video_core/rasterizer_cache/types.h b/src/video_core/rasterizer_cache/types.h index 6b324d930..669ce2ddf 100644 --- a/src/video_core/rasterizer_cache/types.h +++ b/src/video_core/rasterizer_cache/types.h @@ -41,6 +41,8 @@ struct TextureClear { struct TextureCopy { u32 src_level; u32 dst_level; + u32 src_layer; + u32 dst_layer; Offset src_offset; Offset dst_offset; Extent extent; diff --git a/src/video_core/rasterizer_cache/utils.cpp b/src/video_core/rasterizer_cache/utils.cpp index 586565b8d..28c8965b1 100644 --- a/src/video_core/rasterizer_cache/utils.cpp +++ b/src/video_core/rasterizer_cache/utils.cpp @@ -11,18 +11,22 @@ namespace VideoCore { -void SwizzleTexture(const SurfaceParams& params, u32 start_offset, u32 
end_offset, +void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, std::span source_linear, std::span dest_tiled) { - const u32 func_index = static_cast(params.pixel_format); + const u32 func_index = static_cast(swizzle_info.pixel_format); const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index]; - SwizzleImpl(params.stride, params.height, start_offset, end_offset, source_linear, dest_tiled); + SwizzleImpl(swizzle_info.width, swizzle_info.height, + start_addr - swizzle_info.addr, end_addr - swizzle_info.addr, + source_linear, dest_tiled); } -void UnswizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, +void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, std::span source_tiled, std::span dest_linear) { - const u32 func_index = static_cast(params.pixel_format); + const u32 func_index = static_cast(unswizzle_info.pixel_format); const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index]; - UnswizzleImpl(params.stride, params.height, start_offset, end_offset, dest_linear, source_tiled); + UnswizzleImpl(unswizzle_info.width, unswizzle_info.height, + start_addr - unswizzle_info.addr, end_addr - unswizzle_info.addr, + dest_linear, source_tiled); } ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data) { diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h index 1de5faca1..64ee0d62f 100644 --- a/src/video_core/rasterizer_cache/utils.h +++ b/src/video_core/rasterizer_cache/utils.h @@ -44,19 +44,29 @@ class SurfaceParams; [[nodiscard]] ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data); -void SwizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, - std::span source_linear, std::span dest_tiled); - /** * Converts a morton swizzled texture to linear format. * - * @param params Structure used to query the surface information. 
- * @param start_offset Is the offset at which the source_tiled span begins + * @param unswizzle_info Structure used to query the surface information. + * @param start_addr The start address of the source_tiled data. + * @param end_addr The end address of the source_tiled data. + * @param source_tiled The tiled data to convert. + * @param dest_linear The output buffer where the generated linear data will be written to. + */ +void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, + std::span source_tiled, std::span dest_linear); + +/** + * Swizzles a linear texture according to the morton code. + * + * @param swizzle_info Structure used to query the surface information. + * @param start_addr The start address of the dest_tiled data. + * @param end_addr The end address of the dest_tiled data. * @param source_tiled The source morton swizzled data. * @param dest_linear The output buffer where the generated linear data will be written to. */ -void UnswizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, - std::span source_tiled, std::span dest_linear); +void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, + std::span source_linear, std::span dest_tiled); } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.cpp b/src/video_core/renderer_opengl/gl_texture_runtime.cpp index 6230f7037..abff64276 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.cpp +++ b/src/video_core/renderer_opengl/gl_texture_runtime.cpp @@ -124,8 +124,9 @@ const FormatTuple& TextureRuntime::GetFormatTuple(VideoCore::PixelFormat pixel_f return DEFAULT_TUPLE; } -void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload, +void TextureRuntime::FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest) { + const VideoCore::PixelFormat format = surface.pixel_format; if (format == VideoCore::PixelFormat::RGBA8 && 
driver.IsOpenGLES()) { Pica::Texture::ConvertABGRToRGBA(source, dest); } else if (format == VideoCore::PixelFormat::RGB8 && driver.IsOpenGLES()) { diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.h b/src/video_core/renderer_opengl/gl_texture_runtime.h index 7843e495c..b96efe187 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.h +++ b/src/video_core/renderer_opengl/gl_texture_runtime.h @@ -70,8 +70,10 @@ public: /// Returns the OpenGL format tuple associated with the provided pixel format const FormatTuple& GetFormatTuple(VideoCore::PixelFormat pixel_format); + void Finish() const {} + /// Performs required format convertions on the staging data - void FormatConvert(VideoCore::PixelFormat format, bool upload, + void FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest); /// Allocates an OpenGL texture with the specified dimentions and format @@ -136,6 +138,11 @@ public: /// Downloads pixel data to staging from a rectangle region of the surface texture void Download(const VideoCore::BufferTextureCopy& download, const StagingBuffer& staging); + /// Returns the bpp of the internal surface format + u32 GetInternalBytesPerPixel() const { + return VideoCore::GetBytesPerPixel(pixel_format); + } + private: /// Downloads scaled image by downscaling the requested rectangle void ScaledDownload(const VideoCore::BufferTextureCopy& download); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 43f30deec..f8537e606 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -183,7 +183,7 @@ static std::array MakeOrthographicMatrix(float width, float height } RendererVulkan::RendererVulkan(Frontend::EmuWindow& window) - : RendererBase{window}, instance{window}, scheduler{instance}, renderpass_cache{instance, scheduler}, + : RendererBase{window}, instance{window}, scheduler{instance, *this}, 
renderpass_cache{instance, scheduler}, runtime{instance, scheduler, renderpass_cache}, swapchain{instance, renderpass_cache}, vertex_buffer{instance, scheduler, VERTEX_BUFFER_SIZE, vk::BufferUsageFlagBits::eVertexBuffer, {}} { @@ -626,7 +626,7 @@ void RendererVulkan::BuildPipelines() { void RendererVulkan::ConfigureFramebufferTexture(TextureInfo& texture, const GPU::Regs::FramebufferConfig& framebuffer) { TextureInfo old_texture = texture; - texture = TextureInfo { + texture = TextureInfo{ .alloc = runtime.Allocate(framebuffer.width, framebuffer.height, VideoCore::PixelFormatFromGPUPixelFormat(framebuffer.color_format), VideoCore::TextureType::Texture2D), @@ -1035,21 +1035,24 @@ void RendererVulkan::SwapBuffers() { DrawScreens(layout, false); - // Flush all buffers to make the data visible to the GPU before submitting - rasterizer->FlushBuffers(); - vertex_buffer.Flush(); - scheduler.Submit(SubmitMode::SwapchainSynced); swapchain.Present(present_ready); +} - // Inform texture runtime about the switch - runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex()); +void RendererVulkan::FlushBuffers() { + vertex_buffer.Flush(); + rasterizer->FlushBuffers(); +} +void RendererVulkan::OnSlotSwitch() { // When the command buffer switches, all state becomes undefined. 
// This is problematic with dynamic states, so set all states here if (instance.IsExtendedDynamicStateSupported()) { rasterizer->SyncFixedState(); } + + runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex()); + rasterizer->pipeline_cache.MarkDirty(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 8f4153311..705eee9af 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -72,6 +72,8 @@ public: void PrepareVideoDumping() override {} void CleanupVideoDumping() override {} void Sync() override; + void FlushBuffers(); + void OnSlotSwitch(); private: void ReloadSampler(); diff --git a/src/video_core/renderer_vulkan/vk_common.h b/src/video_core/renderer_vulkan/vk_common.h index ff01bb2f2..6e1954ffa 100644 --- a/src/video_core/renderer_vulkan/vk_common.h +++ b/src/video_core/renderer_vulkan/vk_common.h @@ -26,11 +26,11 @@ constexpr vk::ImageAspectFlags GetImageAspect(vk::Format format) { switch (format) { case vk::Format::eD16UnormS8Uint: case vk::Format::eD24UnormS8Uint: - case vk::Format::eX8D24UnormPack32: case vk::Format::eD32SfloatS8Uint: return vk::ImageAspectFlagBits::eStencil | vk::ImageAspectFlagBits::eDepth; break; case vk::Format::eD16Unorm: + case vk::Format::eX8D24UnormPack32: case vk::Format::eD32Sfloat: return vk::ImageAspectFlagBits::eDepth; break; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index b8791dbab..8c9a72a97 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -11,6 +11,33 @@ namespace Vulkan { +vk::Format ToVkFormat(VideoCore::PixelFormat format) { + switch (format) { + case VideoCore::PixelFormat::RGBA8: + return vk::Format::eR8G8B8A8Unorm; + case VideoCore::PixelFormat::RGB8: + return vk::Format::eB8G8R8Unorm; + case VideoCore::PixelFormat::RGB5A1: + return 
vk::Format::eR5G5B5A1UnormPack16; + case VideoCore::PixelFormat::RGB565: + return vk::Format::eR5G6B5UnormPack16; + case VideoCore::PixelFormat::RGBA4: + return vk::Format::eR4G4B4A4UnormPack16; + case VideoCore::PixelFormat::D16: + return vk::Format::eD16Unorm; + case VideoCore::PixelFormat::D24: + return vk::Format::eX8D24UnormPack32; + case VideoCore::PixelFormat::D24S8: + return vk::Format::eD24UnormS8Uint; + case VideoCore::PixelFormat::Invalid: + LOG_ERROR(Render_Vulkan, "Unknown texture format {}!", format); + return vk::Format::eUndefined; + default: + // Use default case for the texture formats + return vk::Format::eR8G8B8A8Unorm; + } +} + Instance::Instance(Frontend::EmuWindow& window) { auto window_info = window.GetWindowInfo(); @@ -54,6 +81,7 @@ Instance::Instance(Frontend::EmuWindow& window) { device_properties = physical_device.getProperties(); CreateDevice(); + CreateFormatTable(); } Instance::~Instance() { @@ -64,50 +92,99 @@ Instance::~Instance() { instance.destroy(); } -bool Instance::IsFormatSupported(vk::Format format, vk::FormatFeatureFlags usage) const { - static std::unordered_map supported; - if (auto it = supported.find(format); it != supported.end()) { - return (it->second.optimalTilingFeatures & usage) == usage; +FormatTraits Instance::GetTraits(VideoCore::PixelFormat pixel_format) const { + if (pixel_format == VideoCore::PixelFormat::Invalid) [[unlikely]] { + return FormatTraits{}; } - // Cache format properties so we don't have to query the driver all the time - const vk::FormatProperties properties = physical_device.getFormatProperties(format); - supported.insert(std::make_pair(format, properties)); - - return (properties.optimalTilingFeatures & usage) == usage; + const u32 index = static_cast(pixel_format); + return format_table[index]; } -vk::Format Instance::GetFormatAlternative(vk::Format format) const { - if (format == vk::Format::eUndefined) { - return format; - } +void Instance::CreateFormatTable() { + constexpr std::array 
pixel_formats = { + VideoCore::PixelFormat::RGBA8, + VideoCore::PixelFormat::RGB8, + VideoCore::PixelFormat::RGB5A1, + VideoCore::PixelFormat::RGB565, + VideoCore::PixelFormat::RGBA4, + VideoCore::PixelFormat::IA8, + VideoCore::PixelFormat::RG8, + VideoCore::PixelFormat::I8, + VideoCore::PixelFormat::A8, + VideoCore::PixelFormat::IA4, + VideoCore::PixelFormat::I4, + VideoCore::PixelFormat::A4, + VideoCore::PixelFormat::ETC1, + VideoCore::PixelFormat::ETC1A4, + VideoCore::PixelFormat::D16, + VideoCore::PixelFormat::D24, + VideoCore::PixelFormat::D24S8 + }; - vk::FormatFeatureFlags features = GetFormatFeatures(GetImageAspect(format)); - if (IsFormatSupported(format, features)) { - return format; - } + const vk::FormatFeatureFlags storage_usage = vk::FormatFeatureFlagBits::eStorageImage; + const vk::FormatFeatureFlags blit_usage = vk::FormatFeatureFlagBits::eSampledImage | + vk::FormatFeatureFlagBits::eTransferDst | + vk::FormatFeatureFlagBits::eTransferSrc | + vk::FormatFeatureFlagBits::eBlitSrc | + vk::FormatFeatureFlagBits::eBlitDst; - // Return the most supported alternative format preferably with the - // same block size according to the Vulkan spec. - // See 43.3. 
Required Format Support of the Vulkan spec - switch (format) { - case vk::Format::eD24UnormS8Uint: - return vk::Format::eD32SfloatS8Uint; - case vk::Format::eX8D24UnormPack32: - return vk::Format::eD32Sfloat; - case vk::Format::eR5G5B5A1UnormPack16: - return vk::Format::eA1R5G5B5UnormPack16; - case vk::Format::eR8G8B8Unorm: - return vk::Format::eR8G8B8A8Unorm; - case vk::Format::eUndefined: - return vk::Format::eUndefined; - case vk::Format::eR4G4B4A4UnormPack16: - // B4G4R4A4 is not guaranteed by the spec to support attachments - return GetFormatAlternative(vk::Format::eB4G4R4A4UnormPack16); - default: - LOG_WARNING(Render_Vulkan, "Format {} doesn't support attachments, falling back to RGBA8", - vk::to_string(format)); - return vk::Format::eR8G8B8A8Unorm; + for (const auto& pixel_format : pixel_formats) { + const vk::Format format = ToVkFormat(pixel_format); + const vk::FormatProperties properties = physical_device.getFormatProperties(format); + const vk::ImageAspectFlags aspect = GetImageAspect(format); + + const vk::FormatFeatureFlagBits attachment_usage = (aspect & vk::ImageAspectFlagBits::eDepth) ? + vk::FormatFeatureFlagBits::eDepthStencilAttachment : + vk::FormatFeatureFlagBits::eColorAttachment; + + const bool supports_blit = + (properties.optimalTilingFeatures & blit_usage) == blit_usage; + const bool supports_attachment = + (properties.optimalTilingFeatures & attachment_usage) == attachment_usage; + const bool supports_storage = + (properties.optimalTilingFeatures & storage_usage) == storage_usage; + + // Find the most inclusive usage flags for this format + vk::ImageUsageFlags best_usage; + if (supports_blit) { + best_usage |= vk::ImageUsageFlagBits::eSampled | + vk::ImageUsageFlagBits::eTransferDst | + vk::ImageUsageFlagBits::eTransferSrc; + } + if (supports_attachment) { + best_usage |= (aspect & vk::ImageAspectFlagBits::eDepth) ? 
+ vk::ImageUsageFlagBits::eDepthStencilAttachment : + vk::ImageUsageFlagBits::eColorAttachment; + } + if (supports_storage) { + best_usage |= vk::ImageUsageFlagBits::eStorage; + } + + // Always fallback to RGBA8 or D32(S8) for convenience + vk::Format fallback = vk::Format::eR8G8B8A8Unorm; + if (aspect & vk::ImageAspectFlagBits::eDepth) { + fallback = vk::Format::eD32Sfloat; + if (aspect & vk::ImageAspectFlagBits::eStencil) { + fallback = vk::Format::eD32SfloatS8Uint; + } + } + + // Report completely unsupported formats + if (!supports_blit && !supports_attachment && !supports_storage) { + LOG_WARNING(Render_Vulkan, "Format {} unsupported, falling back unconditionally to {}", + vk::to_string(format), vk::to_string(fallback)); + } + + const u32 index = static_cast(pixel_format); + format_table[index] = FormatTraits{ + .blit_support = supports_blit, + .attachment_support = supports_attachment, + .storage_support = supports_storage, + .usage = best_usage, + .native = format, + .fallback = fallback + }; } } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 3297e3a31..fec584608 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -4,8 +4,9 @@ #pragma once -#include -#include "common/common_types.h" +#include +#include +#include "video_core/rasterizer_cache/pixel_format.h" #include "video_core/renderer_vulkan/vk_common.h" namespace Frontend { @@ -14,17 +15,23 @@ class EmuWindow; namespace Vulkan { +struct FormatTraits { + bool blit_support = false; ///< True if the format supports omnidirectonal blit operations + bool attachment_support = false; ///< True if the format supports being used as an attachment + bool storage_support = false; ///< True if the format supports storage operations + vk::ImageUsageFlags usage{}; ///< Most supported usage for the native format + vk::Format native = vk::Format::eUndefined; ///< Closest possible native format + 
vk::Format fallback = vk::Format::eUndefined; ///< Best fallback format +}; + /// The global Vulkan instance class Instance { public: Instance(Frontend::EmuWindow& window); ~Instance(); - /// Returns true when the format supports the provided feature flags - bool IsFormatSupported(vk::Format format, vk::FormatFeatureFlags usage) const; - - /// Returns the most compatible format that supports the provided feature flags - vk::Format GetFormatAlternative(vk::Format format) const; + /// Returns the FormatTraits struct for the provided pixel format + FormatTraits GetTraits(VideoCore::PixelFormat pixel_format) const; /// Returns the Vulkan instance vk::Instance GetInstance() const { @@ -103,6 +110,12 @@ public: } private: + /// Returns the optimal supported usage for the requested format + vk::FormatFeatureFlags GetFormatFeatures(vk::Format format); + + /// Creates the format compatibility table for the current device + void CreateFormatTable(); + /// Creates the logical device opportunistically enabling extensions bool CreateDevice(); @@ -118,9 +131,9 @@ private: VmaAllocator allocator; vk::Queue present_queue; vk::Queue graphics_queue; + std::array format_table; u32 present_queue_family_index = 0; u32 graphics_queue_family_index = 0; - bool timeline_semaphores = false; bool extended_dynamic_state = false; bool push_descriptors = false; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 69da9797b..575e8f780 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -180,13 +180,6 @@ PipelineCache::~PipelineCache() { void PipelineCache::BindPipeline(const PipelineInfo& info) { ApplyDynamic(info); - // When texture downloads occur the runtime will flush the GPU and cause - // a scheduler slot switch behind our back. This might invalidate any - // cached descriptor sets/require pipeline rebinding. 
- if (timestamp != scheduler.GetHostFenceCounter()) { - MarkDirty(); - } - u64 shader_hash = 0; for (u32 i = 0; i < MAX_SHADER_STAGES; i++) { shader_hash = Common::HashCombine(shader_hash, shader_hashes[i]); @@ -313,7 +306,6 @@ void PipelineCache::SetScissor(s32 x, s32 y, u32 width, u32 height) { void PipelineCache::MarkDirty() { descriptor_dirty.fill(true); current_pipeline = VK_NULL_HANDLE; - timestamp = scheduler.GetHostFenceCounter(); } void PipelineCache::ApplyDynamic(const PipelineInfo& info) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 46cd21be8..23456f77e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -248,7 +248,6 @@ private: std::array update_data{}; std::array descriptor_dirty{}; std::array descriptor_sets; - u64 timestamp = 0; // Bound shader modules enum ProgramType : u32 { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d958de7f6..5b190669f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -94,9 +94,9 @@ constexpr VertexLayout RasterizerVulkan::HardwareVertex::GetVertexLayout() { } constexpr u32 VERTEX_BUFFER_SIZE = 128 * 1024 * 1024; -constexpr u32 INDEX_BUFFER_SIZE = 2 * 1024 * 1024; -constexpr u32 UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024; -constexpr u32 TEXTURE_BUFFER_SIZE = 2 * 1024 * 1024; +constexpr u32 INDEX_BUFFER_SIZE = 8 * 1024 * 1024; +constexpr u32 UNIFORM_BUFFER_SIZE = 16 * 1024 * 1024; +constexpr u32 TEXTURE_BUFFER_SIZE = 16 * 1024 * 1024; constexpr std::array TEXTURE_BUFFER_LF_FORMATS = { vk::Format::eR32G32Sfloat @@ -188,6 +188,7 @@ RasterizerVulkan::~RasterizerVulkan() { vmaDestroyImage(allocator, default_texture.image, default_texture.allocation); device.destroyImageView(default_texture.image_view); + 
device.destroyImageView(default_texture.base_view); device.destroySampler(default_sampler); } @@ -598,12 +599,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { surfaces_rect.bottom, surfaces_rect.top)) }; - // Sync the viewport - pipeline_cache.SetViewport(surfaces_rect.left + viewport_rect_unscaled.left * res_scale, - surfaces_rect.bottom + viewport_rect_unscaled.bottom * res_scale, - viewport_rect_unscaled.GetWidth() * res_scale, - viewport_rect_unscaled.GetHeight() * res_scale); - if (uniform_block_data.data.framebuffer_scale != res_scale) { uniform_block_data.data.framebuffer_scale = res_scale; uniform_block_data.dirty = true; @@ -678,8 +673,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } }; - vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); - // Sync and bind the texture surfaces const auto pica_textures = regs.texturing.GetTextures(); for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { @@ -725,7 +718,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto surface = res_cache.GetTextureCube(config); if (surface != nullptr) { - runtime.Transition(command_buffer, surface->alloc, + runtime.Transition(scheduler.GetRenderCommandBuffer(), surface->alloc, vk::ImageLayout::eShaderReadOnlyOptimal, 0, surface->alloc.levels, 0, 6); pipeline_cache.BindTexture(3, surface->alloc.image_view); @@ -746,7 +739,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto surface = res_cache.GetTextureSurface(texture); if (surface != nullptr) { - runtime.Transition(command_buffer, surface->alloc, + runtime.Transition(scheduler.GetRenderCommandBuffer(), surface->alloc, vk::ImageLayout::eShaderReadOnlyOptimal, 0, surface->alloc.levels); CheckBarrier(surface->alloc.image_view, texture_index); @@ -767,6 +760,15 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } } + // NOTE: From here onwards its a safe zone to set the draw state, doing 
that any earlier will cause + // issues as the rasterizer cache might cause a scheduler switch and invalidate our state + + // Sync the viewport + pipeline_cache.SetViewport(surfaces_rect.left + viewport_rect_unscaled.left * res_scale, + surfaces_rect.bottom + viewport_rect_unscaled.bottom * res_scale, + viewport_rect_unscaled.GetWidth() * res_scale, + viewport_rect_unscaled.GetHeight() * res_scale); + // Sync and bind the shader if (shader_dirty) { SetShader(); @@ -786,8 +788,8 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto valid_surface = color_surface ? color_surface : depth_surface; const FramebufferInfo framebuffer_info = { - .color = color_surface ? color_surface->alloc.image_view : VK_NULL_HANDLE, - .depth = depth_surface ? depth_surface->alloc.image_view : VK_NULL_HANDLE, + .color = color_surface ? color_surface->GetFramebufferView() : VK_NULL_HANDLE, + .depth = depth_surface ? depth_surface->GetFramebufferView() : VK_NULL_HANDLE, .renderpass = renderpass_cache.GetRenderpass(pipeline_info.color_attachment, pipeline_info.depth_attachment, false), .width = valid_surface->GetScaledWidth(), @@ -799,6 +801,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { it->second = CreateFramebuffer(framebuffer_info); } + vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); if (color_surface) { runtime.Transition(command_buffer, color_surface->alloc, vk::ImageLayout::eColorAttachmentOptimal, @@ -1509,15 +1512,9 @@ bool RasterizerVulkan::AccelerateTextureCopy(const GPU::Regs::DisplayTransferCon const bool load_gap = output_gap != 0; auto [dst_surface, dst_rect] = res_cache.GetSurfaceSubRect(dst_params, VideoCore::ScaleMatch::Upscale, load_gap); - if (dst_surface == nullptr) { - return false; - } - if (dst_surface->type == VideoCore::SurfaceType::Texture) { - return false; - } - - if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { + if (!dst_surface || dst_surface->type == 
VideoCore::SurfaceType::Texture || + !res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { return false; } diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp index 5f7a3a7b0..fd9d96af7 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp @@ -10,24 +10,23 @@ namespace Vulkan { -vk::Format ToVkFormatColor(u32 index) { +VideoCore::PixelFormat ToFormatColor(u32 index) { switch (index) { - case 0: return vk::Format::eR8G8B8A8Unorm; - case 1: return vk::Format::eR8G8B8Unorm; - case 2: return vk::Format::eR5G5B5A1UnormPack16; - case 3: return vk::Format::eR5G6B5UnormPack16; - case 4: return vk::Format::eR4G4B4A4UnormPack16; - default: return vk::Format::eUndefined; + case 0: return VideoCore::PixelFormat::RGBA8; + case 1: return VideoCore::PixelFormat::RGB8; + case 2: return VideoCore::PixelFormat::RGB5A1; + case 3: return VideoCore::PixelFormat::RGB565; + case 4: return VideoCore::PixelFormat::RGBA4; + default: return VideoCore::PixelFormat::Invalid; } } -vk::Format ToVkFormatDepth(u32 index) { +VideoCore::PixelFormat ToFormatDepth(u32 index) { switch (index) { - case 0: return vk::Format::eD16Unorm; - case 1: return vk::Format::eX8D24UnormPack32; - // Notice the similar gap in PixelFormat - case 3: return vk::Format::eD24UnormS8Uint; - default: return vk::Format::eUndefined; + case 0: return VideoCore::PixelFormat::D16; + case 1: return VideoCore::PixelFormat::D24; + case 3: return VideoCore::PixelFormat::D24S8; + default: return VideoCore::PixelFormat::Invalid; } } @@ -36,21 +35,23 @@ RenderpassCache::RenderpassCache(const Instance& instance, TaskScheduler& schedu // Pre-create all needed renderpasses by the renderer for (u32 color = 0; color <= MAX_COLOR_FORMATS; color++) { for (u32 depth = 0; depth <= MAX_DEPTH_FORMATS; depth++) { - const vk::Format color_format = - 
instance.GetFormatAlternative(ToVkFormatColor(color)); - const vk::Format depth_stencil_format = - instance.GetFormatAlternative(ToVkFormatDepth(depth)); + const FormatTraits color_traits = instance.GetTraits(ToFormatColor(color)); + const FormatTraits depth_traits = instance.GetTraits(ToFormatDepth(depth)); - if (color_format == vk::Format::eUndefined && - depth_stencil_format == vk::Format::eUndefined) { + const vk::Format color_format = + color_traits.attachment_support ? color_traits.native : color_traits.fallback; + const vk::Format depth_format = + depth_traits.attachment_support ? depth_traits.native : depth_traits.fallback; + + if (color_format == vk::Format::eUndefined && depth_format == vk::Format::eUndefined) { continue; } - cached_renderpasses[color][depth][0] = CreateRenderPass(color_format, depth_stencil_format, + cached_renderpasses[color][depth][0] = CreateRenderPass(color_format, depth_format, vk::AttachmentLoadOp::eLoad, vk::ImageLayout::eColorAttachmentOptimal, vk::ImageLayout::eColorAttachmentOptimal); - cached_renderpasses[color][depth][1] = CreateRenderPass(color_format, depth_stencil_format, + cached_renderpasses[color][depth][1] = CreateRenderPass(color_format, depth_format, vk::AttachmentLoadOp::eClear, vk::ImageLayout::eColorAttachmentOptimal, vk::ImageLayout::eColorAttachmentOptimal); diff --git a/src/video_core/renderer_vulkan/vk_task_scheduler.cpp b/src/video_core/renderer_vulkan/vk_task_scheduler.cpp index 08b3f7b96..dc596991c 100644 --- a/src/video_core/renderer_vulkan/vk_task_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_task_scheduler.cpp @@ -5,12 +5,14 @@ #define VULKAN_HPP_NO_CONSTRUCTORS #include "common/assert.h" #include "common/logging/log.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include "video_core/renderer_vulkan/vk_instance.h" namespace Vulkan { -TaskScheduler::TaskScheduler(const Instance& instance) : instance{instance} { 
+TaskScheduler::TaskScheduler(const Instance& instance, RendererVulkan& renderer) + : instance{instance}, renderer{renderer} { vk::Device device = instance.GetDevice(); const vk::CommandPoolCreateInfo command_pool_info = { .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer, @@ -97,11 +99,7 @@ void TaskScheduler::Synchronize(u32 slot) { const auto& command = commands[slot]; vk::Device device = instance.GetDevice(); - u32 completed_counter = completed_fence_counter; - if (instance.IsTimelineSemaphoreSupported()) { - completed_counter = device.getSemaphoreCounterValue(timeline); - } - + u32 completed_counter = GetFenceCounter(); if (command.fence_counter > completed_counter) { if (instance.IsTimelineSemaphoreSupported()) { const vk::SemaphoreWaitInfo wait_info = { @@ -127,6 +125,10 @@ void TaskScheduler::Synchronize(u32 slot) { } void TaskScheduler::Submit(SubmitMode mode) { + if (False(mode & SubmitMode::Shutdown)) { + renderer.FlushBuffers(); + } + const auto& command = commands[current_command]; command.render_command_buffer.end(); if (command.use_upload_buffer) { @@ -206,6 +208,7 @@ void TaskScheduler::Submit(SubmitMode mode) { // Switch to next cmdbuffer. 
if (False(mode & SubmitMode::Shutdown)) { SwitchSlot(); + renderer.OnSlotSwitch(); } } diff --git a/src/video_core/renderer_vulkan/vk_task_scheduler.h b/src/video_core/renderer_vulkan/vk_task_scheduler.h index 6cfb7cead..da3ab5860 100644 --- a/src/video_core/renderer_vulkan/vk_task_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_task_scheduler.h @@ -15,6 +15,7 @@ namespace Vulkan { class Buffer; class Instance; +class RendererVulkan; enum class SubmitMode : u8 { SwapchainSynced = 1 << 0, ///< Synchronizes command buffer execution with the swapchain @@ -26,7 +27,7 @@ DECLARE_ENUM_FLAG_OPERATORS(SubmitMode); class TaskScheduler { public: - TaskScheduler(const Instance& instance); + TaskScheduler(const Instance& instance, RendererVulkan& renderer); ~TaskScheduler(); /// Blocks the host until the current command completes execution @@ -74,6 +75,7 @@ private: private: const Instance& instance; + RendererVulkan& renderer; u64 next_fence_counter = 1; u64 completed_fence_counter = 0; diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 7556eac7e..cacfcf651 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -8,36 +8,10 @@ #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_runtime.h" +#include namespace Vulkan { -vk::Format ToVkFormat(VideoCore::PixelFormat format) { - switch (format) { - case VideoCore::PixelFormat::RGBA8: - return vk::Format::eR8G8B8A8Unorm; - case VideoCore::PixelFormat::RGB8: - return vk::Format::eR8G8B8Unorm; - case VideoCore::PixelFormat::RGB5A1: - return vk::Format::eR5G5B5A1UnormPack16; - case VideoCore::PixelFormat::RGB565: - return vk::Format::eR5G6B5UnormPack16; - case VideoCore::PixelFormat::RGBA4: - return vk::Format::eR4G4B4A4UnormPack16; - case VideoCore::PixelFormat::D16: - 
return vk::Format::eD16Unorm; - case VideoCore::PixelFormat::D24: - return vk::Format::eX8D24UnormPack32; - case VideoCore::PixelFormat::D24S8: - return vk::Format::eD24UnormS8Uint; - case VideoCore::PixelFormat::Invalid: - LOG_ERROR(Render_Vulkan, "Unknown texture format {}!", format); - return vk::Format::eUndefined; - default: - // Use default case for the texture formats - return vk::Format::eR8G8B8A8Unorm; - } -} - vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) { switch (type) { case VideoCore::SurfaceType::Color: @@ -55,23 +29,7 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) { return vk::ImageAspectFlagBits::eColor; } -vk::FormatFeatureFlagBits ToVkFormatFeatures(VideoCore::SurfaceType type) { - switch (type) { - case VideoCore::SurfaceType::Color: - case VideoCore::SurfaceType::Texture: - case VideoCore::SurfaceType::Fill: - return vk::FormatFeatureFlagBits::eColorAttachment; - case VideoCore::SurfaceType::Depth: - case VideoCore::SurfaceType::DepthStencil: - return vk::FormatFeatureFlagBits::eDepthStencilAttachment; - default: - UNREACHABLE_MSG("Invalid surface type!"); - } - - return vk::FormatFeatureFlagBits::eColorAttachment; -} - -constexpr u32 STAGING_BUFFER_SIZE = 16 * 1024 * 1024; +constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024; TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler, RenderpassCache& renderpass_cache) @@ -92,6 +50,7 @@ TextureRuntime::~TextureRuntime() { for (const auto& [key, alloc] : texture_recycler) { vmaDestroyImage(allocator, alloc.image, alloc.allocation); device.destroyImageView(alloc.image_view); + device.destroyImageView(alloc.base_view); } for (const auto& [key, framebuffer] : clear_framebuffers) { @@ -118,6 +77,10 @@ StagingData TextureRuntime::FindStaging(u32 size, bool upload) { }; } +void TextureRuntime::Finish() { + scheduler.Submit(SubmitMode::Flush); +} + void TextureRuntime::OnSlotSwitch(u32 new_slot) { staging_offsets[new_slot] = 0; } @@ -140,9 
+103,12 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma return alloc; } - // Create a new allocation - vk::Format vk_format = instance.GetFormatAlternative(ToVkFormat(format)); - vk::ImageAspectFlags aspect = GetImageAspect(vk_format); + const FormatTraits traits = instance.GetTraits(format); + const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format)); + + const bool is_suitable = traits.blit_support && traits.attachment_support; + const vk::Format vk_format = is_suitable ? traits.native : traits.fallback; + const vk::ImageUsageFlags vk_usage = is_suitable ? traits.usage : GetImageUsage(aspect); const u32 levels = std::bit_width(std::max(width, height)); const vk::ImageCreateInfo image_info = { @@ -155,7 +121,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma .mipLevels = levels, .arrayLayers = layers, .samples = vk::SampleCountFlagBits::e1, - .usage = GetImageUsage(aspect), + .usage = vk_usage }; const VmaAllocationCreateInfo alloc_info = { @@ -174,8 +140,23 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma } vk::Image image = vk::Image{unsafe_image}; - const vk::ImageViewCreateInfo view_info = { + .image = image, + .viewType = type == VideoCore::TextureType::CubeMap ? + vk::ImageViewType::eCube : + vk::ImageViewType::e2D, + .format = vk_format, + .subresourceRange = { + .aspectMask = aspect, + .baseMipLevel = 0, + .levelCount = levels, + .baseArrayLayer = 0, + .layerCount = layers + } + }; + + // Also create a base mip view in case this is used as an attachment + const vk::ImageViewCreateInfo base_view_info = { .image = image, .viewType = type == VideoCore::TextureType::CubeMap ? 
vk::ImageViewType::eCube : @@ -192,13 +173,17 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma vk::Device device = instance.GetDevice(); vk::ImageView image_view = device.createImageView(view_info); + vk::ImageView base_view = device.createImageView(base_view_info); return ImageAlloc{ .image = image, .image_view = image_view, + .base_view = base_view, .allocation = allocation, + .format = vk_format, .aspect = aspect, .levels = levels, + .layers = layers }; } @@ -206,21 +191,45 @@ void TextureRuntime::Recycle(const VideoCore::HostTextureTag tag, ImageAlloc&& a texture_recycler.emplace(tag, std::move(alloc)); } -void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload, +void TextureRuntime::FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest) { - const VideoCore::SurfaceType type = VideoCore::GetFormatType(format); - const vk::FormatFeatureFlagBits feature = ToVkFormatFeatures(type); - - if (format == VideoCore::PixelFormat::RGBA8) { - return Pica::Texture::ConvertABGRToRGBA(source, dest); - } else if (format == VideoCore::PixelFormat::RGB8 && upload) { - return Pica::Texture::ConvertBGRToRGBA(source, dest); - } else if (instance.IsFormatSupported(ToVkFormat(format), feature)) { - std::memcpy(dest.data(), source.data(), source.size()); - } else { - LOG_CRITICAL(Render_Vulkan, "Unimplemented converion for format {}!", format); + if (!surface.NeedsConvert()) { std::memcpy(dest.data(), source.data(), source.size()); + return; } + + // Since this is the most common case handle it separately + if (surface.pixel_format == VideoCore::PixelFormat::RGBA8) { + return Pica::Texture::ConvertABGRToRGBA(source, dest); + } + + // Handle simple D24S8 interleave case + if (surface.GetInternalFormat() == vk::Format::eD24UnormS8Uint) { + return Pica::Texture::InterleaveD24S8(source, dest); + } + + if (upload) { + switch (surface.pixel_format) { + case VideoCore::PixelFormat::RGB8: + return 
Pica::Texture::ConvertBGRToRGBA(source, dest); + case VideoCore::PixelFormat::RGBA4: + return Pica::Texture::ConvertRGBA4ToRGBA8(source, dest); + default: + break; + } + } else { + switch (surface.pixel_format) { + case VideoCore::PixelFormat::D24S8: + return Pica::Texture::ConvertD32S8ToD24S8(source, dest); + case VideoCore::PixelFormat::RGBA4: + return Pica::Texture::ConvertRGBA8ToRGBA4(source, dest); + default: + break; + } + } + + LOG_WARNING(Render_Vulkan, "Missing format convertion: {} {} {}", + vk::to_string(surface.traits.native), upload ? "->" : "<-", vk::to_string(surface.alloc.format)); } bool TextureRuntime::ClearTexture(Surface& surface, const VideoCore::TextureClear& clear, @@ -276,11 +285,12 @@ bool TextureRuntime::ClearTexture(Surface& surface, const VideoCore::TextureClea } auto [it, new_framebuffer] = clear_framebuffers.try_emplace(alloc.image_view, vk::Framebuffer{}); - if (new_framebuffer) { + if (new_framebuffer) {\ + const vk::ImageView framebuffer_view = surface.GetFramebufferView(); const vk::FramebufferCreateInfo framebuffer_info = { .renderPass = clear_renderpass, .attachmentCount = 1, - .pAttachments = &alloc.image_view, + .pAttachments = &framebuffer_view, .width = surface.GetScaledWidth(), .height = surface.GetScaledHeight(), .layers = 1 @@ -377,7 +387,7 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest, const VideoCor command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal, dest.alloc.image, vk::ImageLayout::eTransferDstOptimal, - blit_area, vk::Filter::eLinear); + blit_area, vk::Filter::eNearest); return true; } @@ -528,7 +538,7 @@ void TextureRuntime::Transition(vk::CommandBuffer command_buffer, ImageAlloc& al Surface::Surface(VideoCore::SurfaceParams& params, TextureRuntime& runtime) : VideoCore::SurfaceBase{params}, runtime{runtime}, instance{runtime.GetInstance()}, - scheduler{runtime.GetScheduler()} { + scheduler{runtime.GetScheduler()}, traits{instance.GetTraits(pixel_format)} { if 
(pixel_format != VideoCore::PixelFormat::Invalid) { alloc = runtime.Allocate(GetScaledWidth(), GetScaledHeight(), params.pixel_format, texture_type); @@ -604,7 +614,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); const VideoCore::Rect2D rect = download.texture_rect; vk::BufferImageCopy copy_region = { - .bufferOffset = staging.buffer_offset, + .bufferOffset = staging.buffer_offset + download.buffer_offset, .bufferRowLength = rect.GetWidth(), .bufferImageHeight = rect.GetHeight(), .imageSubresource = { @@ -624,20 +634,17 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi copy_regions[region_count++] = copy_region; if (alloc.aspect & vk::ImageAspectFlagBits::eStencil) { - return; // HACK: Skip depth + stencil downloads for now - copy_region.bufferOffset += staging.mapped.size(); - copy_region.imageSubresource.aspectMask |= vk::ImageAspectFlagBits::eStencil; + copy_region.bufferOffset += 4 * staging.size / 5; + copy_region.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eStencil; copy_regions[region_count++] = copy_region; } } - runtime.Transition(command_buffer, alloc, vk::ImageLayout::eTransferSrcOptimal, download.texture_level, 1); + runtime.Transition(command_buffer, alloc, vk::ImageLayout::eTransferSrcOptimal, 0, alloc.levels); // Copy pixel data to the staging buffer command_buffer.copyImageToBuffer(alloc.image, vk::ImageLayout::eTransferSrcOptimal, staging.buffer, region_count, copy_regions.data()); - - scheduler.Submit(SubmitMode::Flush); } // Lock this data until the next scheduler switch @@ -645,6 +652,22 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi runtime.staging_offsets[current_slot] += staging.size; } +bool Surface::NeedsConvert() const { + // RGBA8 needs a byteswap since R8G8B8A8UnormPack32 does not exist + // D24S8 always needs an interleave pass even if natively 
supported + return alloc.format != traits.native || + pixel_format == VideoCore::PixelFormat::RGBA8 || + pixel_format == VideoCore::PixelFormat::D24S8; +} + +u32 Surface::GetInternalBytesPerPixel() const { + if (alloc.format == vk::Format::eD32SfloatS8Uint) { + return 8; + } + + return vk::blockSize(alloc.format); +} + void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download) { /*const u32 rect_width = download.texture_rect.GetWidth(); const u32 rect_height = download.texture_rect.GetHeight(); diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.h b/src/video_core/renderer_vulkan/vk_texture_runtime.h index 93d6dde70..60bcb9a09 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.h +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.h @@ -10,6 +10,7 @@ #include "video_core/rasterizer_cache/surface_base.h" #include "video_core/rasterizer_cache/types.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" namespace Vulkan { @@ -24,10 +25,14 @@ struct StagingData { struct ImageAlloc { vk::Image image; vk::ImageView image_view; + vk::ImageView base_view; VmaAllocation allocation; + vk::ImageUsageFlags usage; + vk::Format format; vk::ImageLayout layout = vk::ImageLayout::eUndefined; vk::ImageAspectFlags aspect = vk::ImageAspectFlagBits::eNone; u32 levels = 1; + u32 layers = 1; }; class Instance; @@ -48,6 +53,9 @@ public: /// Maps an internal staging buffer of the provided size of pixel uploads/downloads [[nodiscard]] StagingData FindStaging(u32 size, bool upload); + /// Causes a GPU command flush + void Finish(); + /// Allocates a vulkan image possibly resusing an existing one [[nodiscard]] ImageAlloc Allocate(u32 width, u32 height, VideoCore::PixelFormat format, VideoCore::TextureType type); @@ -56,7 +64,7 @@ public: void Recycle(const VideoCore::HostTextureTag tag, ImageAlloc&& alloc); /// Performs required format 
convertions on the staging data - void FormatConvert(VideoCore::PixelFormat format, bool upload, + void FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest); /// Transitions the mip level range of the surface to new_layout @@ -114,6 +122,27 @@ public: /// Downloads pixel data to staging from a rectangle region of the surface texture void Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging); + /// Returns true if the surface requires pixel data convertion + bool NeedsConvert() const; + + /// Returns the bpp of the internal surface format + u32 GetInternalBytesPerPixel() const; + + /// Returns an image view used to sample the surface from a shader + vk::ImageView GetImageView() const { + return alloc.image_view; + } + + /// Returns an image view used to create a framebuffer + vk::ImageView GetFramebufferView() { + return alloc.base_view; + } + + /// Returns the internal format of the allocated texture + vk::Format GetInternalFormat() const { + return alloc.format; + } + private: /// Downloads scaled image by downscaling the requested rectangle void ScaledDownload(const VideoCore::BufferTextureCopy& download); @@ -128,9 +157,8 @@ private: TextureRuntime& runtime; const Instance& instance; TaskScheduler& scheduler; - ImageAlloc alloc{}; - vk::Format internal_format = vk::Format::eUndefined; + FormatTraits traits; }; struct Traits { diff --git a/src/video_core/texture/texture_decode.cpp b/src/video_core/texture/texture_decode.cpp index 7981feefd..6f306aa80 100644 --- a/src/video_core/texture/texture_decode.cpp +++ b/src/video_core/texture/texture_decode.cpp @@ -233,21 +233,67 @@ void ConvertBGRToRGB(std::span source, std::span des void ConvertBGRToRGBA(std::span source, std::span dest) { u32 j = 0; - for (std::size_t i = 0; i < source.size(); i += 3) { - dest[j] = source[i + 2]; - dest[j + 1] = source[i + 1]; - dest[j + 2] = source[i]; - dest[j + 3] = std::byte{0xFF}; - j += 4; + for (std::size_t i = 0; i < 
dest.size(); i += 4) { + dest[i] = source[j + 2]; + dest[i + 1] = source[j + 1]; + dest[i + 2] = source[j]; + dest[i + 3] = std::byte{0xFF}; + j += 3; } } void ConvertABGRToRGBA(std::span source, std::span dest) { - for (u32 i = 0; i < source.size(); i += 4) { + for (u32 i = 0; i < dest.size(); i += 4) { const u32 abgr = *reinterpret_cast(source.data() + i); const u32 rgba = Common::swap32(abgr); std::memcpy(dest.data() + i, &rgba, 4); } } +void ConvertD32S8ToD24S8(std::span source, std::span dest) { + u32 depth_offset = 0; + u32 stencil_offset = 4 * source.size() / 5; + for (std::size_t i = 0; i < dest.size(); i += 4) { + float depth; + std::memcpy(&depth, source.data() + depth_offset, sizeof(float)); + u32 depth_uint = depth * 0xFFFFFF; + + dest[i] = source[stencil_offset]; + std::memcpy(dest.data() + i + 1, &depth_uint, 3); + + depth_offset += 4; + stencil_offset += 1; + } +} + +void ConvertRGBA4ToRGBA8(std::span source, std::span dest) { + u32 j = 0; + for (std::size_t i = 0; i < dest.size(); i += 4) { + auto rgba = Color::DecodeRGBA4(reinterpret_cast(source.data() + j)); + std::memcpy(dest.data() + i, rgba.AsArray(), sizeof(rgba)); + j += 2; + } +} + +void ConvertRGBA8ToRGBA4(std::span source, std::span dest) { + u32 j = 0; + for (std::size_t i = 0; i < dest.size(); i += 2) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source.data() + j, sizeof(rgba)); + Color::EncodeRGBA4(rgba, reinterpret_cast(dest.data() + i)); + j += 4; + } +} + +void InterleaveD24S8(std::span source, std::span dest) { + u32 depth_offset = 0; + u32 stencil_offset = 3 * source.size() / 4; + for (std::size_t i = 0; i < dest.size(); i += 4) { + dest[i] = source[stencil_offset]; + std::memcpy(dest.data() + i + 1, source.data() + depth_offset, 3); + depth_offset += 3; + stencil_offset += 1; + } +} + } // namespace Pica::Texture diff --git a/src/video_core/texture/texture_decode.h b/src/video_core/texture/texture_decode.h index d8b8238c7..c3967435d 100644 --- 
a/src/video_core/texture/texture_decode.h +++ b/src/video_core/texture/texture_decode.h @@ -80,4 +80,12 @@ void ConvertBGRToRGBA(std::span source, std::span de */ void ConvertABGRToRGBA(std::span source, std::span dest); +void ConvertD32S8ToD24S8(std::span source, std::span dest); + +void ConvertRGBA4ToRGBA8(std::span source, std::span dest); + +void ConvertRGBA8ToRGBA4(std::span source, std::span dest); + +void InterleaveD24S8(std::span source, std::span dest); + } // namespace Pica::Texture