diff --git a/src/core/hle/service/cfg/cfg.h b/src/core/hle/service/cfg/cfg.h index e887a9645..0310df3aa 100644 --- a/src/core/hle/service/cfg/cfg.h +++ b/src/core/hle/service/cfg/cfg.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include "common/common_types.h" diff --git a/src/video_core/rasterizer_cache/morton_swizzle.h b/src/video_core/rasterizer_cache/morton_swizzle.h index 108c95924..9fd372176 100644 --- a/src/video_core/rasterizer_cache/morton_swizzle.h +++ b/src/video_core/rasterizer_cache/morton_swizzle.h @@ -135,42 +135,53 @@ inline void MortonCopyTile(u32 stride, std::span tile_buffer, std::sp } } +/** + * @brief Performs morton to/from linear conversions on the provided pixel data + * @param width, height The dimensions of the rectangular region of pixels in linear_buffer + * @param start_offset The number of bytes from the start of the first tile to the start of tiled_buffer + * @param end_offset The number of bytes from the start of the first tile to the end of tiled_buffer + * @param linear_buffer The linear pixel data + * @param tiled_buffer The tiled pixel data + * + * The MortonCopy is at the heart of the PICA texture implementation, as it's responsible for converting between + * linear and morton tiled layouts. The function handles both conversions but there are slightly different + * paths and inputs for each: + * + * Morton to Linear: + * During uploads, tiled_buffer is always aligned to the tile or scanline boundary depending on whether the linear rectangle + * spans multiple vertical tiles. linear_buffer does not reference the entire texture area, but rather the + * specific rectangle affected by the upload. + * + * Linear to Morton: + * This is similar to the other conversion but with some differences. In this case tiled_buffer is not required + * to be aligned to any specific boundary which requires special care. 
start_offset/end_offset are useful + * here as they tell us exactly where the data should be placed in the linear_buffer. + */ template -static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, - std::span linear_buffer, - std::span tiled_buffer) { - +static void MortonCopy(u32 width, u32 height, u32 start_offset, u32 end_offset, + std::span linear_buffer, std::span tiled_buffer) { constexpr u32 bytes_per_pixel = GetFormatBpp(format) / 8; constexpr u32 aligned_bytes_per_pixel = GetBytesPerPixel(format); + constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; static_assert(aligned_bytes_per_pixel >= bytes_per_pixel, ""); - // We could use bytes_per_pixel here but it should be avoided because it - // becomes zero for 4-bit textures! - constexpr u32 tile_size = GetFormatBpp(format) * 64 / 8; - const u32 linear_tile_size = (7 * stride + 8) * aligned_bytes_per_pixel; - - // Does this line have any significance? - //u32 linear_offset = aligned_bytes_per_pixel - bytes_per_pixel; - u32 linear_offset = 0; - u32 tiled_offset = 0; - - const PAddr aligned_down_start_offset = Common::AlignDown(start_offset, tile_size); - const PAddr aligned_start_offset = Common::AlignUp(start_offset, tile_size); - PAddr aligned_end_offset = Common::AlignDown(end_offset, tile_size); + const u32 linear_tile_stride = (7 * width + 8) * aligned_bytes_per_pixel; + const u32 aligned_down_start_offset = Common::AlignDown(start_offset, tile_size); + const u32 aligned_start_offset = Common::AlignUp(start_offset, tile_size); + const u32 aligned_end_offset = Common::AlignDown(end_offset, tile_size); ASSERT(!morton_to_linear || (aligned_start_offset == start_offset && aligned_end_offset == end_offset)); - const u32 begin_pixel_index = aligned_down_start_offset * 8 / GetFormatBpp(format); - u32 x = (begin_pixel_index % (stride * 8)) / 8; - u32 y = (begin_pixel_index / (stride * 8)) * 8; - // In OpenGL the texture origin is in the bottom left corner as opposed to other // APIs that 
have it at the top left. To avoid flipping texture coordinates in - // the shader we read/write the linear buffer backwards - linear_offset += ((height - 8 - y) * stride + x) * aligned_bytes_per_pixel; + // the shader we read/write the linear buffer from the bottom up + u32 linear_offset = ((height - 8) * width) * aligned_bytes_per_pixel; + u32 tiled_offset = 0; + u32 x = 0; + u32 y = 0; - auto linear_next_tile = [&] { - x = (x + 8) % stride; + const auto LinearNextTile = [&] { + x = (x + 8) % width; linear_offset += 8 * aligned_bytes_per_pixel; if (!x) { y = (y + 8) % height; @@ -178,7 +189,7 @@ static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, return; } - linear_offset -= stride * 9 * aligned_bytes_per_pixel; + linear_offset -= width * 9 * aligned_bytes_per_pixel; } }; @@ -186,31 +197,31 @@ static void MortonCopy(u32 stride, u32 height, u32 start_offset, u32 end_offset, // the tile affected to a temporary buffer and copy the part we are interested in if (start_offset < aligned_start_offset && !morton_to_linear) { std::array tmp_buf; - auto linear_data = linear_buffer.last(linear_buffer.size_bytes() - linear_offset); - MortonCopyTile(stride, tmp_buf, linear_data); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data(), tmp_buf.data() + start_offset - aligned_down_start_offset, std::min(aligned_start_offset, end_offset) - start_offset); tiled_offset += aligned_start_offset - start_offset; - linear_next_tile(); + LinearNextTile(); } const u32 buffer_end = tiled_offset + aligned_end_offset - aligned_start_offset; while (tiled_offset < buffer_end) { - auto linear_data = linear_buffer.last(linear_buffer.size_bytes() - linear_offset); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); auto tiled_data = tiled_buffer.subspan(tiled_offset, tile_size); - MortonCopyTile(stride, tiled_data, linear_data); + 
MortonCopyTile(width, tiled_data, linear_data); tiled_offset += tile_size; - linear_next_tile(); + LinearNextTile(); } // If during a texture download the end coordinate is not tile aligned, swizzle // the tile affected to a temporary buffer and copy the part we are interested in if (end_offset > std::max(aligned_start_offset, aligned_end_offset) && !morton_to_linear) { std::array tmp_buf; - auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_size); - MortonCopyTile(stride, tmp_buf, linear_data); + auto linear_data = linear_buffer.subspan(linear_offset, linear_tile_stride); + MortonCopyTile(width, tmp_buf, linear_data); std::memcpy(tiled_buffer.data() + tiled_offset, tmp_buf.data(), end_offset - aligned_end_offset); } } diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h index 7509a0f6e..1a20892fc 100644 --- a/src/video_core/rasterizer_cache/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache/rasterizer_cache.h @@ -163,7 +163,8 @@ private: SurfaceMap dirty_regions; SurfaceSet remove_surfaces; u16 resolution_scale_factor; - + std::vector> download_queue; + std::vector staging_buffer; std::unordered_map texture_cube_cache; std::recursive_mutex mutex; }; @@ -269,9 +270,26 @@ bool RasterizerCache::BlitSurfaces(const Surface& src_surface, Common::Rectan const Surface& dst_surface, Common::Rectangle dst_rect) { MICROPROFILE_SCOPE(RasterizerCache_BlitSurface); - if (CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format)) { - dst_surface->InvalidateAllWatcher(); + if (!CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format)) [[unlikely]] { + return false; + } + dst_surface->InvalidateAllWatcher(); + + // Prefer texture copy over blit if possible + if (src_rect.GetWidth() == dst_rect.GetWidth() && src_rect.bottom < src_rect.top) { + const TextureCopy texture_copy = { + .src_level = 0, + .dst_level = 0, + .src_layer = 0, + .dst_layer = 0, + 
.src_offset = {src_rect.left, src_rect.bottom}, + .dst_offset = {dst_rect.left, dst_rect.bottom}, + .extent = {src_rect.GetWidth(), src_rect.GetHeight()} + }; + + return runtime.CopyTextures(*src_surface, *dst_surface, texture_copy); + } else { const TextureBlit texture_blit = { .src_level = 0, .dst_level = 0, @@ -283,8 +301,6 @@ bool RasterizerCache::BlitSurfaces(const Surface& src_surface, Common::Rectan return runtime.BlitTextures(*src_surface, *dst_surface, texture_blit); } - - return false; } MICROPROFILE_DECLARE(RasterizerCache_CopySurface); @@ -888,35 +904,31 @@ void RasterizerCache::ValidateSurface(const Surface& surface, PAddr addr, u32 MICROPROFILE_DECLARE(RasterizerCache_SurfaceLoad); template void RasterizerCache::UploadSurface(const Surface& surface, SurfaceInterval interval) { - const SurfaceParams info = surface->FromInterval(interval); - const u32 load_start = info.addr; - const u32 load_end = info.end; - ASSERT(load_start >= surface->addr && load_end <= surface->end); + const SurfaceParams load_info = surface->FromInterval(interval); + ASSERT(load_info.addr >= surface->addr && load_info.end <= surface->end); + + MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); const auto& staging = runtime.FindStaging( - surface->width * surface->height * 4, true); - MemoryRef source_ptr = VideoCore::g_memory->GetPhysicalRef(info.addr); + load_info.width * load_info.height * surface->GetInternalBytesPerPixel(), true); + MemoryRef source_ptr = VideoCore::g_memory->GetPhysicalRef(load_info.addr); if (!source_ptr) [[unlikely]] { return; } - const auto upload_data = source_ptr.GetWriteBytes(load_end - load_start); - - MICROPROFILE_SCOPE(RasterizerCache_SurfaceLoad); - + const auto upload_data = source_ptr.GetWriteBytes(load_info.end - load_info.addr); if (surface->is_tiled) { - std::vector unswizzled_data(staging.size); - UnswizzleTexture(*surface, load_start - surface->addr, load_end - surface->addr, - upload_data, unswizzled_data); - 
runtime.FormatConvert(surface->pixel_format, true, unswizzled_data, staging.mapped); + std::vector unswizzled_data(load_info.width * load_info.height * GetBytesPerPixel(load_info.pixel_format)); + UnswizzleTexture(load_info, load_info.addr, load_info.end, upload_data, unswizzled_data); + runtime.FormatConvert(*surface, true, unswizzled_data, staging.mapped); } else { - runtime.FormatConvert(surface->pixel_format, true, upload_data, staging.mapped); + runtime.FormatConvert(*surface, true, upload_data, staging.mapped); } const BufferTextureCopy upload = { .buffer_offset = 0, .buffer_size = staging.size, - .texture_rect = surface->GetSubRect(info), + .texture_rect = surface->GetSubRect(load_info), .texture_level = 0 }; @@ -926,17 +938,17 @@ void RasterizerCache::UploadSurface(const Surface& surface, SurfaceInterval i MICROPROFILE_DECLARE(RasterizerCache_SurfaceFlush); template void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval interval) { + const SurfaceParams flush_info = surface->FromInterval(interval); const u32 flush_start = boost::icl::first(interval); const u32 flush_end = boost::icl::last_next(interval); ASSERT(flush_start >= surface->addr && flush_end <= surface->end); const auto& staging = runtime.FindStaging( - surface->width * surface->height * 4, false); - const SurfaceParams params = surface->FromInterval(interval); + flush_info.width * flush_info.height * surface->GetInternalBytesPerPixel(), false); const BufferTextureCopy download = { .buffer_offset = 0, .buffer_size = staging.size, - .texture_rect = surface->GetSubRect(params), + .texture_rect = surface->GetSubRect(flush_info), .texture_level = 0 }; @@ -948,17 +960,15 @@ void RasterizerCache::DownloadSurface(const Surface& surface, SurfaceInterval } const auto download_dest = dest_ptr.GetWriteBytes(flush_end - flush_start); - - MICROPROFILE_SCOPE(RasterizerCache_SurfaceFlush); - - if (surface->is_tiled) { - std::vector swizzled_data(staging.size); - 
runtime.FormatConvert(surface->pixel_format, false, swizzled_data, swizzled_data); - SwizzleTexture(*surface, flush_start - surface->addr, flush_end - surface->addr, - staging.mapped, download_dest); - } else { - runtime.FormatConvert(surface->pixel_format, false, staging.mapped, download_dest); - } + download_queue.push_back([this, surface, download_dest, flush_start, flush_end, flush_info, mapped = staging.mapped]() { + if (surface->is_tiled) { + std::vector temp_data(flush_info.width * flush_info.height * GetBytesPerPixel(flush_info.pixel_format)); + runtime.FormatConvert(*surface, false, mapped, temp_data); + SwizzleTexture(flush_info, flush_start, flush_end, temp_data, download_dest); + } else { + runtime.FormatConvert(*surface, false, mapped, download_dest); + } + }); } template @@ -1135,6 +1145,17 @@ void RasterizerCache::FlushRegion(PAddr addr, u32 size, Surface flush_surface flushed_intervals += interval; } + // Batch execute all requested downloads. This gives more time for them to complete + // before we issue the CPU to GPU flush and reduces scheduler slot switches in Vulkan + if (!download_queue.empty()) { + runtime.Finish(); + for (auto& download_func : download_queue) { + download_func(); + } + + download_queue.clear(); + } + // Reset dirty regions dirty_regions -= flushed_intervals; } diff --git a/src/video_core/rasterizer_cache/types.h b/src/video_core/rasterizer_cache/types.h index 6b324d930..669ce2ddf 100644 --- a/src/video_core/rasterizer_cache/types.h +++ b/src/video_core/rasterizer_cache/types.h @@ -41,6 +41,8 @@ struct TextureClear { struct TextureCopy { u32 src_level; u32 dst_level; + u32 src_layer; + u32 dst_layer; Offset src_offset; Offset dst_offset; Extent extent; diff --git a/src/video_core/rasterizer_cache/utils.cpp b/src/video_core/rasterizer_cache/utils.cpp index 586565b8d..28c8965b1 100644 --- a/src/video_core/rasterizer_cache/utils.cpp +++ b/src/video_core/rasterizer_cache/utils.cpp @@ -11,18 +11,22 @@ namespace VideoCore { -void 
SwizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, +void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, std::span source_linear, std::span dest_tiled) { - const u32 func_index = static_cast(params.pixel_format); + const u32 func_index = static_cast(swizzle_info.pixel_format); const MortonFunc SwizzleImpl = SWIZZLE_TABLE[func_index]; - SwizzleImpl(params.stride, params.height, start_offset, end_offset, source_linear, dest_tiled); + SwizzleImpl(swizzle_info.width, swizzle_info.height, + start_addr - swizzle_info.addr, end_addr - swizzle_info.addr, + source_linear, dest_tiled); } -void UnswizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, +void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, std::span source_tiled, std::span dest_linear) { - const u32 func_index = static_cast(params.pixel_format); + const u32 func_index = static_cast(unswizzle_info.pixel_format); const MortonFunc UnswizzleImpl = UNSWIZZLE_TABLE[func_index]; - UnswizzleImpl(params.stride, params.height, start_offset, end_offset, dest_linear, source_tiled); + UnswizzleImpl(unswizzle_info.width, unswizzle_info.height, + start_addr - unswizzle_info.addr, end_addr - unswizzle_info.addr, + dest_linear, source_tiled); } ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data) { diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h index 1de5faca1..64ee0d62f 100644 --- a/src/video_core/rasterizer_cache/utils.h +++ b/src/video_core/rasterizer_cache/utils.h @@ -44,19 +44,29 @@ class SurfaceParams; [[nodiscard]] ClearValue MakeClearValue(SurfaceType type, PixelFormat format, const u8* fill_data); -void SwizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, - std::span source_linear, std::span dest_tiled); - /** * Converts a morton swizzled texture to linear format. 
* - * @param params Structure used to query the surface information. - * @param start_offset Is the offset at which the source_tiled span begins + * @param unswizzle_info Structure used to query the surface information. + * @param start_addr The start address of the source_tiled data. + * @param end_addr The end address of the source_tiled data. + * @param source_tiled The tiled data to convert. + * @param dest_linear The output buffer where the generated linear data will be written to. + */ +void UnswizzleTexture(const SurfaceParams& unswizzle_info, PAddr start_addr, PAddr end_addr, + std::span source_tiled, std::span dest_linear); + +/** + * Swizzles a linear texture according to the morton code. + * + * @param swizzle_info Structure used to query the surface information. + * @param start_addr The start address of the dest_tiled data. + * @param end_addr The end address of the dest_tiled data. - * @param source_tiled The source morton swizzled data. - * @param dest_linear The output buffer where the generated linear data will be written to. + * @param source_linear The linear data to swizzle. + * @param dest_tiled The output buffer where the generated tiled data will be written to. 
*/ -void UnswizzleTexture(const SurfaceParams& params, u32 start_offset, u32 end_offset, - std::span source_tiled, std::span dest_linear); +void SwizzleTexture(const SurfaceParams& swizzle_info, PAddr start_addr, PAddr end_addr, + std::span source_linear, std::span dest_tiled); } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.cpp b/src/video_core/renderer_opengl/gl_texture_runtime.cpp index 6230f7037..abff64276 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.cpp +++ b/src/video_core/renderer_opengl/gl_texture_runtime.cpp @@ -124,8 +124,9 @@ const FormatTuple& TextureRuntime::GetFormatTuple(VideoCore::PixelFormat pixel_f return DEFAULT_TUPLE; } -void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload, +void TextureRuntime::FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest) { + const VideoCore::PixelFormat format = surface.pixel_format; if (format == VideoCore::PixelFormat::RGBA8 && driver.IsOpenGLES()) { Pica::Texture::ConvertABGRToRGBA(source, dest); } else if (format == VideoCore::PixelFormat::RGB8 && driver.IsOpenGLES()) { diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.h b/src/video_core/renderer_opengl/gl_texture_runtime.h index 7843e495c..b96efe187 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.h +++ b/src/video_core/renderer_opengl/gl_texture_runtime.h @@ -70,8 +70,10 @@ public: /// Returns the OpenGL format tuple associated with the provided pixel format const FormatTuple& GetFormatTuple(VideoCore::PixelFormat pixel_format); + void Finish() const {} + /// Performs required format convertions on the staging data - void FormatConvert(VideoCore::PixelFormat format, bool upload, + void FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest); /// Allocates an OpenGL texture with the specified dimentions and format @@ -136,6 +138,11 @@ public: /// Downloads pixel data to staging from a rectangle 
region of the surface texture void Download(const VideoCore::BufferTextureCopy& download, const StagingBuffer& staging); + /// Returns the number of bytes per pixel of the internal surface format + u32 GetInternalBytesPerPixel() const { + return VideoCore::GetBytesPerPixel(pixel_format); + } + private: /// Downloads scaled image by downscaling the requested rectangle void ScaledDownload(const VideoCore::BufferTextureCopy& download); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 43f30deec..f8537e606 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -183,7 +183,7 @@ static std::array MakeOrthographicMatrix(float width, float height } RendererVulkan::RendererVulkan(Frontend::EmuWindow& window) - : RendererBase{window}, instance{window}, scheduler{instance}, renderpass_cache{instance, scheduler}, + : RendererBase{window}, instance{window}, scheduler{instance, *this}, renderpass_cache{instance, scheduler}, runtime{instance, scheduler, renderpass_cache}, swapchain{instance, renderpass_cache}, vertex_buffer{instance, scheduler, VERTEX_BUFFER_SIZE, vk::BufferUsageFlagBits::eVertexBuffer, {}} { @@ -626,7 +626,7 @@ void RendererVulkan::BuildPipelines() { void RendererVulkan::ConfigureFramebufferTexture(TextureInfo& texture, const GPU::Regs::FramebufferConfig& framebuffer) { TextureInfo old_texture = texture; - texture = TextureInfo { + texture = TextureInfo{ .alloc = runtime.Allocate(framebuffer.width, framebuffer.height, VideoCore::PixelFormatFromGPUPixelFormat(framebuffer.color_format), VideoCore::TextureType::Texture2D), @@ -1035,21 +1035,24 @@ void RendererVulkan::SwapBuffers() { DrawScreens(layout, false); - // Flush all buffers to make the data visible to the GPU before submitting - rasterizer->FlushBuffers(); - vertex_buffer.Flush(); - scheduler.Submit(SubmitMode::SwapchainSynced); swapchain.Present(present_ready); +} - // Inform texture 
runtime about the switch - runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex()); +void RendererVulkan::FlushBuffers() { + vertex_buffer.Flush(); + rasterizer->FlushBuffers(); +} +void RendererVulkan::OnSlotSwitch() { // When the command buffer switches, all state becomes undefined. // This is problematic with dynamic states, so set all states here if (instance.IsExtendedDynamicStateSupported()) { rasterizer->SyncFixedState(); } + + runtime.OnSlotSwitch(scheduler.GetCurrentSlotIndex()); + rasterizer->pipeline_cache.MarkDirty(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 8f4153311..705eee9af 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -72,6 +72,8 @@ public: void PrepareVideoDumping() override {} void CleanupVideoDumping() override {} void Sync() override; + void FlushBuffers(); + void OnSlotSwitch(); private: void ReloadSampler(); diff --git a/src/video_core/renderer_vulkan/vk_common.h b/src/video_core/renderer_vulkan/vk_common.h index ff01bb2f2..6e1954ffa 100644 --- a/src/video_core/renderer_vulkan/vk_common.h +++ b/src/video_core/renderer_vulkan/vk_common.h @@ -26,11 +26,11 @@ constexpr vk::ImageAspectFlags GetImageAspect(vk::Format format) { switch (format) { case vk::Format::eD16UnormS8Uint: case vk::Format::eD24UnormS8Uint: - case vk::Format::eX8D24UnormPack32: case vk::Format::eD32SfloatS8Uint: return vk::ImageAspectFlagBits::eStencil | vk::ImageAspectFlagBits::eDepth; break; case vk::Format::eD16Unorm: + case vk::Format::eX8D24UnormPack32: case vk::Format::eD32Sfloat: return vk::ImageAspectFlagBits::eDepth; break; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index b8791dbab..8c9a72a97 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -11,6 +11,33 @@ namespace 
Vulkan { +vk::Format ToVkFormat(VideoCore::PixelFormat format) { + switch (format) { + case VideoCore::PixelFormat::RGBA8: + return vk::Format::eR8G8B8A8Unorm; + case VideoCore::PixelFormat::RGB8: + return vk::Format::eB8G8R8Unorm; + case VideoCore::PixelFormat::RGB5A1: + return vk::Format::eR5G5B5A1UnormPack16; + case VideoCore::PixelFormat::RGB565: + return vk::Format::eR5G6B5UnormPack16; + case VideoCore::PixelFormat::RGBA4: + return vk::Format::eR4G4B4A4UnormPack16; + case VideoCore::PixelFormat::D16: + return vk::Format::eD16Unorm; + case VideoCore::PixelFormat::D24: + return vk::Format::eX8D24UnormPack32; + case VideoCore::PixelFormat::D24S8: + return vk::Format::eD24UnormS8Uint; + case VideoCore::PixelFormat::Invalid: + LOG_ERROR(Render_Vulkan, "Unknown texture format {}!", format); + return vk::Format::eUndefined; + default: + // Use default case for the texture formats + return vk::Format::eR8G8B8A8Unorm; + } +} + Instance::Instance(Frontend::EmuWindow& window) { auto window_info = window.GetWindowInfo(); @@ -54,6 +81,7 @@ Instance::Instance(Frontend::EmuWindow& window) { device_properties = physical_device.getProperties(); CreateDevice(); + CreateFormatTable(); } Instance::~Instance() { @@ -64,50 +92,99 @@ Instance::~Instance() { instance.destroy(); } -bool Instance::IsFormatSupported(vk::Format format, vk::FormatFeatureFlags usage) const { - static std::unordered_map supported; - if (auto it = supported.find(format); it != supported.end()) { - return (it->second.optimalTilingFeatures & usage) == usage; +FormatTraits Instance::GetTraits(VideoCore::PixelFormat pixel_format) const { + if (pixel_format == VideoCore::PixelFormat::Invalid) [[unlikely]] { + return FormatTraits{}; } - // Cache format properties so we don't have to query the driver all the time - const vk::FormatProperties properties = physical_device.getFormatProperties(format); - supported.insert(std::make_pair(format, properties)); - - return (properties.optimalTilingFeatures & usage) == 
usage; + const u32 index = static_cast(pixel_format); + return format_table[index]; } -vk::Format Instance::GetFormatAlternative(vk::Format format) const { - if (format == vk::Format::eUndefined) { - return format; - } +void Instance::CreateFormatTable() { + constexpr std::array pixel_formats = { + VideoCore::PixelFormat::RGBA8, + VideoCore::PixelFormat::RGB8, + VideoCore::PixelFormat::RGB5A1, + VideoCore::PixelFormat::RGB565, + VideoCore::PixelFormat::RGBA4, + VideoCore::PixelFormat::IA8, + VideoCore::PixelFormat::RG8, + VideoCore::PixelFormat::I8, + VideoCore::PixelFormat::A8, + VideoCore::PixelFormat::IA4, + VideoCore::PixelFormat::I4, + VideoCore::PixelFormat::A4, + VideoCore::PixelFormat::ETC1, + VideoCore::PixelFormat::ETC1A4, + VideoCore::PixelFormat::D16, + VideoCore::PixelFormat::D24, + VideoCore::PixelFormat::D24S8 + }; - vk::FormatFeatureFlags features = GetFormatFeatures(GetImageAspect(format)); - if (IsFormatSupported(format, features)) { - return format; - } + const vk::FormatFeatureFlags storage_usage = vk::FormatFeatureFlagBits::eStorageImage; + const vk::FormatFeatureFlags blit_usage = vk::FormatFeatureFlagBits::eSampledImage | + vk::FormatFeatureFlagBits::eTransferDst | + vk::FormatFeatureFlagBits::eTransferSrc | + vk::FormatFeatureFlagBits::eBlitSrc | + vk::FormatFeatureFlagBits::eBlitDst; - // Return the most supported alternative format preferably with the - // same block size according to the Vulkan spec. - // See 43.3. 
Required Format Support of the Vulkan spec - switch (format) { - case vk::Format::eD24UnormS8Uint: - return vk::Format::eD32SfloatS8Uint; - case vk::Format::eX8D24UnormPack32: - return vk::Format::eD32Sfloat; - case vk::Format::eR5G5B5A1UnormPack16: - return vk::Format::eA1R5G5B5UnormPack16; - case vk::Format::eR8G8B8Unorm: - return vk::Format::eR8G8B8A8Unorm; - case vk::Format::eUndefined: - return vk::Format::eUndefined; - case vk::Format::eR4G4B4A4UnormPack16: - // B4G4R4A4 is not guaranteed by the spec to support attachments - return GetFormatAlternative(vk::Format::eB4G4R4A4UnormPack16); - default: - LOG_WARNING(Render_Vulkan, "Format {} doesn't support attachments, falling back to RGBA8", - vk::to_string(format)); - return vk::Format::eR8G8B8A8Unorm; + for (const auto& pixel_format : pixel_formats) { + const vk::Format format = ToVkFormat(pixel_format); + const vk::FormatProperties properties = physical_device.getFormatProperties(format); + const vk::ImageAspectFlags aspect = GetImageAspect(format); + + const vk::FormatFeatureFlagBits attachment_usage = (aspect & vk::ImageAspectFlagBits::eDepth) ? + vk::FormatFeatureFlagBits::eDepthStencilAttachment : + vk::FormatFeatureFlagBits::eColorAttachment; + + const bool supports_blit = + (properties.optimalTilingFeatures & blit_usage) == blit_usage; + const bool supports_attachment = + (properties.optimalTilingFeatures & attachment_usage) == attachment_usage; + const bool supports_storage = + (properties.optimalTilingFeatures & storage_usage) == storage_usage; + + // Find the most inclusive usage flags for this format + vk::ImageUsageFlags best_usage; + if (supports_blit) { + best_usage |= vk::ImageUsageFlagBits::eSampled | + vk::ImageUsageFlagBits::eTransferDst | + vk::ImageUsageFlagBits::eTransferSrc; + } + if (supports_attachment) { + best_usage |= (aspect & vk::ImageAspectFlagBits::eDepth) ? 
+ vk::ImageUsageFlagBits::eDepthStencilAttachment : + vk::ImageUsageFlagBits::eColorAttachment; + } + if (supports_storage) { + best_usage |= vk::ImageUsageFlagBits::eStorage; + } + + // Always fall back to RGBA8 or D32(S8) for convenience + vk::Format fallback = vk::Format::eR8G8B8A8Unorm; + if (aspect & vk::ImageAspectFlagBits::eDepth) { + fallback = vk::Format::eD32Sfloat; + if (aspect & vk::ImageAspectFlagBits::eStencil) { + fallback = vk::Format::eD32SfloatS8Uint; + } + } + + // Report completely unsupported formats + if (!supports_blit && !supports_attachment && !supports_storage) { + LOG_WARNING(Render_Vulkan, "Format {} unsupported, falling back unconditionally to {}", + vk::to_string(format), vk::to_string(fallback)); + } + + const u32 index = static_cast(pixel_format); + format_table[index] = FormatTraits{ + .blit_support = supports_blit, + .attachment_support = supports_attachment, + .storage_support = supports_storage, + .usage = best_usage, + .native = format, + .fallback = fallback + }; } } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 3297e3a31..fec584608 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -4,8 +4,9 @@ #pragma once -#include -#include "common/common_types.h" +#include +#include +#include "video_core/rasterizer_cache/pixel_format.h" #include "video_core/renderer_vulkan/vk_common.h" namespace Frontend { @@ -14,17 +15,23 @@ class EmuWindow; namespace Vulkan { +struct FormatTraits { + bool blit_support = false; ///< True if the format supports omnidirectional blit operations + bool attachment_support = false; ///< True if the format supports being used as an attachment + bool storage_support = false; ///< True if the format supports storage operations + vk::ImageUsageFlags usage{}; ///< Most supported usage for the native format + vk::Format native = vk::Format::eUndefined; ///< Closest possible native format + 
vk::Format fallback = vk::Format::eUndefined; ///< Best fallback format +}; + /// The global Vulkan instance class Instance { public: Instance(Frontend::EmuWindow& window); ~Instance(); - /// Returns true when the format supports the provided feature flags - bool IsFormatSupported(vk::Format format, vk::FormatFeatureFlags usage) const; - - /// Returns the most compatible format that supports the provided feature flags - vk::Format GetFormatAlternative(vk::Format format) const; + /// Returns the FormatTraits struct for the provided pixel format + FormatTraits GetTraits(VideoCore::PixelFormat pixel_format) const; /// Returns the Vulkan instance vk::Instance GetInstance() const { @@ -103,6 +110,12 @@ public: } private: + /// Returns the optimal supported usage for the requested format + vk::FormatFeatureFlags GetFormatFeatures(vk::Format format); + + /// Creates the format compatibility table for the current device + void CreateFormatTable(); + /// Creates the logical device opportunistically enabling extensions bool CreateDevice(); @@ -118,9 +131,9 @@ private: VmaAllocator allocator; vk::Queue present_queue; vk::Queue graphics_queue; + std::array format_table; u32 present_queue_family_index = 0; u32 graphics_queue_family_index = 0; - bool timeline_semaphores = false; bool extended_dynamic_state = false; bool push_descriptors = false; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 69da9797b..575e8f780 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -180,13 +180,6 @@ PipelineCache::~PipelineCache() { void PipelineCache::BindPipeline(const PipelineInfo& info) { ApplyDynamic(info); - // When texture downloads occur the runtime will flush the GPU and cause - // a scheduler slot switch behind our back. This might invalidate any - // cached descriptor sets/require pipeline rebinding. 
- if (timestamp != scheduler.GetHostFenceCounter()) { - MarkDirty(); - } - u64 shader_hash = 0; for (u32 i = 0; i < MAX_SHADER_STAGES; i++) { shader_hash = Common::HashCombine(shader_hash, shader_hashes[i]); @@ -313,7 +306,6 @@ void PipelineCache::SetScissor(s32 x, s32 y, u32 width, u32 height) { void PipelineCache::MarkDirty() { descriptor_dirty.fill(true); current_pipeline = VK_NULL_HANDLE; - timestamp = scheduler.GetHostFenceCounter(); } void PipelineCache::ApplyDynamic(const PipelineInfo& info) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 46cd21be8..23456f77e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -248,7 +248,6 @@ private: std::array update_data{}; std::array descriptor_dirty{}; std::array descriptor_sets; - u64 timestamp = 0; // Bound shader modules enum ProgramType : u32 { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d958de7f6..5b190669f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -94,9 +94,9 @@ constexpr VertexLayout RasterizerVulkan::HardwareVertex::GetVertexLayout() { } constexpr u32 VERTEX_BUFFER_SIZE = 128 * 1024 * 1024; -constexpr u32 INDEX_BUFFER_SIZE = 2 * 1024 * 1024; -constexpr u32 UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024; -constexpr u32 TEXTURE_BUFFER_SIZE = 2 * 1024 * 1024; +constexpr u32 INDEX_BUFFER_SIZE = 8 * 1024 * 1024; +constexpr u32 UNIFORM_BUFFER_SIZE = 16 * 1024 * 1024; +constexpr u32 TEXTURE_BUFFER_SIZE = 16 * 1024 * 1024; constexpr std::array TEXTURE_BUFFER_LF_FORMATS = { vk::Format::eR32G32Sfloat @@ -188,6 +188,7 @@ RasterizerVulkan::~RasterizerVulkan() { vmaDestroyImage(allocator, default_texture.image, default_texture.allocation); device.destroyImageView(default_texture.image_view); + 
device.destroyImageView(default_texture.base_view); device.destroySampler(default_sampler); } @@ -598,12 +599,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { surfaces_rect.bottom, surfaces_rect.top)) }; - // Sync the viewport - pipeline_cache.SetViewport(surfaces_rect.left + viewport_rect_unscaled.left * res_scale, - surfaces_rect.bottom + viewport_rect_unscaled.bottom * res_scale, - viewport_rect_unscaled.GetWidth() * res_scale, - viewport_rect_unscaled.GetHeight() * res_scale); - if (uniform_block_data.data.framebuffer_scale != res_scale) { uniform_block_data.data.framebuffer_scale = res_scale; uniform_block_data.dirty = true; @@ -678,8 +673,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } }; - vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); - // Sync and bind the texture surfaces const auto pica_textures = regs.texturing.GetTextures(); for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { @@ -725,7 +718,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto surface = res_cache.GetTextureCube(config); if (surface != nullptr) { - runtime.Transition(command_buffer, surface->alloc, + runtime.Transition(scheduler.GetRenderCommandBuffer(), surface->alloc, vk::ImageLayout::eShaderReadOnlyOptimal, 0, surface->alloc.levels, 0, 6); pipeline_cache.BindTexture(3, surface->alloc.image_view); @@ -746,7 +739,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto surface = res_cache.GetTextureSurface(texture); if (surface != nullptr) { - runtime.Transition(command_buffer, surface->alloc, + runtime.Transition(scheduler.GetRenderCommandBuffer(), surface->alloc, vk::ImageLayout::eShaderReadOnlyOptimal, 0, surface->alloc.levels); CheckBarrier(surface->alloc.image_view, texture_index); @@ -767,6 +760,15 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } } + // NOTE: From here onwards its a safe zone to set the draw state, doing 
that any earlier will cause + // issues as the rasterizer cache might cause a scheduler switch and invalidate our state + + // Sync the viewport + pipeline_cache.SetViewport(surfaces_rect.left + viewport_rect_unscaled.left * res_scale, + surfaces_rect.bottom + viewport_rect_unscaled.bottom * res_scale, + viewport_rect_unscaled.GetWidth() * res_scale, + viewport_rect_unscaled.GetHeight() * res_scale); + // Sync and bind the shader if (shader_dirty) { SetShader(); @@ -786,8 +788,8 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { auto valid_surface = color_surface ? color_surface : depth_surface; const FramebufferInfo framebuffer_info = { - .color = color_surface ? color_surface->alloc.image_view : VK_NULL_HANDLE, - .depth = depth_surface ? depth_surface->alloc.image_view : VK_NULL_HANDLE, + .color = color_surface ? color_surface->GetFramebufferView() : VK_NULL_HANDLE, + .depth = depth_surface ? depth_surface->GetFramebufferView() : VK_NULL_HANDLE, .renderpass = renderpass_cache.GetRenderpass(pipeline_info.color_attachment, pipeline_info.depth_attachment, false), .width = valid_surface->GetScaledWidth(), @@ -799,6 +801,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { it->second = CreateFramebuffer(framebuffer_info); } + vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); if (color_surface) { runtime.Transition(command_buffer, color_surface->alloc, vk::ImageLayout::eColorAttachmentOptimal, @@ -1509,15 +1512,9 @@ bool RasterizerVulkan::AccelerateTextureCopy(const GPU::Regs::DisplayTransferCon const bool load_gap = output_gap != 0; auto [dst_surface, dst_rect] = res_cache.GetSurfaceSubRect(dst_params, VideoCore::ScaleMatch::Upscale, load_gap); - if (dst_surface == nullptr) { - return false; - } - if (dst_surface->type == VideoCore::SurfaceType::Texture) { - return false; - } - - if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { + if (!dst_surface || dst_surface->type == 
VideoCore::SurfaceType::Texture || + !res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { return false; } diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp index 5f7a3a7b0..fd9d96af7 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp @@ -10,24 +10,23 @@ namespace Vulkan { -vk::Format ToVkFormatColor(u32 index) { +VideoCore::PixelFormat ToFormatColor(u32 index) { switch (index) { - case 0: return vk::Format::eR8G8B8A8Unorm; - case 1: return vk::Format::eR8G8B8Unorm; - case 2: return vk::Format::eR5G5B5A1UnormPack16; - case 3: return vk::Format::eR5G6B5UnormPack16; - case 4: return vk::Format::eR4G4B4A4UnormPack16; - default: return vk::Format::eUndefined; + case 0: return VideoCore::PixelFormat::RGBA8; + case 1: return VideoCore::PixelFormat::RGB8; + case 2: return VideoCore::PixelFormat::RGB5A1; + case 3: return VideoCore::PixelFormat::RGB565; + case 4: return VideoCore::PixelFormat::RGBA4; + default: return VideoCore::PixelFormat::Invalid; } } -vk::Format ToVkFormatDepth(u32 index) { +VideoCore::PixelFormat ToFormatDepth(u32 index) { switch (index) { - case 0: return vk::Format::eD16Unorm; - case 1: return vk::Format::eX8D24UnormPack32; - // Notice the similar gap in PixelFormat - case 3: return vk::Format::eD24UnormS8Uint; - default: return vk::Format::eUndefined; + case 0: return VideoCore::PixelFormat::D16; + case 1: return VideoCore::PixelFormat::D24; + case 3: return VideoCore::PixelFormat::D24S8; + default: return VideoCore::PixelFormat::Invalid; } } @@ -36,21 +35,23 @@ RenderpassCache::RenderpassCache(const Instance& instance, TaskScheduler& schedu // Pre-create all needed renderpasses by the renderer for (u32 color = 0; color <= MAX_COLOR_FORMATS; color++) { for (u32 depth = 0; depth <= MAX_DEPTH_FORMATS; depth++) { - const vk::Format color_format = - 
instance.GetFormatAlternative(ToVkFormatColor(color)); - const vk::Format depth_stencil_format = - instance.GetFormatAlternative(ToVkFormatDepth(depth)); + const FormatTraits color_traits = instance.GetTraits(ToFormatColor(color)); + const FormatTraits depth_traits = instance.GetTraits(ToFormatDepth(depth)); - if (color_format == vk::Format::eUndefined && - depth_stencil_format == vk::Format::eUndefined) { + const vk::Format color_format = + color_traits.attachment_support ? color_traits.native : color_traits.fallback; + const vk::Format depth_format = + depth_traits.attachment_support ? depth_traits.native : depth_traits.fallback; + + if (color_format == vk::Format::eUndefined && depth_format == vk::Format::eUndefined) { continue; } - cached_renderpasses[color][depth][0] = CreateRenderPass(color_format, depth_stencil_format, + cached_renderpasses[color][depth][0] = CreateRenderPass(color_format, depth_format, vk::AttachmentLoadOp::eLoad, vk::ImageLayout::eColorAttachmentOptimal, vk::ImageLayout::eColorAttachmentOptimal); - cached_renderpasses[color][depth][1] = CreateRenderPass(color_format, depth_stencil_format, + cached_renderpasses[color][depth][1] = CreateRenderPass(color_format, depth_format, vk::AttachmentLoadOp::eClear, vk::ImageLayout::eColorAttachmentOptimal, vk::ImageLayout::eColorAttachmentOptimal); diff --git a/src/video_core/renderer_vulkan/vk_task_scheduler.cpp b/src/video_core/renderer_vulkan/vk_task_scheduler.cpp index 08b3f7b96..dc596991c 100644 --- a/src/video_core/renderer_vulkan/vk_task_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_task_scheduler.cpp @@ -5,12 +5,14 @@ #define VULKAN_HPP_NO_CONSTRUCTORS #include "common/assert.h" #include "common/logging/log.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include "video_core/renderer_vulkan/vk_instance.h" namespace Vulkan { -TaskScheduler::TaskScheduler(const Instance& instance) : instance{instance} { 
+TaskScheduler::TaskScheduler(const Instance& instance, RendererVulkan& renderer) + : instance{instance}, renderer{renderer} { vk::Device device = instance.GetDevice(); const vk::CommandPoolCreateInfo command_pool_info = { .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer, @@ -97,11 +99,7 @@ void TaskScheduler::Synchronize(u32 slot) { const auto& command = commands[slot]; vk::Device device = instance.GetDevice(); - u32 completed_counter = completed_fence_counter; - if (instance.IsTimelineSemaphoreSupported()) { - completed_counter = device.getSemaphoreCounterValue(timeline); - } - + u32 completed_counter = GetFenceCounter(); if (command.fence_counter > completed_counter) { if (instance.IsTimelineSemaphoreSupported()) { const vk::SemaphoreWaitInfo wait_info = { @@ -127,6 +125,10 @@ void TaskScheduler::Synchronize(u32 slot) { } void TaskScheduler::Submit(SubmitMode mode) { + if (False(mode & SubmitMode::Shutdown)) { + renderer.FlushBuffers(); + } + const auto& command = commands[current_command]; command.render_command_buffer.end(); if (command.use_upload_buffer) { @@ -206,6 +208,7 @@ void TaskScheduler::Submit(SubmitMode mode) { // Switch to next cmdbuffer. 
if (False(mode & SubmitMode::Shutdown)) { SwitchSlot(); + renderer.OnSlotSwitch(); } } diff --git a/src/video_core/renderer_vulkan/vk_task_scheduler.h b/src/video_core/renderer_vulkan/vk_task_scheduler.h index 6cfb7cead..da3ab5860 100644 --- a/src/video_core/renderer_vulkan/vk_task_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_task_scheduler.h @@ -15,6 +15,7 @@ namespace Vulkan { class Buffer; class Instance; +class RendererVulkan; enum class SubmitMode : u8 { SwapchainSynced = 1 << 0, ///< Synchronizes command buffer execution with the swapchain @@ -26,7 +27,7 @@ DECLARE_ENUM_FLAG_OPERATORS(SubmitMode); class TaskScheduler { public: - TaskScheduler(const Instance& instance); + TaskScheduler(const Instance& instance, RendererVulkan& renderer); ~TaskScheduler(); /// Blocks the host until the current command completes execution @@ -74,6 +75,7 @@ private: private: const Instance& instance; + RendererVulkan& renderer; u64 next_fence_counter = 1; u64 completed_fence_counter = 0; diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 7556eac7e..cacfcf651 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -8,36 +8,10 @@ #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_runtime.h" +#include namespace Vulkan { -vk::Format ToVkFormat(VideoCore::PixelFormat format) { - switch (format) { - case VideoCore::PixelFormat::RGBA8: - return vk::Format::eR8G8B8A8Unorm; - case VideoCore::PixelFormat::RGB8: - return vk::Format::eR8G8B8Unorm; - case VideoCore::PixelFormat::RGB5A1: - return vk::Format::eR5G5B5A1UnormPack16; - case VideoCore::PixelFormat::RGB565: - return vk::Format::eR5G6B5UnormPack16; - case VideoCore::PixelFormat::RGBA4: - return vk::Format::eR4G4B4A4UnormPack16; - case VideoCore::PixelFormat::D16: - 
return vk::Format::eD16Unorm; - case VideoCore::PixelFormat::D24: - return vk::Format::eX8D24UnormPack32; - case VideoCore::PixelFormat::D24S8: - return vk::Format::eD24UnormS8Uint; - case VideoCore::PixelFormat::Invalid: - LOG_ERROR(Render_Vulkan, "Unknown texture format {}!", format); - return vk::Format::eUndefined; - default: - // Use default case for the texture formats - return vk::Format::eR8G8B8A8Unorm; - } -} - vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) { switch (type) { case VideoCore::SurfaceType::Color: @@ -55,23 +29,7 @@ vk::ImageAspectFlags ToVkAspect(VideoCore::SurfaceType type) { return vk::ImageAspectFlagBits::eColor; } -vk::FormatFeatureFlagBits ToVkFormatFeatures(VideoCore::SurfaceType type) { - switch (type) { - case VideoCore::SurfaceType::Color: - case VideoCore::SurfaceType::Texture: - case VideoCore::SurfaceType::Fill: - return vk::FormatFeatureFlagBits::eColorAttachment; - case VideoCore::SurfaceType::Depth: - case VideoCore::SurfaceType::DepthStencil: - return vk::FormatFeatureFlagBits::eDepthStencilAttachment; - default: - UNREACHABLE_MSG("Invalid surface type!"); - } - - return vk::FormatFeatureFlagBits::eColorAttachment; -} - -constexpr u32 STAGING_BUFFER_SIZE = 16 * 1024 * 1024; +constexpr u32 STAGING_BUFFER_SIZE = 64 * 1024 * 1024; TextureRuntime::TextureRuntime(const Instance& instance, TaskScheduler& scheduler, RenderpassCache& renderpass_cache) @@ -92,6 +50,7 @@ TextureRuntime::~TextureRuntime() { for (const auto& [key, alloc] : texture_recycler) { vmaDestroyImage(allocator, alloc.image, alloc.allocation); device.destroyImageView(alloc.image_view); + device.destroyImageView(alloc.base_view); } for (const auto& [key, framebuffer] : clear_framebuffers) { @@ -118,6 +77,10 @@ StagingData TextureRuntime::FindStaging(u32 size, bool upload) { }; } +void TextureRuntime::Finish() { + scheduler.Submit(SubmitMode::Flush); +} + void TextureRuntime::OnSlotSwitch(u32 new_slot) { staging_offsets[new_slot] = 0; } @@ -140,9 
+103,12 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma return alloc; } - // Create a new allocation - vk::Format vk_format = instance.GetFormatAlternative(ToVkFormat(format)); - vk::ImageAspectFlags aspect = GetImageAspect(vk_format); + const FormatTraits traits = instance.GetTraits(format); + const vk::ImageAspectFlags aspect = ToVkAspect(VideoCore::GetFormatType(format)); + + const bool is_suitable = traits.blit_support && traits.attachment_support; + const vk::Format vk_format = is_suitable ? traits.native : traits.fallback; + const vk::ImageUsageFlags vk_usage = is_suitable ? traits.usage : GetImageUsage(aspect); const u32 levels = std::bit_width(std::max(width, height)); const vk::ImageCreateInfo image_info = { @@ -155,7 +121,7 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma .mipLevels = levels, .arrayLayers = layers, .samples = vk::SampleCountFlagBits::e1, - .usage = GetImageUsage(aspect), + .usage = vk_usage }; const VmaAllocationCreateInfo alloc_info = { @@ -174,8 +140,23 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma } vk::Image image = vk::Image{unsafe_image}; - const vk::ImageViewCreateInfo view_info = { + .image = image, + .viewType = type == VideoCore::TextureType::CubeMap ? + vk::ImageViewType::eCube : + vk::ImageViewType::e2D, + .format = vk_format, + .subresourceRange = { + .aspectMask = aspect, + .baseMipLevel = 0, + .levelCount = levels, + .baseArrayLayer = 0, + .layerCount = layers + } + }; + + // Also create a base mip view in case this is used as an attachment + const vk::ImageViewCreateInfo base_view_info = { .image = image, .viewType = type == VideoCore::TextureType::CubeMap ? 
vk::ImageViewType::eCube : @@ -192,13 +173,17 @@ ImageAlloc TextureRuntime::Allocate(u32 width, u32 height, VideoCore::PixelForma vk::Device device = instance.GetDevice(); vk::ImageView image_view = device.createImageView(view_info); + vk::ImageView base_view = device.createImageView(base_view_info); return ImageAlloc{ .image = image, .image_view = image_view, + .base_view = base_view, .allocation = allocation, + .format = vk_format, .aspect = aspect, .levels = levels, + .layers = layers }; } @@ -206,21 +191,45 @@ void TextureRuntime::Recycle(const VideoCore::HostTextureTag tag, ImageAlloc&& a texture_recycler.emplace(tag, std::move(alloc)); } -void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload, +void TextureRuntime::FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest) { - const VideoCore::SurfaceType type = VideoCore::GetFormatType(format); - const vk::FormatFeatureFlagBits feature = ToVkFormatFeatures(type); - - if (format == VideoCore::PixelFormat::RGBA8) { - return Pica::Texture::ConvertABGRToRGBA(source, dest); - } else if (format == VideoCore::PixelFormat::RGB8 && upload) { - return Pica::Texture::ConvertBGRToRGBA(source, dest); - } else if (instance.IsFormatSupported(ToVkFormat(format), feature)) { - std::memcpy(dest.data(), source.data(), source.size()); - } else { - LOG_CRITICAL(Render_Vulkan, "Unimplemented converion for format {}!", format); + if (!surface.NeedsConvert()) { std::memcpy(dest.data(), source.data(), source.size()); + return; } + + // Since this is the most common case handle it separately + if (surface.pixel_format == VideoCore::PixelFormat::RGBA8) { + return Pica::Texture::ConvertABGRToRGBA(source, dest); + } + + // Handle simple D24S8 interleave case + if (surface.GetInternalFormat() == vk::Format::eD24UnormS8Uint) { + return Pica::Texture::InterleaveD24S8(source, dest); + } + + if (upload) { + switch (surface.pixel_format) { + case VideoCore::PixelFormat::RGB8: + return 
Pica::Texture::ConvertBGRToRGBA(source, dest); + case VideoCore::PixelFormat::RGBA4: + return Pica::Texture::ConvertRGBA4ToRGBA8(source, dest); + default: + break; + } + } else { + switch (surface.pixel_format) { + case VideoCore::PixelFormat::D24S8: + return Pica::Texture::ConvertD32S8ToD24S8(source, dest); + case VideoCore::PixelFormat::RGBA4: + return Pica::Texture::ConvertRGBA8ToRGBA4(source, dest); + default: + break; + } + } + + LOG_WARNING(Render_Vulkan, "Missing format convertion: {} {} {}", + vk::to_string(surface.traits.native), upload ? "->" : "<-", vk::to_string(surface.alloc.format)); } bool TextureRuntime::ClearTexture(Surface& surface, const VideoCore::TextureClear& clear, @@ -276,11 +285,12 @@ bool TextureRuntime::ClearTexture(Surface& surface, const VideoCore::TextureClea } auto [it, new_framebuffer] = clear_framebuffers.try_emplace(alloc.image_view, vk::Framebuffer{}); - if (new_framebuffer) { + if (new_framebuffer) {\ + const vk::ImageView framebuffer_view = surface.GetFramebufferView(); const vk::FramebufferCreateInfo framebuffer_info = { .renderPass = clear_renderpass, .attachmentCount = 1, - .pAttachments = &alloc.image_view, + .pAttachments = &framebuffer_view, .width = surface.GetScaledWidth(), .height = surface.GetScaledHeight(), .layers = 1 @@ -377,7 +387,7 @@ bool TextureRuntime::BlitTextures(Surface& source, Surface& dest, const VideoCor command_buffer.blitImage(source.alloc.image, vk::ImageLayout::eTransferSrcOptimal, dest.alloc.image, vk::ImageLayout::eTransferDstOptimal, - blit_area, vk::Filter::eLinear); + blit_area, vk::Filter::eNearest); return true; } @@ -528,7 +538,7 @@ void TextureRuntime::Transition(vk::CommandBuffer command_buffer, ImageAlloc& al Surface::Surface(VideoCore::SurfaceParams& params, TextureRuntime& runtime) : VideoCore::SurfaceBase{params}, runtime{runtime}, instance{runtime.GetInstance()}, - scheduler{runtime.GetScheduler()} { + scheduler{runtime.GetScheduler()}, traits{instance.GetTraits(pixel_format)} { if 
(pixel_format != VideoCore::PixelFormat::Invalid) { alloc = runtime.Allocate(GetScaledWidth(), GetScaledHeight(), params.pixel_format, texture_type); @@ -604,7 +614,7 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); const VideoCore::Rect2D rect = download.texture_rect; vk::BufferImageCopy copy_region = { - .bufferOffset = staging.buffer_offset, + .bufferOffset = staging.buffer_offset + download.buffer_offset, .bufferRowLength = rect.GetWidth(), .bufferImageHeight = rect.GetHeight(), .imageSubresource = { @@ -624,20 +634,17 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi copy_regions[region_count++] = copy_region; if (alloc.aspect & vk::ImageAspectFlagBits::eStencil) { - return; // HACK: Skip depth + stencil downloads for now - copy_region.bufferOffset += staging.mapped.size(); - copy_region.imageSubresource.aspectMask |= vk::ImageAspectFlagBits::eStencil; + copy_region.bufferOffset += 4 * staging.size / 5; + copy_region.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eStencil; copy_regions[region_count++] = copy_region; } } - runtime.Transition(command_buffer, alloc, vk::ImageLayout::eTransferSrcOptimal, download.texture_level, 1); + runtime.Transition(command_buffer, alloc, vk::ImageLayout::eTransferSrcOptimal, 0, alloc.levels); // Copy pixel data to the staging buffer command_buffer.copyImageToBuffer(alloc.image, vk::ImageLayout::eTransferSrcOptimal, staging.buffer, region_count, copy_regions.data()); - - scheduler.Submit(SubmitMode::Flush); } // Lock this data until the next scheduler switch @@ -645,6 +652,22 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download, const Stagi runtime.staging_offsets[current_slot] += staging.size; } +bool Surface::NeedsConvert() const { + // RGBA8 needs a byteswap since R8G8B8A8UnormPack32 does not exist + // D24S8 always needs an interleave pass even if natively 
supported + return alloc.format != traits.native || + pixel_format == VideoCore::PixelFormat::RGBA8 || + pixel_format == VideoCore::PixelFormat::D24S8; +} + +u32 Surface::GetInternalBytesPerPixel() const { + if (alloc.format == vk::Format::eD32SfloatS8Uint) { + return 8; + } + + return vk::blockSize(alloc.format); +} + void Surface::ScaledDownload(const VideoCore::BufferTextureCopy& download) { /*const u32 rect_width = download.texture_rect.GetWidth(); const u32 rect_height = download.texture_rect.GetHeight(); diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.h b/src/video_core/renderer_vulkan/vk_texture_runtime.h index 93d6dde70..60bcb9a09 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.h +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.h @@ -10,6 +10,7 @@ #include "video_core/rasterizer_cache/surface_base.h" #include "video_core/rasterizer_cache/types.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" namespace Vulkan { @@ -24,10 +25,14 @@ struct StagingData { struct ImageAlloc { vk::Image image; vk::ImageView image_view; + vk::ImageView base_view; VmaAllocation allocation; + vk::ImageUsageFlags usage; + vk::Format format; vk::ImageLayout layout = vk::ImageLayout::eUndefined; vk::ImageAspectFlags aspect = vk::ImageAspectFlagBits::eNone; u32 levels = 1; + u32 layers = 1; }; class Instance; @@ -48,6 +53,9 @@ public: /// Maps an internal staging buffer of the provided size of pixel uploads/downloads [[nodiscard]] StagingData FindStaging(u32 size, bool upload); + /// Causes a GPU command flush + void Finish(); + /// Allocates a vulkan image possibly resusing an existing one [[nodiscard]] ImageAlloc Allocate(u32 width, u32 height, VideoCore::PixelFormat format, VideoCore::TextureType type); @@ -56,7 +64,7 @@ public: void Recycle(const VideoCore::HostTextureTag tag, ImageAlloc&& alloc); /// Performs required format 
convertions on the staging data - void FormatConvert(VideoCore::PixelFormat format, bool upload, + void FormatConvert(const Surface& surface, bool upload, std::span source, std::span dest); /// Transitions the mip level range of the surface to new_layout @@ -114,6 +122,27 @@ public: /// Downloads pixel data to staging from a rectangle region of the surface texture void Download(const VideoCore::BufferTextureCopy& download, const StagingData& staging); + /// Returns true if the surface requires pixel data convertion + bool NeedsConvert() const; + + /// Returns the bpp of the internal surface format + u32 GetInternalBytesPerPixel() const; + + /// Returns an image view used to sample the surface from a shader + vk::ImageView GetImageView() const { + return alloc.image_view; + } + + /// Returns an image view used to create a framebuffer + vk::ImageView GetFramebufferView() { + return alloc.base_view; + } + + /// Returns the internal format of the allocated texture + vk::Format GetInternalFormat() const { + return alloc.format; + } + private: /// Downloads scaled image by downscaling the requested rectangle void ScaledDownload(const VideoCore::BufferTextureCopy& download); @@ -128,9 +157,8 @@ private: TextureRuntime& runtime; const Instance& instance; TaskScheduler& scheduler; - ImageAlloc alloc{}; - vk::Format internal_format = vk::Format::eUndefined; + FormatTraits traits; }; struct Traits { diff --git a/src/video_core/texture/texture_decode.cpp b/src/video_core/texture/texture_decode.cpp index 7981feefd..6f306aa80 100644 --- a/src/video_core/texture/texture_decode.cpp +++ b/src/video_core/texture/texture_decode.cpp @@ -233,21 +233,67 @@ void ConvertBGRToRGB(std::span source, std::span des void ConvertBGRToRGBA(std::span source, std::span dest) { u32 j = 0; - for (std::size_t i = 0; i < source.size(); i += 3) { - dest[j] = source[i + 2]; - dest[j + 1] = source[i + 1]; - dest[j + 2] = source[i]; - dest[j + 3] = std::byte{0xFF}; - j += 4; + for (std::size_t i = 0; i < 
dest.size(); i += 4) { + dest[i] = source[j + 2]; + dest[i + 1] = source[j + 1]; + dest[i + 2] = source[j]; + dest[i + 3] = std::byte{0xFF}; + j += 3; } } void ConvertABGRToRGBA(std::span source, std::span dest) { - for (u32 i = 0; i < source.size(); i += 4) { + for (u32 i = 0; i < dest.size(); i += 4) { const u32 abgr = *reinterpret_cast(source.data() + i); const u32 rgba = Common::swap32(abgr); std::memcpy(dest.data() + i, &rgba, 4); } } +void ConvertD32S8ToD24S8(std::span source, std::span dest) { + u32 depth_offset = 0; + u32 stencil_offset = 4 * source.size() / 5; + for (std::size_t i = 0; i < dest.size(); i += 4) { + float depth; + std::memcpy(&depth, source.data() + depth_offset, sizeof(float)); + u32 depth_uint = depth * 0xFFFFFF; + + dest[i] = source[stencil_offset]; + std::memcpy(dest.data() + i + 1, &depth_uint, 3); + + depth_offset += 4; + stencil_offset += 1; + } +} + +void ConvertRGBA4ToRGBA8(std::span source, std::span dest) { + u32 j = 0; + for (std::size_t i = 0; i < dest.size(); i += 4) { + auto rgba = Color::DecodeRGBA4(reinterpret_cast(source.data() + j)); + std::memcpy(dest.data() + i, rgba.AsArray(), sizeof(rgba)); + j += 2; + } +} + +void ConvertRGBA8ToRGBA4(std::span source, std::span dest) { + u32 j = 0; + for (std::size_t i = 0; i < dest.size(); i += 2) { + Common::Vec4 rgba; + std::memcpy(rgba.AsArray(), source.data() + j, sizeof(rgba)); + Color::EncodeRGBA4(rgba, reinterpret_cast(dest.data() + i)); + j += 4; + } +} + +void InterleaveD24S8(std::span source, std::span dest) { + u32 depth_offset = 0; + u32 stencil_offset = 3 * source.size() / 4; + for (std::size_t i = 0; i < dest.size(); i += 4) { + dest[i] = source[stencil_offset]; + std::memcpy(dest.data() + i + 1, source.data() + depth_offset, 3); + depth_offset += 3; + stencil_offset += 1; + } +} + } // namespace Pica::Texture diff --git a/src/video_core/texture/texture_decode.h b/src/video_core/texture/texture_decode.h index d8b8238c7..c3967435d 100644 --- 
a/src/video_core/texture/texture_decode.h +++ b/src/video_core/texture/texture_decode.h @@ -80,4 +80,12 @@ void ConvertBGRToRGBA(std::span source, std::span de */ void ConvertABGRToRGBA(std::span source, std::span dest); +void ConvertD32S8ToD24S8(std::span source, std::span dest); + +void ConvertRGBA4ToRGBA8(std::span source, std::span dest); + +void ConvertRGBA8ToRGBA4(std::span source, std::span dest); + +void InterleaveD24S8(std::span source, std::span dest); + } // namespace Pica::Texture