From 8936641841d9b200e28726525b0cc2f31ca939d2 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 26 Sep 2022 21:07:21 +0300 Subject: [PATCH] renderer_vulkan: Rewrite stream buffer + other fixes * Emulate blend color and clip planes correctly * Don't hack the depth in the vertex shader, use VK_EXT_depth_clip_control for now to set the range to -1, 1 * Rewrite the stream buffer to remove flickering problems. The new implementation doesn't try to be smart about holding memory. It divides the allocation in SCHEDULER_COMMAND_COUNT buckets and automatically switches between them based on the current slot index --- src/video_core/pica.cpp | 2 +- .../renderer_vulkan/renderer_vulkan.cpp | 2 +- src/video_core/renderer_vulkan/vk_common.h | 4 + .../renderer_vulkan/vk_instance.cpp | 26 +++-- .../renderer_vulkan/vk_pipeline_cache.cpp | 6 + .../renderer_vulkan/vk_rasterizer.cpp | 15 +-- .../renderer_vulkan/vk_shader_gen.cpp | 13 ++- .../renderer_vulkan/vk_stream_buffer.cpp | 103 ++++-------------- .../renderer_vulkan/vk_stream_buffer.h | 21 ++-- .../renderer_vulkan/vk_task_scheduler.h | 2 - .../renderer_vulkan/vk_texture_runtime.cpp | 4 +- src/video_core/shader/shader_uniforms.h | 1 + 12 files changed, 82 insertions(+), 117 deletions(-) diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index d2f8874fe..88382d904 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -40,7 +40,7 @@ void Zero(T& o) { State::State() : geometry_pipeline(*this) { auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) { using Pica::Shader::OutputVertex; - auto AddTriangle = [this](const OutputVertex& v0, const OutputVertex& v1, + auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 1711c8996..43f30deec 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -151,7 +151,7 @@ struct ScreenRectVertex { Common::Vec2f tex_coord; }; -constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 64; +constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 8192; /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left diff --git a/src/video_core/renderer_vulkan/vk_common.h b/src/video_core/renderer_vulkan/vk_common.h index 34d30a98e..ff01bb2f2 100644 --- a/src/video_core/renderer_vulkan/vk_common.h +++ b/src/video_core/renderer_vulkan/vk_common.h @@ -4,6 +4,8 @@ #pragma once +#include "common/common_types.h" + // Include vulkan-hpp header #define VK_NO_PROTOTYPES 1 #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 @@ -17,6 +19,8 @@ namespace Vulkan { +constexpr u32 SCHEDULER_COMMAND_COUNT = 4; + /// Return the image aspect associated on the provided format constexpr vk::ImageAspectFlags GetImageAspect(vk::Format format) { switch (format) { diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 47db4880b..b8791dbab 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -148,6 +148,7 @@ bool Instance::CreateDevice() { }; AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + AddExtension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); timeline_semaphores = AddExtension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); extended_dynamic_state = AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); push_descriptors = AddExtension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); @@ -159,27 +160,31 @@ bool Instance::CreateDevice() { return false; } - graphics_queue_family_index = -1; - present_queue_family_index = -1; - for (int i = 0; i < family_properties.size(); i++) { + bool graphics_queue_found = false; + bool present_queue_found = false; + for (std::size_t i = 0; i < family_properties.size(); i++) { // Check if queue supports graphics + const u32 index = static_cast(i); if (family_properties[i].queueFlags & vk::QueueFlagBits::eGraphics) { - graphics_queue_family_index = i; + graphics_queue_family_index = index; + graphics_queue_found = true; // If this queue also supports presentation we are finished - if (physical_device.getSurfaceSupportKHR(i, surface)) { - present_queue_family_index = i; + if (physical_device.getSurfaceSupportKHR(static_cast(i), surface)) { + present_queue_family_index = index; + present_queue_found = true; break; } } // Check if queue supports presentation - if (physical_device.getSurfaceSupportKHR(i, surface)) { - present_queue_family_index = i; + if (physical_device.getSurfaceSupportKHR(index, surface)) { + present_queue_family_index = index; + present_queue_found = true; } } - if (graphics_queue_family_index == -1 || present_queue_family_index == -1) { + if (!graphics_queue_found || !present_queue_found) { LOG_CRITICAL(Render_Vulkan, "Unable to find graphics and/or present queues."); return false; } @@ -221,6 +226,9 @@ bool Instance::CreateDevice() { .shaderClipDistance = available.shaderClipDistance } }, + vk::PhysicalDeviceDepthClipControlFeaturesEXT{ + .depthClipControl = true + }, feature_chain.get(), feature_chain.get() }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 449faea0d..207eab4f0 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -483,7 +483,12 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) { .extent = {1, 1} }; + vk::PipelineViewportDepthClipControlCreateInfoEXT depth_clip_control = { + .negativeOneToOne = true + }; + const vk::PipelineViewportStateCreateInfo viewport_info = { + .pNext = &depth_clip_control, .viewportCount = 1, .pViewports = &viewport, .scissorCount = 1, @@ -497,6 +502,7 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) { vk::DynamicState::eStencilCompareMask, vk::DynamicState::eStencilWriteMask, vk::DynamicState::eStencilReference, + vk::DynamicState::eBlendConstants, // VK_EXT_extended_dynamic_state vk::DynamicState::eCullModeEXT, vk::DynamicState::eDepthCompareOpEXT, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e2f925037..c66a4c419 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -859,6 +859,8 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { } } + renderpass_cache.ExitRenderpass(); + vertex_batch.clear(); // Mark framebuffer surfaces as dirty @@ -1631,7 +1633,7 @@ void RasterizerVulkan::SetShader() { } void RasterizerVulkan::SyncClipEnabled() { - //state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0; + uniform_block_data.data.enable_clip1 = Pica::g_state.regs.rasterizer.clip_enable != 0; } void RasterizerVulkan::SyncClipCoef() { @@ -1689,12 +1691,11 @@ void RasterizerVulkan::SyncBlendFuncs() { } void RasterizerVulkan::SyncBlendColor() { - /*auto blend_color = - PicaToGL::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw); - state.blend.color.red = blend_color[0]; - state.blend.color.green = blend_color[1]; - state.blend.color.blue = blend_color[2]; - state.blend.color.alpha = blend_color[3];*/ + auto blend_color = + PicaToVK::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw); + + vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer(); + command_buffer.setBlendConstants(blend_color.AsArray()); } void RasterizerVulkan::SyncFogColor() { diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.cpp b/src/video_core/renderer_vulkan/vk_shader_gen.cpp index 3069b095e..2dc39f41a 100644 --- a/src/video_core/renderer_vulkan/vk_shader_gen.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_gen.cpp @@ -55,6 +55,7 @@ layout (set = 0, binding = 1, std140) uniform shader_data { int proctex_diff_lut_offset; float proctex_bias; int shadow_texture_bias; + bool enable_clip1; ivec4 lighting_lut_offset[NUM_LIGHTING_SAMPLERS / 4]; vec3 fog_color; vec2 proctex_noise_f; @@ -89,9 +90,7 @@ static std::string GetVertexInterfaceDeclaration(bool is_output) { out += R"( out gl_PerVertex { vec4 gl_Position; -#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) float gl_ClipDistance[2]; -#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) }; )"; } @@ -1568,11 +1567,13 @@ void main() { normquat = vert_normquat; view = vert_view; gl_Position = vert_position; - gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0; -#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) + gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0 - gl_ClipDistance[1] = dot(clip_coef, vert_position); -#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) + if (enable_clip1) { + gl_ClipDistance[1] = dot(clip_coef, vert_position); + } else { + gl_ClipDistance[1] = 0; + } } )"; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 010b3b37a..d8f4aee34 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -106,8 +106,8 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler, views[i] = device.createBufferView(view_info); } - available_size = total_size; view_count = view_formats.size(); + bucket_size = size / SCHEDULER_COMMAND_COUNT; } StreamBuffer::~StreamBuffer() { @@ -123,57 +123,38 @@ StreamBuffer::~StreamBuffer() { std::tuple StreamBuffer::Map(u32 size, u32 alignment) { ASSERT(size <= total_size && alignment <= total_size); - if (alignment > 0) { - buffer_offset = Common::AlignUp(buffer_offset, alignment); + const u32 current_bucket = scheduler.GetCurrentSlotIndex(); + auto& bucket = buckets[current_bucket]; + if (bucket.offset + size > bucket_size) { + UNREACHABLE(); } - // Have we run out of available space? bool invalidate = false; - if (available_size < size) { - // If we are at the end of the buffer, start over - if (buffer_offset + size > total_size) { - // Flush any pending writes before looping back - Flush(); - - // Since new regions are discovered based on locks, insert a lock - // that reaches the end of the buffer to avoid having an empty region - const LockedRegion region = { - .size = total_size - buffer_offset, - .fence_counter = scheduler.GetFenceCounter() - }; - - regions.emplace(buffer_offset, region); - - Invalidate(); - invalidate = true; - } - - // Try to garbage collect old regions - if (!UnlockFreeRegions(size)) { - // Nuclear option: stall the GPU to remove all the locks - LOG_WARNING(Render_Vulkan, "Buffer GPU stall"); - Invalidate(); - regions.clear(); - available_size = total_size; - } + if (bucket.invalid) { + invalidate = true; + bucket.invalid = false; } + const u32 buffer_offset = current_bucket * bucket_size + bucket.offset; u8* mapped = reinterpret_cast(staging.mapped.data() + buffer_offset); return std::make_tuple(mapped, buffer_offset, invalidate); + } void StreamBuffer::Commit(u32 size) { - buffer_offset += size; - available_size -= size; + buckets[scheduler.GetCurrentSlotIndex()].offset += size; } void StreamBuffer::Flush() { - const u32 flush_size = buffer_offset - flush_start; + const u32 current_bucket = scheduler.GetCurrentSlotIndex(); + const u32 flush_size = buckets[current_bucket].offset; + ASSERT(flush_size <= bucket_size); + if (flush_size > 0) { vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer(); VmaAllocator allocator = instance.GetAllocator(); - const u32 flush_size = buffer_offset - flush_start; + const u32 flush_start = current_bucket * bucket_size; const vk::BufferCopy copy_region = { .srcOffset = flush_start, .dstOffset = flush_start, @@ -183,14 +164,6 @@ void StreamBuffer::Flush() { vmaFlushAllocation(allocator, allocation, flush_start, flush_size); command_buffer.copyBuffer(staging.buffer, buffer, copy_region); - // Lock the region - const LockedRegion region = { - .size = flush_size, - .fence_counter = scheduler.GetFenceCounter() - }; - - regions.emplace(flush_start, region); - // Add pipeline barrier for the flushed region auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage); const vk::BufferMemoryBarrier buffer_barrier = { @@ -205,45 +178,17 @@ void StreamBuffer::Flush() { command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask, vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {}); - - flush_start = buffer_offset; } + + // Reset the offset of the next bucket + const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT; + buckets[next_bucket].offset = 0; + buckets[next_bucket].invalid = true; } -void StreamBuffer::Invalidate() { - buffer_offset = 0; - flush_start = 0; -} - -bool StreamBuffer::UnlockFreeRegions(u32 target_size) { - available_size = 0; - - // Free regions that don't need waiting - auto it = regions.lower_bound(buffer_offset); - while (it != regions.end()) { - const auto& [offset, region] = *it; - if (region.fence_counter <= scheduler.GetFenceCounter()) { - available_size += region.size; - it = regions.erase(it); - } - else { - break; - } - } - - // If that wasn't enough, try waiting for some fences - while (it != regions.end() && available_size < target_size) { - const auto& [offset, region] = *it; - - if (region.fence_counter > scheduler.GetFenceCounter()) { - scheduler.WaitFence(region.fence_counter); - } - - available_size += region.size; - it = regions.erase(it); - } - - return available_size >= target_size; +u32 StreamBuffer::GetBufferOffset() const { + const u32 current_bucket = scheduler.GetCurrentSlotIndex(); + return current_bucket * bucket_size + buckets[current_bucket].offset; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index cae20ccac..c7aefb1be 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -34,7 +34,7 @@ struct StagingBuffer { class StreamBuffer { public: StreamBuffer(const Instance& instance, TaskScheduler& scheduler, - u32 size, vk::BufferUsageFlagBits usage, std::span views); + u32 size, vk::BufferUsageFlagBits usage, std::span views); ~StreamBuffer(); std::tuple Map(u32 size, u32 alignment = 0); @@ -45,15 +45,14 @@ public: /// Flushes staging memory to the GPU buffer void Flush(); + /// Returns the current buffer offset + u32 GetBufferOffset() const; + /// Returns the Vulkan buffer handle vk::Buffer GetHandle() const { return buffer; } - u32 GetBufferOffset() const { - return buffer_offset; - } - /// Returns an immutable reference to the requested buffer view const vk::BufferView& GetView(u32 index = 0) const { ASSERT(index < view_count); @@ -68,6 +67,12 @@ private: bool UnlockFreeRegions(u32 target_size); private: + struct Bucket { + bool invalid; + u32 fence_counter; + u32 offset; + }; + const Instance& instance; TaskScheduler& scheduler; StagingBuffer staging; @@ -79,10 +84,8 @@ private: std::array views{}; std::size_t view_count = 0; - u32 buffer_offset = 0; - u32 flush_start = 0; - s32 available_size = 0; - std::map regions; + u32 bucket_size = 0; + std::array buckets{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_task_scheduler.h b/src/video_core/renderer_vulkan/vk_task_scheduler.h index a4530c9b8..1fde47f65 100644 --- a/src/video_core/renderer_vulkan/vk_task_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_task_scheduler.h @@ -13,8 +13,6 @@ namespace Vulkan { -constexpr u32 SCHEDULER_COMMAND_COUNT = 4; - class Buffer; class Instance; diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 155ddfe3f..7556eac7e 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -215,13 +215,11 @@ void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload, return Pica::Texture::ConvertABGRToRGBA(source, dest); } else if (format == VideoCore::PixelFormat::RGB8 && upload) { return Pica::Texture::ConvertBGRToRGBA(source, dest); - } else if (format == VideoCore::PixelFormat::D24S8 && !upload) { - return; // HACK: Skip depth download } else if (instance.IsFormatSupported(ToVkFormat(format), feature)) { std::memcpy(dest.data(), source.data(), source.size()); } else { LOG_CRITICAL(Render_Vulkan, "Unimplemented converion for format {}!", format); - UNREACHABLE(); + std::memcpy(dest.data(), source.data(), source.size()); } } diff --git a/src/video_core/shader/shader_uniforms.h b/src/video_core/shader/shader_uniforms.h index db8179985..b1fcac362 100644 --- a/src/video_core/shader/shader_uniforms.h +++ b/src/video_core/shader/shader_uniforms.h @@ -53,6 +53,7 @@ struct UniformData { int proctex_diff_lut_offset; float proctex_bias; int shadow_texture_bias; + bool enable_clip1; alignas(16) Common::Vec4i lighting_lut_offset[LightingRegs::NumLightingSampler / 4]; alignas(16) Common::Vec3f fog_color; alignas(8) Common::Vec2f proctex_noise_f;