renderer_vulkan: Rewrite stream buffer + other fixes

* Emulate blend color and clip planes correctly (sketched below)

* Don't hack the depth range in the vertex shader; for now, use VK_EXT_depth_clip_control to set the clip range to [-1, 1] (sketched below)

* Rewrite the stream buffer to fix flickering problems. The new implementation doesn't try to be smart about holding memory: it divides the allocation into SCHEDULER_COMMAND_COUNT buckets and switches between them based on the scheduler's current slot index (sketched below)
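The blend color is applied through Vulkan dynamic state instead of being baked into pipelines. A minimal sketch of the pattern, using the same names as the PipelineCache::BuildPipeline and RasterizerVulkan::SyncBlendColor diffs below:

// eBlendConstants is listed once among the pipeline's dynamic states,
// so SyncBlendColor() can update the constant without rebuilding pipelines.
const vk::DynamicState dynamic_states[] = {
    vk::DynamicState::eStencilReference,
    vk::DynamicState::eBlendConstants, // newly added by this commit
};

// Whenever the PICA blend constant register changes:
auto blend_color =
    PicaToVK::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw);
vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
command_buffer.setBlendConstants(blend_color.AsArray());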
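For the depth range, a condensed sketch of how VK_EXT_depth_clip_control is wired up: the feature struct joins the device creation pNext chain next to AddExtension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME), and every pipeline opts into the [-1, 1] clip volume. Details are in the Instance::CreateDevice and PipelineCache::BuildPipeline diffs below.

// Device creation: request the feature.
vk::PhysicalDeviceDepthClipControlFeaturesEXT depth_clip_feature = {
    .depthClipControl = true
};

// Pipeline creation: ask for an OpenGL-style [-1, 1] depth clip volume,
// which matches what the generated PICA vertex shader outputs.
vk::PipelineViewportDepthClipControlCreateInfoEXT depth_clip_control = {
    .negativeOneToOne = true
};
const vk::PipelineViewportStateCreateInfo viewport_info = {
    .pNext = &depth_clip_control,
    .viewportCount = 1,
    .scissorCount = 1,
};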
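And a minimal sketch of the bucket scheme itself. BucketedStream and its parameters are illustrative; the real StreamBuffer additionally manages the staging copy, pipeline barriers and buffer views, and GetCurrentSlotIndex comes from the TaskScheduler.

#include <array>
#include <cassert>
#include <cstdint>

constexpr std::uint32_t SCHEDULER_COMMAND_COUNT = 4;

// One bucket per scheduler slot. A slot is only recycled after its fence
// has been waited on, so by then its bucket is safe to overwrite.
struct Bucket {
    bool invalid = false;     // set when the bucket is recycled; contents must be re-uploaded
    std::uint32_t offset = 0; // write cursor within this bucket
};

class BucketedStream {
public:
    explicit BucketedStream(std::uint32_t total_size)
        : bucket_size{total_size / SCHEDULER_COMMAND_COUNT} {}

    // Global byte offset for the next write in the current slot's bucket.
    std::uint32_t Map(std::uint32_t slot, std::uint32_t size) const {
        // A single request larger than a bucket is a usage error
        // (UNREACHABLE() in the real implementation).
        assert(buckets[slot].offset + size <= bucket_size);
        return slot * bucket_size + buckets[slot].offset;
    }

    // Advance the write cursor after the caller has filled the mapped region.
    void Commit(std::uint32_t slot, std::uint32_t size) {
        buckets[slot].offset += size;
    }

    // At flush/submit time: prepare the next slot's bucket for reuse.
    void Flush(std::uint32_t slot) {
        const std::uint32_t next = (slot + 1) % SCHEDULER_COMMAND_COUNT;
        buckets[next].offset = 0;
        buckets[next].invalid = true;
    }

private:
    std::uint32_t bucket_size;
    std::array<Bucket, SCHEDULER_COMMAND_COUNT> buckets{};
};

Because the CPU only ever writes into the bucket owned by the slot currently being recorded, it can no longer overwrite data that a previous, still-executing submission is reading, which is what caused the flickering.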
GPUCode
2022-09-26 21:07:21 +03:00
parent e1f6b88e7b
commit 8936641841
12 changed files with 82 additions and 117 deletions

View File

@@ -40,7 +40,7 @@ void Zero(T& o) {
 State::State() : geometry_pipeline(*this) {
     auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
         using Pica::Shader::OutputVertex;
-        auto AddTriangle = [this](const OutputVertex& v0, const OutputVertex& v1,
+        auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
                               const OutputVertex& v2) {
             VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
         };

View File

@@ -151,7 +151,7 @@ struct ScreenRectVertex {
     Common::Vec2f tex_coord;
 };

-constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 64;
+constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 8192;

 /**
  * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left

View File

@@ -4,6 +4,8 @@
 #pragma once

+#include "common/common_types.h"
+
 // Include vulkan-hpp header
 #define VK_NO_PROTOTYPES 1
 #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
@@ -17,6 +19,8 @@
 namespace Vulkan {

+constexpr u32 SCHEDULER_COMMAND_COUNT = 4;
+
 /// Return the image aspect associated on the provided format
 constexpr vk::ImageAspectFlags GetImageAspect(vk::Format format) {
     switch (format) {

View File

@@ -148,6 +148,7 @@ bool Instance::CreateDevice() {
     };

     AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+    AddExtension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
     timeline_semaphores = AddExtension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
     extended_dynamic_state = AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME);
     push_descriptors = AddExtension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
@@ -159,27 +160,31 @@ bool Instance::CreateDevice() {
         return false;
     }

-    graphics_queue_family_index = -1;
-    present_queue_family_index = -1;
-    for (int i = 0; i < family_properties.size(); i++) {
+    bool graphics_queue_found = false;
+    bool present_queue_found = false;
+    for (std::size_t i = 0; i < family_properties.size(); i++) {
+        const u32 index = static_cast<u32>(i);
         // Check if queue supports graphics
         if (family_properties[i].queueFlags & vk::QueueFlagBits::eGraphics) {
-            graphics_queue_family_index = i;
+            graphics_queue_family_index = index;
+            graphics_queue_found = true;

             // If this queue also supports presentation we are finished
-            if (physical_device.getSurfaceSupportKHR(i, surface)) {
-                present_queue_family_index = i;
+            if (physical_device.getSurfaceSupportKHR(static_cast<u32>(i), surface)) {
+                present_queue_family_index = index;
+                present_queue_found = true;
                 break;
             }
         }

         // Check if queue supports presentation
-        if (physical_device.getSurfaceSupportKHR(i, surface)) {
-            present_queue_family_index = i;
+        if (physical_device.getSurfaceSupportKHR(index, surface)) {
+            present_queue_family_index = index;
+            present_queue_found = true;
         }
     }

-    if (graphics_queue_family_index == -1 || present_queue_family_index == -1) {
+    if (!graphics_queue_found || !present_queue_found) {
         LOG_CRITICAL(Render_Vulkan, "Unable to find graphics and/or present queues.");
         return false;
     }
@@ -221,6 +226,9 @@ bool Instance::CreateDevice() {
             .shaderClipDistance = available.shaderClipDistance
         }
     },
+    vk::PhysicalDeviceDepthClipControlFeaturesEXT{
+        .depthClipControl = true
+    },
     feature_chain.get<vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT>(),
     feature_chain.get<vk::PhysicalDeviceTimelineSemaphoreFeaturesKHR>()
 };

View File

@@ -483,7 +483,12 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) {
         .extent = {1, 1}
     };

+    vk::PipelineViewportDepthClipControlCreateInfoEXT depth_clip_control = {
+        .negativeOneToOne = true
+    };
+
     const vk::PipelineViewportStateCreateInfo viewport_info = {
+        .pNext = &depth_clip_control,
         .viewportCount = 1,
         .pViewports = &viewport,
         .scissorCount = 1,
@@ -497,6 +502,7 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) {
         vk::DynamicState::eStencilCompareMask,
         vk::DynamicState::eStencilWriteMask,
         vk::DynamicState::eStencilReference,
+        vk::DynamicState::eBlendConstants,
         // VK_EXT_extended_dynamic_state
         vk::DynamicState::eCullModeEXT,
         vk::DynamicState::eDepthCompareOpEXT,

View File

@@ -859,6 +859,8 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) {
         }
     }

+    renderpass_cache.ExitRenderpass();
+
     vertex_batch.clear();

     // Mark framebuffer surfaces as dirty
@@ -1631,7 +1633,7 @@ void RasterizerVulkan::SetShader() {
 }

 void RasterizerVulkan::SyncClipEnabled() {
-    //state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
+    uniform_block_data.data.enable_clip1 = Pica::g_state.regs.rasterizer.clip_enable != 0;
 }

 void RasterizerVulkan::SyncClipCoef() {
@@ -1689,12 +1691,11 @@ void RasterizerVulkan::SyncBlendFuncs() {
 }

 void RasterizerVulkan::SyncBlendColor() {
-    /*auto blend_color =
-        PicaToGL::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw);
-    state.blend.color.red = blend_color[0];
-    state.blend.color.green = blend_color[1];
-    state.blend.color.blue = blend_color[2];
-    state.blend.color.alpha = blend_color[3];*/
+    auto blend_color =
+        PicaToVK::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw);
+
+    vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
+    command_buffer.setBlendConstants(blend_color.AsArray());
 }

 void RasterizerVulkan::SyncFogColor() {

View File

@@ -55,6 +55,7 @@ layout (set = 0, binding = 1, std140) uniform shader_data {
     int proctex_diff_lut_offset;
     float proctex_bias;
     int shadow_texture_bias;
+    bool enable_clip1;
     ivec4 lighting_lut_offset[NUM_LIGHTING_SAMPLERS / 4];
     vec3 fog_color;
     vec2 proctex_noise_f;
@@ -89,9 +90,7 @@ static std::string GetVertexInterfaceDeclaration(bool is_output) {
         out += R"(
 out gl_PerVertex {
     vec4 gl_Position;
-#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
     float gl_ClipDistance[2];
-#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
 };
 )";
     }
@@ -1568,11 +1567,13 @@ void main() {
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vert_position;
-    gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0;
-#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
     gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
-    gl_ClipDistance[1] = dot(clip_coef, vert_position);
-#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
+    if (enable_clip1) {
+        gl_ClipDistance[1] = dot(clip_coef, vert_position);
+    } else {
+        gl_ClipDistance[1] = 0;
+    }
 }
 )";

View File

@@ -106,8 +106,8 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler,
         views[i] = device.createBufferView(view_info);
     }

-    available_size = total_size;
     view_count = view_formats.size();
+    bucket_size = size / SCHEDULER_COMMAND_COUNT;
 }

 StreamBuffer::~StreamBuffer() {
@@ -123,57 +123,38 @@ StreamBuffer::~StreamBuffer() {
 std::tuple<u8*, u32, bool> StreamBuffer::Map(u32 size, u32 alignment) {
     ASSERT(size <= total_size && alignment <= total_size);

-    if (alignment > 0) {
-        buffer_offset = Common::AlignUp(buffer_offset, alignment);
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    auto& bucket = buckets[current_bucket];
+    if (bucket.offset + size > bucket_size) {
+        UNREACHABLE();
     }

-    // Have we run out of available space?
     bool invalidate = false;
-    if (available_size < size) {
-        // If we are at the end of the buffer, start over
-        if (buffer_offset + size > total_size) {
-            // Flush any pending writes before looping back
-            Flush();
-
-            // Since new regions are discovered based on locks, insert a lock
-            // that reaches the end of the buffer to avoid having an empty region
-            const LockedRegion region = {
-                .size = total_size - buffer_offset,
-                .fence_counter = scheduler.GetFenceCounter()
-            };
-            regions.emplace(buffer_offset, region);
-
-            Invalidate();
-            invalidate = true;
-        }
-
-        // Try to garbage collect old regions
-        if (!UnlockFreeRegions(size)) {
-            // Nuclear option: stall the GPU to remove all the locks
-            LOG_WARNING(Render_Vulkan, "Buffer GPU stall");
-            Invalidate();
-            regions.clear();
-            available_size = total_size;
-        }
+    if (bucket.invalid) {
+        invalidate = true;
+        bucket.invalid = false;
     }

+    const u32 buffer_offset = current_bucket * bucket_size + bucket.offset;
     u8* mapped = reinterpret_cast<u8*>(staging.mapped.data() + buffer_offset);
     return std::make_tuple(mapped, buffer_offset, invalidate);
 }

 void StreamBuffer::Commit(u32 size) {
-    buffer_offset += size;
-    available_size -= size;
+    buckets[scheduler.GetCurrentSlotIndex()].offset += size;
 }

 void StreamBuffer::Flush() {
-    const u32 flush_size = buffer_offset - flush_start;
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_size = buckets[current_bucket].offset;
+    ASSERT(flush_size <= bucket_size);

     if (flush_size > 0) {
         vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
         VmaAllocator allocator = instance.GetAllocator();
-        const u32 flush_size = buffer_offset - flush_start;
+        const u32 flush_start = current_bucket * bucket_size;

         const vk::BufferCopy copy_region = {
             .srcOffset = flush_start,
             .dstOffset = flush_start,
@@ -183,14 +164,6 @@ void StreamBuffer::Flush() {
         vmaFlushAllocation(allocator, allocation, flush_start, flush_size);
         command_buffer.copyBuffer(staging.buffer, buffer, copy_region);

-        // Lock the region
-        const LockedRegion region = {
-            .size = flush_size,
-            .fence_counter = scheduler.GetFenceCounter()
-        };
-        regions.emplace(flush_start, region);
-
         // Add pipeline barrier for the flushed region
         auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
         const vk::BufferMemoryBarrier buffer_barrier = {
@@ -205,45 +178,17 @@ void StreamBuffer::Flush() {
         command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
                                        vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {});
-
-        flush_start = buffer_offset;
     }
+
+    // Reset the offset of the next bucket
+    const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT;
+    buckets[next_bucket].offset = 0;
+    buckets[next_bucket].invalid = true;
 }

-void StreamBuffer::Invalidate() {
-    buffer_offset = 0;
-    flush_start = 0;
-}
-
-bool StreamBuffer::UnlockFreeRegions(u32 target_size) {
-    available_size = 0;
-
-    // Free regions that don't need waiting
-    auto it = regions.lower_bound(buffer_offset);
-    while (it != regions.end()) {
-        const auto& [offset, region] = *it;
-        if (region.fence_counter <= scheduler.GetFenceCounter()) {
-            available_size += region.size;
-            it = regions.erase(it);
-        } else {
-            break;
-        }
-    }
-
-    // If that wasn't enough, try waiting for some fences
-    while (it != regions.end() && available_size < target_size) {
-        const auto& [offset, region] = *it;
-        if (region.fence_counter > scheduler.GetFenceCounter()) {
-            scheduler.WaitFence(region.fence_counter);
-        }
-
-        available_size += region.size;
-        it = regions.erase(it);
-    }
-
-    return available_size >= target_size;
-}
+u32 StreamBuffer::GetBufferOffset() const {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    return current_bucket * bucket_size + buckets[current_bucket].offset;
+}

 } // namespace Vulkan

View File

@@ -34,7 +34,7 @@ struct StagingBuffer {
 class StreamBuffer {
 public:
     StreamBuffer(const Instance& instance, TaskScheduler& scheduler,
                  u32 size, vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
     ~StreamBuffer();

     std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0);
@@ -45,15 +45,14 @@ public:
     /// Flushes staging memory to the GPU buffer
     void Flush();

+    /// Returns the current buffer offset
+    u32 GetBufferOffset() const;
+
     /// Returns the Vulkan buffer handle
     vk::Buffer GetHandle() const {
         return buffer;
     }

-    u32 GetBufferOffset() const {
-        return buffer_offset;
-    }
-
     /// Returns an immutable reference to the requested buffer view
     const vk::BufferView& GetView(u32 index = 0) const {
         ASSERT(index < view_count);
@@ -68,6 +67,12 @@ private:
     bool UnlockFreeRegions(u32 target_size);

 private:
+    struct Bucket {
+        bool invalid;
+        u32 fence_counter;
+        u32 offset;
+    };
+
     const Instance& instance;
     TaskScheduler& scheduler;
     StagingBuffer staging;
@@ -79,10 +84,8 @@ private:
     std::array<vk::BufferView, MAX_BUFFER_VIEWS> views{};
     std::size_t view_count = 0;
-    u32 buffer_offset = 0;
-    u32 flush_start = 0;
-    s32 available_size = 0;
-    std::map<u32, LockedRegion> regions;
+    u32 bucket_size = 0;
+    std::array<Bucket, SCHEDULER_COMMAND_COUNT> buckets{};
 };

 } // namespace Vulkan

View File

@@ -13,8 +13,6 @@
 namespace Vulkan {

-constexpr u32 SCHEDULER_COMMAND_COUNT = 4;
-
 class Buffer;
 class Instance;

View File

@@ -215,13 +215,11 @@ void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload,
         return Pica::Texture::ConvertABGRToRGBA(source, dest);
     } else if (format == VideoCore::PixelFormat::RGB8 && upload) {
         return Pica::Texture::ConvertBGRToRGBA(source, dest);
-    } else if (format == VideoCore::PixelFormat::D24S8 && !upload) {
-        return; // HACK: Skip depth download
     } else if (instance.IsFormatSupported(ToVkFormat(format), feature)) {
         std::memcpy(dest.data(), source.data(), source.size());
     } else {
         LOG_CRITICAL(Render_Vulkan, "Unimplemented converion for format {}!", format);
-        UNREACHABLE();
+        std::memcpy(dest.data(), source.data(), source.size());
     }
 }

View File

@@ -53,6 +53,7 @@ struct UniformData {
     int proctex_diff_lut_offset;
     float proctex_bias;
     int shadow_texture_bias;
+    bool enable_clip1;
     alignas(16) Common::Vec4i lighting_lut_offset[LightingRegs::NumLightingSampler / 4];
     alignas(16) Common::Vec3f fog_color;
     alignas(8) Common::Vec2f proctex_noise_f;