renderer_vulkan: Rewrite stream buffer + other fixes

* Emulate blend color and clip planes correctly

* Don't hack the depth range in the vertex shader; use VK_EXT_depth_clip_control for now to set the clip volume range to [-1, 1]

* Rewrite the stream buffer to remove flickering problems. The new implementation doesn't try to be smart about holding memory. It divides the allocation into SCHEDULER_COMMAND_COUNT buckets and automatically switches between them based on the current slot index (sketched below)
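
For illustration, here is a minimal standalone sketch of the bucket scheme described above. This is not the renderer code: BucketStreamBuffer and OnSlotSwitch are invented names, and the GPU buffer is modeled as plain host memory. The only assumption carried over from the commit is that the scheduler cycles through SCHEDULER_COMMAND_COUNT slots and reuses a slot only after its previous GPU work has completed, so a bucket can be recycled wholesale instead of tracking locked regions.

// Minimal sketch, not the actual renderer code: the GPU buffer is modeled as
// plain host memory and the scheduler slot index is passed in by the caller.
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

constexpr std::uint32_t SCHEDULER_COMMAND_COUNT = 4;

class BucketStreamBuffer {
public:
    explicit BucketStreamBuffer(std::uint32_t total_size)
        : memory(total_size), bucket_size{total_size / SCHEDULER_COMMAND_COUNT} {}

    // Hands out a pointer inside the bucket that belongs to the given slot.
    std::uint8_t* Map(std::uint32_t slot, std::uint32_t size) {
        assert(buckets[slot].offset + size <= bucket_size); // no eviction, just fail
        return memory.data() + slot * bucket_size + buckets[slot].offset;
    }

    // Advances the write cursor after the caller has filled the mapped range.
    void Commit(std::uint32_t slot, std::uint32_t size) {
        buckets[slot].offset += size;
    }

    // When the scheduler advances to the next slot, its bucket is recycled
    // wholesale; per-region locks and fence waits become unnecessary.
    void OnSlotSwitch(std::uint32_t next_slot) {
        buckets[next_slot].offset = 0;
    }

private:
    struct Bucket {
        std::uint32_t offset = 0;
    };
    std::vector<std::uint8_t> memory;
    std::uint32_t bucket_size = 0;
    std::array<Bucket, SCHEDULER_COMMAND_COUNT> buckets{};
};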
GPUCode
2022-09-26 21:07:21 +03:00
parent e1f6b88e7b
commit 8936641841
12 changed files with 82 additions and 117 deletions

View File

@@ -40,7 +40,7 @@ void Zero(T& o) {
 State::State() : geometry_pipeline(*this) {
     auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
         using Pica::Shader::OutputVertex;
-        auto AddTriangle = [this](const OutputVertex& v0, const OutputVertex& v1,
+        auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
                                   const OutputVertex& v2) {
             VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
         };

View File

@@ -151,7 +151,7 @@ struct ScreenRectVertex {
     Common::Vec2f tex_coord;
 };

-constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 64;
+constexpr u32 VERTEX_BUFFER_SIZE = sizeof(ScreenRectVertex) * 8192;

 /**
  * Defines a 1:1 pixel orthographic projection matrix with (0,0) on the top-left

View File

@@ -4,6 +4,8 @@

 #pragma once

+#include "common/common_types.h"
+
 // Include vulkan-hpp header
 #define VK_NO_PROTOTYPES 1
 #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
@@ -17,6 +19,8 @@
 namespace Vulkan {

+constexpr u32 SCHEDULER_COMMAND_COUNT = 4;
+
 /// Return the image aspect associated with the provided format
 constexpr vk::ImageAspectFlags GetImageAspect(vk::Format format) {
     switch (format) {

View File

@@ -148,6 +148,7 @@ bool Instance::CreateDevice() {
     };

     AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+    AddExtension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
     timeline_semaphores = AddExtension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
     extended_dynamic_state = AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME);
     push_descriptors = AddExtension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
@@ -159,27 +160,31 @@
         return false;
     }

-    graphics_queue_family_index = -1;
-    present_queue_family_index = -1;
-    for (int i = 0; i < family_properties.size(); i++) {
+    bool graphics_queue_found = false;
+    bool present_queue_found = false;
+    for (std::size_t i = 0; i < family_properties.size(); i++) {
         // Check if queue supports graphics
+        const u32 index = static_cast<u32>(i);
         if (family_properties[i].queueFlags & vk::QueueFlagBits::eGraphics) {
-            graphics_queue_family_index = i;
+            graphics_queue_family_index = index;
+            graphics_queue_found = true;

             // If this queue also supports presentation we are finished
-            if (physical_device.getSurfaceSupportKHR(i, surface)) {
-                present_queue_family_index = i;
+            if (physical_device.getSurfaceSupportKHR(static_cast<u32>(i), surface)) {
+                present_queue_family_index = index;
+                present_queue_found = true;
                 break;
             }
         }

         // Check if queue supports presentation
-        if (physical_device.getSurfaceSupportKHR(i, surface)) {
-            present_queue_family_index = i;
+        if (physical_device.getSurfaceSupportKHR(index, surface)) {
+            present_queue_family_index = index;
+            present_queue_found = true;
         }
     }

-    if (graphics_queue_family_index == -1 || present_queue_family_index == -1) {
+    if (!graphics_queue_found || !present_queue_found) {
         LOG_CRITICAL(Render_Vulkan, "Unable to find graphics and/or present queues.");
         return false;
     }
@@ -221,6 +226,9 @@ bool Instance::CreateDevice() {
                 .shaderClipDistance = available.shaderClipDistance
             }
         },
+        vk::PhysicalDeviceDepthClipControlFeaturesEXT{
+            .depthClipControl = true
+        },
         feature_chain.get<vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT>(),
         feature_chain.get<vk::PhysicalDeviceTimelineSemaphoreFeaturesKHR>()
     };

View File

@@ -483,7 +483,12 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) {
         .extent = {1, 1}
     };

+    vk::PipelineViewportDepthClipControlCreateInfoEXT depth_clip_control = {
+        .negativeOneToOne = true
+    };
+
     const vk::PipelineViewportStateCreateInfo viewport_info = {
+        .pNext = &depth_clip_control,
         .viewportCount = 1,
         .pViewports = &viewport,
         .scissorCount = 1,
@@ -497,6 +502,7 @@ vk::Pipeline PipelineCache::BuildPipeline(const PipelineInfo& info) {
         vk::DynamicState::eStencilCompareMask,
         vk::DynamicState::eStencilWriteMask,
         vk::DynamicState::eStencilReference,
+        vk::DynamicState::eBlendConstants,
         // VK_EXT_extended_dynamic_state
         vk::DynamicState::eCullModeEXT,
         vk::DynamicState::eDepthCompareOpEXT,

View File

@@ -859,6 +859,8 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) {
         }
     }

+    renderpass_cache.ExitRenderpass();
+
     vertex_batch.clear();

     // Mark framebuffer surfaces as dirty
@@ -1631,7 +1633,7 @@ void RasterizerVulkan::SetShader() {
 }

 void RasterizerVulkan::SyncClipEnabled() {
-    //state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
+    uniform_block_data.data.enable_clip1 = Pica::g_state.regs.rasterizer.clip_enable != 0;
 }

 void RasterizerVulkan::SyncClipCoef() {
@@ -1689,12 +1691,11 @@ void RasterizerVulkan::SyncBlendFuncs() {
 }

 void RasterizerVulkan::SyncBlendColor() {
-    /*auto blend_color =
-        PicaToGL::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw);
-    state.blend.color.red = blend_color[0];
-    state.blend.color.green = blend_color[1];
-    state.blend.color.blue = blend_color[2];
-    state.blend.color.alpha = blend_color[3];*/
+    auto blend_color =
+        PicaToVK::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw);
+
+    vk::CommandBuffer command_buffer = scheduler.GetRenderCommandBuffer();
+    command_buffer.setBlendConstants(blend_color.AsArray());
 }

 void RasterizerVulkan::SyncFogColor() {

View File

@@ -55,6 +55,7 @@ layout (set = 0, binding = 1, std140) uniform shader_data {
     int proctex_diff_lut_offset;
     float proctex_bias;
     int shadow_texture_bias;
+    bool enable_clip1;
     ivec4 lighting_lut_offset[NUM_LIGHTING_SAMPLERS / 4];
     vec3 fog_color;
     vec2 proctex_noise_f;
@@ -89,9 +90,7 @@ static std::string GetVertexInterfaceDeclaration(bool is_output) {
         out += R"(
 out gl_PerVertex {
     vec4 gl_Position;
-#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
     float gl_ClipDistance[2];
-#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
 };
 )";
     }
@@ -1568,11 +1567,13 @@ void main() {
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vert_position;
-    gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0;
-#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
     gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
-    gl_ClipDistance[1] = dot(clip_coef, vert_position);
-#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance)
+    if (enable_clip1) {
+        gl_ClipDistance[1] = dot(clip_coef, vert_position);
+    } else {
+        gl_ClipDistance[1] = 0;
+    }
 }
 )";

View File

@@ -106,8 +106,8 @@ StreamBuffer::StreamBuffer(const Instance& instance, TaskScheduler& scheduler,
         views[i] = device.createBufferView(view_info);
     }

-    available_size = total_size;
     view_count = view_formats.size();
+    bucket_size = size / SCHEDULER_COMMAND_COUNT;
 }

 StreamBuffer::~StreamBuffer() {
@@ -123,57 +123,38 @@
 std::tuple<u8*, u32, bool> StreamBuffer::Map(u32 size, u32 alignment) {
     ASSERT(size <= total_size && alignment <= total_size);

-    if (alignment > 0) {
-        buffer_offset = Common::AlignUp(buffer_offset, alignment);
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    auto& bucket = buckets[current_bucket];
+    if (bucket.offset + size > bucket_size) {
+        UNREACHABLE();
     }

-    // Have we run out of available space?
     bool invalidate = false;
-    if (available_size < size) {
-        // If we are at the end of the buffer, start over
-        if (buffer_offset + size > total_size) {
-            // Flush any pending writes before looping back
-            Flush();
-
-            // Since new regions are discovered based on locks, insert a lock
-            // that reaches the end of the buffer to avoid having an empty region
-            const LockedRegion region = {
-                .size = total_size - buffer_offset,
-                .fence_counter = scheduler.GetFenceCounter()
-            };
-
-            regions.emplace(buffer_offset, region);
-            Invalidate();
-            invalidate = true;
-        }
-
-        // Try to garbage collect old regions
-        if (!UnlockFreeRegions(size)) {
-            // Nuclear option: stall the GPU to remove all the locks
-            LOG_WARNING(Render_Vulkan, "Buffer GPU stall");
-            Invalidate();
-            regions.clear();
-            available_size = total_size;
-        }
+    if (bucket.invalid) {
+        invalidate = true;
+        bucket.invalid = false;
     }

+    const u32 buffer_offset = current_bucket * bucket_size + bucket.offset;
     u8* mapped = reinterpret_cast<u8*>(staging.mapped.data() + buffer_offset);
     return std::make_tuple(mapped, buffer_offset, invalidate);
 }

 void StreamBuffer::Commit(u32 size) {
-    buffer_offset += size;
-    available_size -= size;
+    buckets[scheduler.GetCurrentSlotIndex()].offset += size;
 }

 void StreamBuffer::Flush() {
-    const u32 flush_size = buffer_offset - flush_start;
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    const u32 flush_size = buckets[current_bucket].offset;
+    ASSERT(flush_size <= bucket_size);

     if (flush_size > 0) {
         vk::CommandBuffer command_buffer = scheduler.GetUploadCommandBuffer();
         VmaAllocator allocator = instance.GetAllocator();
-        const u32 flush_size = buffer_offset - flush_start;
+        const u32 flush_start = current_bucket * bucket_size;
         const vk::BufferCopy copy_region = {
             .srcOffset = flush_start,
             .dstOffset = flush_start,
@@ -183,14 +164,6 @@ void StreamBuffer::Flush() {
         vmaFlushAllocation(allocator, allocation, flush_start, flush_size);
         command_buffer.copyBuffer(staging.buffer, buffer, copy_region);

-        // Lock the region
-        const LockedRegion region = {
-            .size = flush_size,
-            .fence_counter = scheduler.GetFenceCounter()
-        };
-
-        regions.emplace(flush_start, region);
-
         // Add pipeline barrier for the flushed region
         auto [access_mask, stage_mask] = ToVkAccessStageFlags(usage);
         const vk::BufferMemoryBarrier buffer_barrier = {
@@ -205,45 +178,17 @@ void StreamBuffer::Flush() {
         command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, stage_mask,
                                        vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {});
-
-        flush_start = buffer_offset;
     }

+    // Reset the offset of the next bucket
+    const u32 next_bucket = (current_bucket + 1) % SCHEDULER_COMMAND_COUNT;
+    buckets[next_bucket].offset = 0;
+    buckets[next_bucket].invalid = true;
 }

-void StreamBuffer::Invalidate() {
-    buffer_offset = 0;
-    flush_start = 0;
-}
-
-bool StreamBuffer::UnlockFreeRegions(u32 target_size) {
-    available_size = 0;
-
-    // Free regions that don't need waiting
-    auto it = regions.lower_bound(buffer_offset);
-    while (it != regions.end()) {
-        const auto& [offset, region] = *it;
-        if (region.fence_counter <= scheduler.GetFenceCounter()) {
-            available_size += region.size;
-            it = regions.erase(it);
-        }
-        else {
-            break;
-        }
-    }
-
-    // If that wasn't enough, try waiting for some fences
-    while (it != regions.end() && available_size < target_size) {
-        const auto& [offset, region] = *it;
-        if (region.fence_counter > scheduler.GetFenceCounter()) {
-            scheduler.WaitFence(region.fence_counter);
-        }
-
-        available_size += region.size;
-        it = regions.erase(it);
-    }
-
-    return available_size >= target_size;
+u32 StreamBuffer::GetBufferOffset() const {
+    const u32 current_bucket = scheduler.GetCurrentSlotIndex();
+    return current_bucket * bucket_size + buckets[current_bucket].offset;
 }

 } // namespace Vulkan
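
For context, a hypothetical caller of the rewritten interface could look like the following. UploadVertices is an invented helper and the include path is assumed; only the Map/Commit/Flush calls and the (pointer, offset, invalidate) tuple mirror the diff above.

// Hypothetical upload path (illustrative), assuming the repository's
// stream buffer header and common types are available.
#include <cstring>
#include <span>
#include "video_core/renderer_vulkan/vk_stream_buffer.h" // assumed repository header

void UploadVertices(Vulkan::StreamBuffer& stream, std::span<const u8> vertices) {
    const u32 size = static_cast<u32>(vertices.size());

    // Map returns a host pointer into the current slot's bucket, the offset of
    // that range inside the GPU buffer (also exposed via GetBufferOffset), and
    // whether the bucket was recycled; invalidate == true means previously
    // cached offsets into the buffer are stale and must be re-bound.
    auto [ptr, offset, invalidate] = stream.Map(size);
    std::memcpy(ptr, vertices.data(), size);
    stream.Commit(size);

    // At submission time, Flush copies the bucket's written range into the GPU
    // buffer and resets the next bucket so the upcoming slot starts clean.
    stream.Flush();
}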

View File

@@ -34,7 +34,7 @@ struct StagingBuffer {
 class StreamBuffer {
 public:
     StreamBuffer(const Instance& instance, TaskScheduler& scheduler,
-                 u32 size, vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
+                u32 size, vk::BufferUsageFlagBits usage, std::span<const vk::Format> views);
     ~StreamBuffer();

     std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0);
@@ -45,15 +45,14 @@ public:
     /// Flushes staging memory to the GPU buffer
     void Flush();

+    /// Returns the current buffer offset
+    u32 GetBufferOffset() const;
+
     /// Returns the Vulkan buffer handle
     vk::Buffer GetHandle() const {
         return buffer;
     }

-    u32 GetBufferOffset() const {
-        return buffer_offset;
-    }
-
     /// Returns an immutable reference to the requested buffer view
     const vk::BufferView& GetView(u32 index = 0) const {
         ASSERT(index < view_count);
@@ -68,6 +67,12 @@
     bool UnlockFreeRegions(u32 target_size);

 private:
+    struct Bucket {
+        bool invalid;
+        u32 fence_counter;
+        u32 offset;
+    };
+
     const Instance& instance;
     TaskScheduler& scheduler;
     StagingBuffer staging;
@@ -79,10 +84,8 @@
     std::array<vk::BufferView, MAX_BUFFER_VIEWS> views{};
     std::size_t view_count = 0;
-    u32 buffer_offset = 0;
-    u32 flush_start = 0;
-    s32 available_size = 0;
-    std::map<u32, LockedRegion> regions;
+    u32 bucket_size = 0;
+    std::array<Bucket, SCHEDULER_COMMAND_COUNT> buckets{};
 };

 } // namespace Vulkan

View File

@@ -13,8 +13,6 @@
 namespace Vulkan {

-constexpr u32 SCHEDULER_COMMAND_COUNT = 4;
-
 class Buffer;
 class Instance;

View File

@@ -215,13 +215,11 @@ void TextureRuntime::FormatConvert(VideoCore::PixelFormat format, bool upload,
         return Pica::Texture::ConvertABGRToRGBA(source, dest);
     } else if (format == VideoCore::PixelFormat::RGB8 && upload) {
         return Pica::Texture::ConvertBGRToRGBA(source, dest);
+    } else if (format == VideoCore::PixelFormat::D24S8 && !upload) {
+        return; // HACK: Skip depth download
     } else if (instance.IsFormatSupported(ToVkFormat(format), feature)) {
         std::memcpy(dest.data(), source.data(), source.size());
     } else {
-        LOG_CRITICAL(Render_Vulkan, "Unimplemented converion for format {}!", format);
-        UNREACHABLE();
+        std::memcpy(dest.data(), source.data(), source.size());
     }
 }

View File

@@ -53,6 +53,7 @@ struct UniformData {
     int proctex_diff_lut_offset;
     float proctex_bias;
     int shadow_texture_bias;
+    bool enable_clip1;
     alignas(16) Common::Vec4i lighting_lut_offset[LightingRegs::NumLightingSampler / 4];
     alignas(16) Common::Vec3f fog_color;
     alignas(8) Common::Vec2f proctex_noise_f;