renderer_vulkan: Complete hardware shader support

* With these changes all commercial games I tested work fine and get a massive performance boost
This commit is contained in:
GPUCode
2022-10-09 10:07:57 +03:00
parent 51685ee2db
commit 8c5b417486
8 changed files with 103 additions and 62 deletions

View File

@ -252,6 +252,7 @@ bool Instance::CreateDevice() {
AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
AddExtension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
AddExtension(VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME);
timeline_semaphores = AddExtension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
extended_dynamic_state = AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME);
push_descriptors = AddExtension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
@ -322,6 +323,7 @@ bool Instance::CreateDevice() {
.shaderStorageImageMultisample = available.shaderStorageImageMultisample,
.shaderClipDistance = available.shaderClipDistance}},
vk::PhysicalDeviceDepthClipControlFeaturesEXT{.depthClipControl = true},
vk::PhysicalDeviceIndexTypeUint8FeaturesEXT{.indexTypeUint8 = true},
feature_chain.get<vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT>(),
feature_chain.get<vk::PhysicalDeviceTimelineSemaphoreFeaturesKHR>()};

View File

@ -78,33 +78,20 @@ u32 AttribBytes(VertexAttribute attrib) {
}
/// Translates a vertex attribute's component type and component count into a vk::Format.
/// NOTE(review): this span is a diff hunk whose +/- markers were stripped, so two
/// implementations are interleaved below: a nested switch over type and size, and a
/// constexpr lookup table indexed by type and size. Presumably the switch is the removed
/// version and the table is the added one — confirm against the committed file.
vk::Format ToVkAttributeFormat(VertexAttribute attrib) {
// Switch-based variant: only Float (1-4 components) and Ubyte (4 components) are handled
switch (attrib.type) {
case AttribType::Float:
switch (attrib.size) {
case 1:
return vk::Format::eR32Sfloat;
case 2:
return vk::Format::eR32G32Sfloat;
case 3:
return vk::Format::eR32G32B32Sfloat;
case 4:
return vk::Format::eR32G32B32A32Sfloat;
}
case AttribType::Ubyte:
switch (attrib.size) {
case 4:
return vk::Format::eR8G8B8A8Uint;
default:
fmt::print("{}\n", attrib.size.Value());
UNREACHABLE();
}
// Table-based variant: one row per attribute type, one column per component count.
// The row order (32-bit float, 32-bit sint, 16-bit sint, 8-bit sint, 8-bit uint) has to
// match the numeric value of the attribute type, because the lookup at the bottom casts
// the type to u32 and uses it directly as the row index.
constexpr std::array attribute_formats = {
std::array{vk::Format::eR32Sfloat, vk::Format::eR32G32Sfloat, vk::Format::eR32G32B32Sfloat,
vk::Format::eR32G32B32A32Sfloat},
std::array{vk::Format::eR32Sint, vk::Format::eR32G32Sint, vk::Format::eR32G32B32Sint,
vk::Format::eR32G32B32A32Sint},
std::array{vk::Format::eR16Sint, vk::Format::eR16G16Sint, vk::Format::eR16G16B16Sint,
vk::Format::eR16G16B16A16Sint},
std::array{vk::Format::eR8Sint, vk::Format::eR8G8Sint, vk::Format::eR8G8B8Sint,
vk::Format::eR8G8B8A8Sint},
std::array{vk::Format::eR8Uint, vk::Format::eR8G8Uint, vk::Format::eR8G8B8Uint,
vk::Format::eR8G8B8A8Uint}};
default:
LOG_CRITICAL(Render_Vulkan, "Unimplemented vertex attribute type {}", attrib.type.Value());
UNREACHABLE();
}
return vk::Format::eR32Sfloat;
// NOTE(review): the assert only bounds the size from above; a size of 0 would make the
// column index below `0 - 1`. Confirm callers never pass a zero-sized attribute.
ASSERT(attrib.size <= 4);
// Column index is the component count minus one (sizes 1..4 map to columns 0..3)
return attribute_formats[static_cast<u32>(attrib.type.Value())][attrib.size.Value() - 1];
}
vk::ShaderStageFlagBits ToVkShaderStage(std::size_t index) {
@ -197,8 +184,14 @@ void PipelineCache::BindPipeline(const PipelineInfo& info) {
}
bool PipelineCache::UseProgrammableVertexShader(const Pica::Regs& regs,
Pica::Shader::ShaderSetup& setup) {
const PicaVSConfig config{regs.vs, setup};
Pica::Shader::ShaderSetup& setup,
const VertexLayout& layout) {
PicaVSConfig config{regs.vs, setup};
for (u32 i = 0; i < layout.attribute_count; i++) {
const auto& attrib = layout.attributes[i];
config.state.attrib_types[attrib.location.Value()] = attrib.type.Value();
}
auto [handle, result] =
programmable_vertex_shaders.Get(config, setup, vk::ShaderStageFlagBits::eVertex,
instance.GetDevice(), ShaderOptimization::Debug);

View File

@ -22,8 +22,6 @@ constexpr u32 MAX_VERTEX_BINDINGS = 16;
constexpr u32 MAX_DESCRIPTORS = 8;
constexpr u32 MAX_DESCRIPTOR_SETS = 6;
/// Component data type of a vertex attribute.
/// The explicit numeric values matter: ToVkAttributeFormat casts the type to u32 and uses
/// it as the row index of its format table, so the order here must match that table.
enum class AttribType : u32 {
    Float = 0, ///< 32-bit floating point components
    Int = 1,   ///< 32-bit signed integer components
    Short = 2, ///< 16-bit signed integer components
    Byte = 3,  ///< 8-bit signed integer components
    Ubyte = 4, ///< 8-bit unsigned integer components
};
/**
* The pipeline state is tightly packed with bitfields to reduce
* the overhead of hashing as much as possible
@ -154,7 +152,8 @@ public:
void BindPipeline(const PipelineInfo& info);
/// Binds a PICA decompiled vertex shader
bool UseProgrammableVertexShader(const Pica::Regs& regs, Pica::Shader::ShaderSetup& setup);
bool UseProgrammableVertexShader(const Pica::Regs& regs, Pica::Shader::ShaderSetup& setup,
const VertexLayout& layout);
/// Binds a passthrough vertex shader
void UseTrivialVertexShader();

View File

@ -328,7 +328,7 @@ RasterizerVulkan::VertexArrayInfo RasterizerVulkan::AnalyzeVertexArray(bool is_i
void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_min,
u32 vs_input_index_max) {
auto [array_ptr, array_offset, _] = vertex_buffer.Map(vs_input_size, 4);
auto [array_ptr, array_offset, invalidate] = vertex_buffer.Map(vs_input_size, 4);
// The Nintendo 3DS has 12 attribute loaders which are used to tell the GPU
// how to interpret vertex data. The program first sets GPUREG_ATTR_BUF_BASE to the base
@ -340,9 +340,8 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi
const auto& vertex_attributes = regs.pipeline.vertex_attributes;
PAddr base_address = vertex_attributes.GetPhysicalBaseAddress(); // GPUREG_ATTR_BUF_BASE
VertexLayout layout{};
std::array<bool, 16> enable_attributes{};
std::array<u64, 16> binding_offsets{};
VertexLayout layout{};
u32 buffer_offset = array_offset;
for (const auto& loader : vertex_attributes.attribute_loaders) {
@ -387,26 +386,32 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi
const PAddr data_addr =
base_address + loader.data_offset + (vs_input_index_min * loader.byte_count);
const u32 vertex_num = vs_input_index_max - vs_input_index_min + 1;
const u32 data_size = loader.byte_count * vertex_num;
u32 data_size = loader.byte_count * vertex_num;
res_cache.FlushRegion(data_addr, data_size, nullptr);
std::memcpy(array_ptr, VideoCore::g_memory->GetPhysicalPointer(data_addr), data_size);
// Create the binding associated with this loader
VertexBinding& binding = layout.bindings.at(layout.binding_count);
VertexBinding& binding = layout.bindings[layout.binding_count];
binding.binding.Assign(layout.binding_count);
binding.fixed.Assign(0);
binding.stride.Assign(loader.byte_count);
// Keep track of the binding offsets so we can bind the vertex buffer later
binding_offsets[layout.binding_count++] = buffer_offset;
data_size = Common::AlignUp(data_size, 16);
array_ptr += data_size;
buffer_offset += data_size;
}
// Reserve the last binding for fixed attributes
u32 offset = 0;
bool has_fixed_binding = false;
// Reserve the last binding for fixed and default attributes
// Place the default attrib at offset zero for easy access
constexpr Common::Vec4f default_attrib = Common::MakeVec(0.f, 0.f, 0.f, 1.f);
u32 offset = sizeof(Common::Vec4f);
std::memcpy(array_ptr, default_attrib.AsArray(), sizeof(Common::Vec4f));
array_ptr += sizeof(Common::Vec4f);
// Find all fixed attributes and assign them to the last binding
for (std::size_t i = 0; i < 16; i++) {
if (vertex_attributes.IsDefaultAttribute(i)) {
const u32 reg = regs.vs.GetRegisterForAttribute(i);
@ -415,11 +420,10 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi
const std::array data = {attr.x.ToFloat32(), attr.y.ToFloat32(), attr.z.ToFloat32(),
attr.w.ToFloat32()};
// Copy the data to the end of the buffer
const u32 data_size = sizeof(float) * static_cast<u32>(data.size());
std::memcpy(array_ptr, data.data(), data_size);
VertexAttribute& attribute = layout.attributes.at(layout.attribute_count++);
VertexAttribute& attribute = layout.attributes[layout.attribute_count++];
attribute.binding.Assign(layout.binding_count);
attribute.location.Assign(reg);
attribute.offset.Assign(offset);
@ -428,21 +432,36 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi
offset += data_size;
array_ptr += data_size;
has_fixed_binding = true;
enable_attributes[reg] = true;
}
}
}
if (has_fixed_binding) {
VertexBinding& binding = layout.bindings.at(layout.binding_count);
binding.binding.Assign(layout.binding_count);
binding.fixed.Assign(1);
binding.stride.Assign(offset);
binding_offsets[layout.binding_count++] = buffer_offset;
buffer_offset += offset;
// Loop one more time to find unused attributes and assign them to the default one
// This needs to happen because i = 2 might be assigned to location = 3 so the loop
// above would skip setting it
for (std::size_t i = 0; i < 16; i++) {
// If the attribute is just disabled, shove the default attribute to avoid
// errors if the shader ever decides to use it. The pipeline cache can discard
// this if needed since it has access to the usage mask from the code generator
if (!enable_attributes[i]) {
VertexAttribute& attribute = layout.attributes[layout.attribute_count++];
attribute.binding.Assign(layout.binding_count);
attribute.location.Assign(i);
attribute.offset.Assign(0);
attribute.type.Assign(AttribType::Float);
attribute.size.Assign(4);
}
}
// Define the fixed+default binding
VertexBinding& binding = layout.bindings[layout.binding_count];
binding.binding.Assign(layout.binding_count);
binding.fixed.Assign(1);
binding.stride.Assign(offset);
binding_offsets[layout.binding_count++] = buffer_offset;
buffer_offset += offset;
pipeline_info.vertex_layout = layout;
vertex_buffer.Commit(buffer_offset - array_offset);
@ -457,7 +476,8 @@ void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_mi
/// Hands the current PICA register state and vertex shader setup to
/// pipeline_cache.UseProgrammableVertexShader and returns its result unchanged.
/// NOTE(review): this span is a diff hunk with its +/- markers stripped, so the same call
/// appears twice: once with two arguments and once with the vertex layout added as a third.
/// Presumably only the three-argument form remains after the commit — confirm.
bool RasterizerVulkan::SetupVertexShader() {
MICROPROFILE_SCOPE(OpenGL_VS);
// Two-argument form of the call (no vertex layout)
return pipeline_cache.UseProgrammableVertexShader(Pica::g_state.regs, Pica::g_state.vs);
// Three-argument form: also passes the layout stored in pipeline_info.vertex_layout
return pipeline_cache.UseProgrammableVertexShader(Pica::g_state.regs, Pica::g_state.vs,
pipeline_info.vertex_layout);
}
bool RasterizerVulkan::SetupGeometryShader() {
@ -484,14 +504,6 @@ bool RasterizerVulkan::AccelerateDrawBatch(bool is_indexed) {
}
}
if (!SetupVertexShader()) {
return false;
}
if (!SetupGeometryShader()) {
return false;
}
return Draw(true, is_indexed);
}
@ -506,6 +518,15 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) {
}
SetupVertexArray(vs_input_size, vs_input_index_min, vs_input_index_max);
if (!SetupVertexShader()) {
return false;
}
if (!SetupGeometryShader()) {
return false;
}
pipeline_info.rasterization.topology.Assign(regs.pipeline.triangle_topology);
pipeline_cache.BindPipeline(pipeline_info);
@ -848,6 +869,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) {
succeeded = AccelerateDrawBatchInternal(is_indexed);
} else {
pipeline_info.rasterization.topology.Assign(Pica::PipelineRegs::TriangleTopology::List);
pipeline_info.vertex_layout = HardwareVertex::GetVertexLayout();
pipeline_cache.UseTrivialVertexShader();
pipeline_cache.UseTrivialGeometryShader();
pipeline_cache.BindPipeline(pipeline_info);

View File

@ -273,6 +273,7 @@ private:
};
std::vector<HardwareVertex> vertex_batch;
std::array<u64, 16> binding_offsets{};
ImageAlloc default_texture;
vk::Sampler default_sampler;
@ -289,8 +290,6 @@ private:
bool dirty = true;
} uniform_block_data = {};
std::array<bool, 16> hw_enabled_attributes{};
std::array<SamplerInfo, 3> texture_samplers;
SamplerInfo texture_cube_sampler;
std::unordered_map<SamplerInfo, vk::Sampler> samplers;

View File

@ -1625,7 +1625,24 @@ layout (set = 0, binding = 0, std140) uniform vs_config {
// input attributes declaration
for (std::size_t i = 0; i < used_regs.size(); ++i) {
if (used_regs[i]) {
out += fmt::format("layout(location = {0}) in {1}vec4 vs_in_reg{0};\n", i, i == 3 ? "" : "");
std::string_view prefix;
switch (config.state.attrib_types[i]) {
case AttribType::Float:
prefix = "";
break;
case AttribType::Byte:
case AttribType::Short:
prefix = "i";
break;
case AttribType::Ubyte:
prefix = "u";
break;
default:
LOG_CRITICAL(Render_Vulkan, "Unknown attrib type {}", config.state.attrib_types[i]);
UNREACHABLE();
}
out += fmt::format("layout(location = {0}) in {1}vec4 vs_in_reg{0};\n", i, prefix);
}
}
out += '\n';

View File

@ -12,6 +12,8 @@
namespace Vulkan {
/// Component data type of a vertex attribute, shared by the shader generator and the
/// pipeline code. The values are listed explicitly because the type is also used as a raw
/// integer (it is cast to u32 to select a row of the vertex format table).
enum class AttribType : u32 {
    Float = 0, ///< 32-bit floating point components
    Int = 1,   ///< 32-bit signed integer components
    Short = 2, ///< 16-bit signed integer components
    Byte = 3,  ///< 8-bit signed integer components
    Ubyte = 4, ///< 8-bit unsigned integer components
};
enum Attributes {
ATTRIBUTE_POSITION,
ATTRIBUTE_COLOR,
@ -147,6 +149,7 @@ struct PicaShaderConfigCommon {
u64 swizzle_hash;
u32 main_offset;
bool sanitize_mul;
std::array<AttribType, 16> attrib_types;
u32 num_outputs;

View File

@ -3,6 +3,7 @@
// Refer to the license.txt file included.
#include <algorithm>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
#include "video_core/renderer_vulkan/vk_instance.h"
@ -115,6 +116,11 @@ std::tuple<u8*, u32, bool> StreamBuffer::Map(u32 size, u32 alignment) {
const u32 current_bucket = scheduler.GetCurrentSlotIndex();
auto& bucket = buckets[current_bucket];
if (alignment > 0) {
bucket.offset = Common::AlignUp(bucket.offset, alignment);
}
if (bucket.offset + size > bucket_size) {
UNREACHABLE();
}