shader: Implement LDS, STS, LDL, and STL and use SPIR-V 1.4 when available
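This commit adds Maxwell local-memory (LDL/STL) and shared-memory (LDS/STS) loads and stores across the stack: new IR opcodes and IREmitter helpers, SPIR-V lowering, translator implementations, environment queries for each program's local and shared memory sizes, and pipeline-cache serialization of those sizes. Local memory becomes a Private u32 array; shared memory becomes a Workgroup u32 array, or aliased 8/16/32/64/128-bit views when VK_KHR_workgroup_memory_explicit_layout is available. When the driver exposes VK_KHR_spirv_1_4, modules are emitted as SPIR-V 1.4, which requires listing global variables such as SSBOs, textures, constant buffers, and the new memory arrays in the entry-point interface.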
		| @@ -14,6 +14,7 @@ add_library(shader_recompiler STATIC | ||||
|     backend/spirv/emit_spirv_logical.cpp | ||||
|     backend/spirv/emit_spirv_memory.cpp | ||||
|     backend/spirv/emit_spirv_select.cpp | ||||
|     backend/spirv/emit_spirv_shared_memory.cpp | ||||
|     backend/spirv/emit_spirv_special.cpp | ||||
|     backend/spirv/emit_spirv_undefined.cpp | ||||
|     backend/spirv/emit_spirv_warp.cpp | ||||
| @@ -111,6 +112,7 @@ add_library(shader_recompiler STATIC | ||||
|     frontend/maxwell/translate/impl/load_constant.cpp | ||||
|     frontend/maxwell/translate/impl/load_effective_address.cpp | ||||
|     frontend/maxwell/translate/impl/load_store_attribute.cpp | ||||
|     frontend/maxwell/translate/impl/load_store_local_shared.cpp | ||||
|     frontend/maxwell/translate/impl/load_store_memory.cpp | ||||
|     frontend/maxwell/translate/impl/logic_operation.cpp | ||||
|     frontend/maxwell/translate/impl/logic_operation_three_input.cpp | ||||
|   | ||||
| @@ -9,6 +9,7 @@ | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| #include "common/common_types.h" | ||||
| #include "common/div_ceil.h" | ||||
| #include "shader_recompiler/backend/spirv/emit_context.h" | ||||
|  | ||||
| namespace Shader::Backend::SPIRV { | ||||
| @@ -96,11 +97,13 @@ void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_vie | ||||
| } | ||||
|  | ||||
| EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& binding) | ||||
|     : Sirit::Module(0x00010000), profile{profile_}, stage{program.stage} { | ||||
|     : Sirit::Module(profile_.supported_spirv), profile{profile_}, stage{program.stage} { | ||||
|     AddCapability(spv::Capability::Shader); | ||||
|     DefineCommonTypes(program.info); | ||||
|     DefineCommonConstants(); | ||||
|     DefineInterfaces(program.info); | ||||
|     DefineLocalMemory(program); | ||||
|     DefineSharedMemory(program); | ||||
|     DefineConstantBuffers(program.info, binding); | ||||
|     DefineStorageBuffers(program.info, binding); | ||||
|     DefineTextures(program.info, binding); | ||||
| @@ -143,6 +146,8 @@ void EmitContext::DefineCommonTypes(const Info& info) { | ||||
|     F32.Define(*this, TypeFloat(32), "f32"); | ||||
|     U32.Define(*this, TypeInt(32, false), "u32"); | ||||
|  | ||||
|     private_u32 = Name(TypePointer(spv::StorageClass::Private, U32[1]), "private_u32"); | ||||
|  | ||||
|     input_f32 = Name(TypePointer(spv::StorageClass::Input, F32[1]), "input_f32"); | ||||
|     input_u32 = Name(TypePointer(spv::StorageClass::Input, U32[1]), "input_u32"); | ||||
|     input_s32 = Name(TypePointer(spv::StorageClass::Input, TypeInt(32, true)), "input_s32"); | ||||
| @@ -184,6 +189,105 @@ void EmitContext::DefineInterfaces(const Info& info) { | ||||
|     DefineOutputs(info); | ||||
| } | ||||
|  | ||||
| void EmitContext::DefineLocalMemory(const IR::Program& program) { | ||||
|     if (program.local_memory_size == 0) { | ||||
|         return; | ||||
|     } | ||||
|     const u32 num_elements{Common::DivCeil(program.local_memory_size, 4U)}; | ||||
|     const Id type{TypeArray(U32[1], Constant(U32[1], num_elements))}; | ||||
|     const Id pointer{TypePointer(spv::StorageClass::Private, type)}; | ||||
|     local_memory = AddGlobalVariable(pointer, spv::StorageClass::Private); | ||||
|     if (profile.supported_spirv >= 0x00010400) { | ||||
|         interfaces.push_back(local_memory); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void EmitContext::DefineSharedMemory(const IR::Program& program) { | ||||
|     if (program.shared_memory_size == 0) { | ||||
|         return; | ||||
|     } | ||||
|     const auto make{[&](Id element_type, u32 element_size) { | ||||
|         const u32 num_elements{Common::DivCeil(program.shared_memory_size, element_size)}; | ||||
|         const Id array_type{TypeArray(element_type, Constant(U32[1], num_elements))}; | ||||
|         Decorate(array_type, spv::Decoration::ArrayStride, element_size); | ||||
|  | ||||
|         const Id struct_type{TypeStruct(array_type)}; | ||||
|         MemberDecorate(struct_type, 0U, spv::Decoration::Offset, 0U); | ||||
|         Decorate(struct_type, spv::Decoration::Block); | ||||
|  | ||||
|         const Id pointer{TypePointer(spv::StorageClass::Workgroup, struct_type)}; | ||||
|         const Id element_pointer{TypePointer(spv::StorageClass::Workgroup, element_type)}; | ||||
|         const Id variable{AddGlobalVariable(pointer, spv::StorageClass::Workgroup)}; | ||||
|         Decorate(variable, spv::Decoration::Aliased); | ||||
|         interfaces.push_back(variable); | ||||
|  | ||||
|         return std::make_pair(variable, element_pointer); | ||||
|     }}; | ||||
|     if (profile.support_explicit_workgroup_layout) { | ||||
|         AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); | ||||
|         AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); | ||||
|         if (program.info.uses_int8) { | ||||
|             AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout8BitAccessKHR); | ||||
|             std::tie(shared_memory_u8, shared_u8) = make(U8, 1); | ||||
|         } | ||||
|         if (program.info.uses_int16) { | ||||
|             AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); | ||||
|             std::tie(shared_memory_u16, shared_u16) = make(U16, 2); | ||||
|         } | ||||
|         std::tie(shared_memory_u32, shared_u32) = make(U32[1], 4); | ||||
|         std::tie(shared_memory_u32x2, shared_u32x2) = make(U32[2], 8); | ||||
|         std::tie(shared_memory_u32x4, shared_u32x4) = make(U32[4], 16); | ||||
|         return; | ||||
|     } | ||||
|     const u32 num_elements{Common::DivCeil(program.shared_memory_size, 4U)}; | ||||
|     const Id type{TypeArray(U32[1], Constant(U32[1], num_elements))}; | ||||
|     const Id pointer_type{TypePointer(spv::StorageClass::Workgroup, type)}; | ||||
|     shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); | ||||
|     shared_memory_u32 = AddGlobalVariable(pointer_type, spv::StorageClass::Workgroup); | ||||
|     interfaces.push_back(shared_memory_u32); | ||||
|  | ||||
|     const Id func_type{TypeFunction(void_id, U32[1], U32[1])}; | ||||
|     const auto make_function{[&](u32 mask, u32 size) { | ||||
|         const Id loop_header{OpLabel()}; | ||||
|         const Id continue_block{OpLabel()}; | ||||
|         const Id merge_block{OpLabel()}; | ||||
|  | ||||
|         const Id func{OpFunction(void_id, spv::FunctionControlMask::MaskNone, func_type)}; | ||||
|         const Id offset{OpFunctionParameter(U32[1])}; | ||||
|         const Id insert_value{OpFunctionParameter(U32[1])}; | ||||
|         AddLabel(); | ||||
|         OpBranch(loop_header); | ||||
|  | ||||
|         AddLabel(loop_header); | ||||
|         const Id word_offset{OpShiftRightArithmetic(U32[1], offset, Constant(U32[1], 2U))}; | ||||
|         const Id shift_offset{OpShiftLeftLogical(U32[1], offset, Constant(U32[1], 3U))}; | ||||
|         const Id bit_offset{OpBitwiseAnd(U32[1], shift_offset, Constant(U32[1], mask))}; | ||||
|         const Id count{Constant(U32[1], size)}; | ||||
|         OpLoopMerge(merge_block, continue_block, spv::LoopControlMask::MaskNone); | ||||
|         OpBranch(continue_block); | ||||
|  | ||||
|         AddLabel(continue_block); | ||||
|         const Id word_pointer{OpAccessChain(shared_u32, shared_memory_u32, word_offset)}; | ||||
|         const Id old_value{OpLoad(U32[1], word_pointer)}; | ||||
|         const Id new_value{OpBitFieldInsert(U32[1], old_value, insert_value, bit_offset, count)}; | ||||
|         const Id atomic_res{OpAtomicCompareExchange(U32[1], word_pointer, Constant(U32[1], 1U), | ||||
|                                                     u32_zero_value, u32_zero_value, new_value, | ||||
|                                                     old_value)}; | ||||
|         const Id success{OpIEqual(U1, atomic_res, old_value)}; | ||||
|         OpBranchConditional(success, merge_block, loop_header); | ||||
|  | ||||
|         AddLabel(merge_block); | ||||
|         OpReturn(); | ||||
|         OpFunctionEnd(); | ||||
|         return func; | ||||
|     }}; | ||||
|     if (program.info.uses_int8) { | ||||
|         shared_store_u8_func = make_function(24, 8); | ||||
|     } | ||||
|     if (program.info.uses_int16) { | ||||
|         shared_store_u16_func = make_function(16, 16); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void EmitContext::DefineConstantBuffers(const Info& info, u32& binding) { | ||||
|     if (info.constant_buffer_descriptors.empty()) { | ||||
|         return; | ||||
| @@ -234,6 +338,9 @@ void EmitContext::DefineStorageBuffers(const Info& info, u32& binding) { | ||||
|         Decorate(id, spv::Decoration::Binding, binding); | ||||
|         Decorate(id, spv::Decoration::DescriptorSet, 0U); | ||||
|         Name(id, fmt::format("ssbo{}", index)); | ||||
|         if (profile.supported_spirv >= 0x00010400) { | ||||
|             interfaces.push_back(id); | ||||
|         } | ||||
|         std::fill_n(ssbos.data() + index, desc.count, id); | ||||
|         index += desc.count; | ||||
|         binding += desc.count; | ||||
| @@ -261,6 +368,9 @@ void EmitContext::DefineTextures(const Info& info, u32& binding) { | ||||
|                 .image_type{image_type}, | ||||
|             }); | ||||
|         } | ||||
|         if (profile.supported_spirv >= 0x00010400) { | ||||
|             interfaces.push_back(id); | ||||
|         } | ||||
|         binding += desc.count; | ||||
|     } | ||||
| } | ||||
| @@ -363,6 +473,9 @@ void EmitContext::DefineConstantBuffers(const Info& info, Id UniformDefinitions: | ||||
|         for (size_t i = 0; i < desc.count; ++i) { | ||||
|             cbufs[desc.index + i].*member_type = id; | ||||
|         } | ||||
|         if (profile.supported_spirv >= 0x00010400) { | ||||
|             interfaces.push_back(id); | ||||
|         } | ||||
|         binding += desc.count; | ||||
|     } | ||||
| } | ||||
|   | ||||
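Without VK_KHR_workgroup_memory_explicit_layout, shared memory is just an array of u32 words, so the make_function helper above builds per-module functions that emulate 8-bit and 16-bit stores with a compare-and-swap loop on the containing word. A rough C++ model of what the emitted SPIR-V does (a sketch with illustrative names, not repository code):

    #include <atomic>
    #include <cstdint>

    // mask/size are 24/8 for byte stores and 16/16 for halfword stores,
    // matching the make_function(24, 8) and make_function(16, 16) calls.
    void shared_store(std::atomic<uint32_t>* shared, uint32_t offset,
                      uint32_t value, uint32_t mask, uint32_t size) {
        std::atomic<uint32_t>& word{shared[offset >> 2]}; // containing u32 word
        const uint32_t bit{(offset << 3) & mask};         // bit position in word
        uint32_t old_value{word.load()};
        for (;;) {
            // OpBitFieldInsert: splice 'size' bits of 'value' in at 'bit'
            const uint32_t field{((1u << size) - 1u) << bit};
            const uint32_t new_value{(old_value & ~field) | ((value << bit) & field)};
            if (word.compare_exchange_weak(old_value, new_value)) {
                return; // OpAtomicCompareExchange matched; the store is done
            }
            // on failure old_value was reloaded; retry, like the SPIR-V back-edge
        }
    }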
| @@ -73,6 +73,14 @@ public: | ||||
|  | ||||
|     UniformDefinitions uniform_types; | ||||
|  | ||||
|     Id private_u32{}; | ||||
|  | ||||
|     Id shared_u8{}; | ||||
|     Id shared_u16{}; | ||||
|     Id shared_u32{}; | ||||
|     Id shared_u32x2{}; | ||||
|     Id shared_u32x4{}; | ||||
|  | ||||
|     Id input_f32{}; | ||||
|     Id input_u32{}; | ||||
|     Id input_s32{}; | ||||
| @@ -96,6 +104,17 @@ public: | ||||
|     Id base_vertex{}; | ||||
|     Id front_face{}; | ||||
|  | ||||
|     Id local_memory{}; | ||||
|  | ||||
|     Id shared_memory_u8{}; | ||||
|     Id shared_memory_u16{}; | ||||
|     Id shared_memory_u32{}; | ||||
|     Id shared_memory_u32x2{}; | ||||
|     Id shared_memory_u32x4{}; | ||||
|  | ||||
|     Id shared_store_u8_func{}; | ||||
|     Id shared_store_u16_func{}; | ||||
|  | ||||
|     Id input_position{}; | ||||
|     std::array<Id, 32> input_generics{}; | ||||
|  | ||||
| @@ -111,6 +130,8 @@ private: | ||||
|     void DefineCommonTypes(const Info& info); | ||||
|     void DefineCommonConstants(); | ||||
|     void DefineInterfaces(const Info& info); | ||||
|     void DefineLocalMemory(const IR::Program& program); | ||||
|     void DefineSharedMemory(const IR::Program& program); | ||||
|     void DefineConstantBuffers(const Info& info, u32& binding); | ||||
|     void DefineStorageBuffers(const Info& info, u32& binding); | ||||
|     void DefineTextures(const Info& info, u32& binding); | ||||
|   | ||||
| @@ -58,6 +58,8 @@ void EmitSetCFlag(EmitContext& ctx); | ||||
| void EmitSetOFlag(EmitContext& ctx); | ||||
| Id EmitWorkgroupId(EmitContext& ctx); | ||||
| Id EmitLocalInvocationId(EmitContext& ctx); | ||||
| Id EmitLoadLocal(EmitContext& ctx, Id word_offset); | ||||
| void EmitWriteLocal(EmitContext& ctx, Id word_offset, Id value); | ||||
| Id EmitUndefU1(EmitContext& ctx); | ||||
| Id EmitUndefU8(EmitContext& ctx); | ||||
| Id EmitUndefU16(EmitContext& ctx); | ||||
| @@ -94,6 +96,18 @@ void EmitWriteStorage64(EmitContext& ctx, const IR::Value& binding, const IR::Va | ||||
|                         Id value); | ||||
| void EmitWriteStorage128(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset, | ||||
|                          Id value); | ||||
| Id EmitLoadSharedU8(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedS8(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedU16(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedS16(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedU32(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedU64(EmitContext& ctx, Id offset); | ||||
| Id EmitLoadSharedU128(EmitContext& ctx, Id offset); | ||||
| void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value); | ||||
| void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); | ||||
| void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); | ||||
| void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); | ||||
| void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value); | ||||
| Id EmitCompositeConstructU32x2(EmitContext& ctx, Id e1, Id e2); | ||||
| Id EmitCompositeConstructU32x3(EmitContext& ctx, Id e1, Id e2, Id e3); | ||||
| Id EmitCompositeConstructU32x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4); | ||||
|   | ||||
| @@ -238,4 +238,14 @@ Id EmitLocalInvocationId(EmitContext& ctx) { | ||||
|     return ctx.OpLoad(ctx.U32[3], ctx.local_invocation_id); | ||||
| } | ||||
|  | ||||
| Id EmitLoadLocal(EmitContext& ctx, Id word_offset) { | ||||
|     const Id pointer{ctx.OpAccessChain(ctx.private_u32, ctx.local_memory, word_offset)}; | ||||
|     return ctx.OpLoad(ctx.U32[1], pointer); | ||||
| } | ||||
|  | ||||
| void EmitWriteLocal(EmitContext& ctx, Id word_offset, Id value) { | ||||
|     const Id pointer{ctx.OpAccessChain(ctx.private_u32, ctx.local_memory, word_offset)}; | ||||
|     ctx.OpStore(pointer, value); | ||||
| } | ||||
|  | ||||
| } // namespace Shader::Backend::SPIRV | ||||
|   | ||||
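Note the storage-class split: local memory is per-invocation scratch, so EmitLoadLocal/EmitWriteLocal above index a Private u32 array directly, while shared memory lives in Workgroup storage, where concurrent access from the whole workgroup forces the extra care taken in the new file below.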
							
								
								
									
src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp (new file, 175 lines)
							| @@ -0,0 +1,175 @@ | ||||
| // Copyright 2021 yuzu Emulator Project | ||||
| // Licensed under GPLv2 or any later version | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #include "shader_recompiler/backend/spirv/emit_spirv.h" | ||||
|  | ||||
| namespace Shader::Backend::SPIRV { | ||||
| namespace { | ||||
| Id Pointer(EmitContext& ctx, Id pointer_type, Id array, Id offset, u32 shift) { | ||||
|     const Id shift_id{ctx.Constant(ctx.U32[1], shift)}; | ||||
|     const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; | ||||
|     return ctx.OpAccessChain(pointer_type, array, ctx.u32_zero_value, index); | ||||
| } | ||||
|  | ||||
| Id Word(EmitContext& ctx, Id offset) { | ||||
|     const Id shift_id{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|     const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; | ||||
|     const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; | ||||
|     return ctx.OpLoad(ctx.U32[1], pointer); | ||||
| } | ||||
|  | ||||
| std::pair<Id, Id> ExtractArgs(EmitContext& ctx, Id offset, u32 mask, u32 count) { | ||||
|     const Id shift{ctx.OpShiftLeftLogical(ctx.U32[1], offset, ctx.Constant(ctx.U32[1], 3U))}; | ||||
|     const Id bit{ctx.OpBitwiseAnd(ctx.U32[1], shift, ctx.Constant(ctx.U32[1], mask))}; | ||||
|     const Id count_id{ctx.Constant(ctx.U32[1], count)}; | ||||
|     return {bit, count_id}; | ||||
| } | ||||
| } // Anonymous namespace | ||||
|  | ||||
| Id EmitLoadSharedU8(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{ | ||||
|             ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; | ||||
|         return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); | ||||
|     } else { | ||||
|         const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; | ||||
|         return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedS8(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{ | ||||
|             ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; | ||||
|         return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); | ||||
|     } else { | ||||
|         const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; | ||||
|         return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; | ||||
|         return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); | ||||
|     } else { | ||||
|         const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; | ||||
|         return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedS16(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; | ||||
|         return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); | ||||
|     } else { | ||||
|         const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; | ||||
|         return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2)}; | ||||
|         return ctx.OpLoad(ctx.U32[1], pointer); | ||||
|     } else { | ||||
|         return Word(ctx, offset); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; | ||||
|         return ctx.OpLoad(ctx.U32[2], pointer); | ||||
|     } else { | ||||
|         const Id shift_id{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|         const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; | ||||
|         const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.Constant(ctx.U32[1], 1U))}; | ||||
|         const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; | ||||
|         const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; | ||||
|         return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), | ||||
|                                         ctx.OpLoad(ctx.U32[1], rhs_pointer)); | ||||
|     } | ||||
| } | ||||
|  | ||||
| Id EmitLoadSharedU128(EmitContext& ctx, Id offset) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; | ||||
|         return ctx.OpLoad(ctx.U32[4], pointer); | ||||
|     } | ||||
|     const Id shift_id{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|     const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; | ||||
|     std::array<Id, 4> values{}; | ||||
|     for (u32 i = 0; i < 4; ++i) { | ||||
|         const Id index{i == 0 ? base_index | ||||
|                               : ctx.OpIAdd(ctx.U32[1], base_index, ctx.Constant(ctx.U32[1], i))}; | ||||
|         const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; | ||||
|         values[i] = ctx.OpLoad(ctx.U32[1], pointer); | ||||
|     } | ||||
|     return ctx.OpCompositeConstruct(ctx.U32[4], values); | ||||
| } | ||||
|  | ||||
| void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{ | ||||
|             ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; | ||||
|         ctx.OpStore(pointer, ctx.OpUConvert(ctx.U8, value)); | ||||
|     } else { | ||||
|         ctx.OpFunctionCall(ctx.void_id, ctx.shared_store_u8_func, offset, value); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; | ||||
|         ctx.OpStore(pointer, ctx.OpUConvert(ctx.U16, value)); | ||||
|     } else { | ||||
|         ctx.OpFunctionCall(ctx.void_id, ctx.shared_store_u16_func, offset, value); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { | ||||
|     Id pointer{}; | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         pointer = Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2); | ||||
|     } else { | ||||
|         const Id shift{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|         const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; | ||||
|         pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); | ||||
|     } | ||||
|     ctx.OpStore(pointer, value); | ||||
| } | ||||
|  | ||||
| void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; | ||||
|         ctx.OpStore(pointer, value); | ||||
|         return; | ||||
|     } | ||||
|     const Id shift{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|     const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; | ||||
|     const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.Constant(ctx.U32[1], 1U))}; | ||||
|     const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)}; | ||||
|     const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)}; | ||||
|     ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U)); | ||||
|     ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U)); | ||||
| } | ||||
|  | ||||
| void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) { | ||||
|     if (ctx.profile.support_explicit_workgroup_layout) { | ||||
|         const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; | ||||
|         ctx.OpStore(pointer, value); | ||||
|         return; | ||||
|     } | ||||
|     const Id shift{ctx.Constant(ctx.U32[1], 2U)}; | ||||
|     const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; | ||||
|     for (u32 i = 0; i < 4; ++i) { | ||||
|         const Id index{i == 0 ? base_index | ||||
|                               : ctx.OpIAdd(ctx.U32[1], base_index, ctx.Constant(ctx.U32[1], i))}; | ||||
|         const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; | ||||
|         ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i)); | ||||
|     } | ||||
| } | ||||
|  | ||||
| } // namespace Shader::Backend::SPIRV | ||||
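Every load here has two paths: with the explicit-layout extension it is a typed OpLoad through the matching aliased Workgroup view, and without it the containing u32 word is loaded and the field extracted. For the fallback byte load, the arithmetic reduces to the following (a sketch, not repository code):

    #include <cstdint>

    // Model of EmitLoadSharedU8's non-extension path over the u32 array.
    uint32_t load_shared_u8(const uint32_t* shared, uint32_t offset) {
        const uint32_t word{shared[offset >> 2]};  // Word(ctx, offset)
        const uint32_t bit{(offset << 3) & 24};    // ExtractArgs with mask 24
        return (word >> bit) & 0xff;               // OpBitFieldUExtract of 8 bits
    }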
| @@ -19,6 +19,10 @@ public: | ||||
|  | ||||
|     [[nodiscard]] virtual u32 TextureBoundBuffer() const = 0; | ||||
|  | ||||
|     [[nodiscard]] virtual u32 LocalMemorySize() const = 0; | ||||
|  | ||||
|     [[nodiscard]] virtual u32 SharedMemorySize() const = 0; | ||||
|  | ||||
|     [[nodiscard]] virtual std::array<u32, 3> WorkgroupSize() const = 0; | ||||
|  | ||||
|     [[nodiscard]] const ProgramHeader& SPH() const noexcept { | ||||
|   | ||||
| @@ -355,6 +355,52 @@ void IREmitter::WriteGlobal128(const U64& address, const IR::Value& vector) { | ||||
|     Inst(Opcode::WriteGlobal128, address, vector); | ||||
| } | ||||
|  | ||||
| U32 IREmitter::LoadLocal(const IR::U32& word_offset) { | ||||
|     return Inst<U32>(Opcode::LoadLocal, word_offset); | ||||
| } | ||||
|  | ||||
| void IREmitter::WriteLocal(const IR::U32& word_offset, const IR::U32& value) { | ||||
|     Inst(Opcode::WriteLocal, word_offset, value); | ||||
| } | ||||
|  | ||||
| Value IREmitter::LoadShared(int bit_size, bool is_signed, const IR::U32& offset) { | ||||
|     switch (bit_size) { | ||||
|     case 8: | ||||
|         return Inst(is_signed ? Opcode::LoadSharedS8 : Opcode::LoadSharedU8, offset); | ||||
|     case 16: | ||||
|         return Inst(is_signed ? Opcode::LoadSharedS16 : Opcode::LoadSharedU16, offset); | ||||
|     case 32: | ||||
|         return Inst(Opcode::LoadSharedU32, offset); | ||||
|     case 64: | ||||
|         return Inst(Opcode::LoadSharedU64, offset); | ||||
|     case 128: | ||||
|         return Inst(Opcode::LoadSharedU128, offset); | ||||
|     } | ||||
|     throw InvalidArgument("Invalid bit size {}", bit_size); | ||||
| } | ||||
|  | ||||
| void IREmitter::WriteShared(int bit_size, const IR::U32& offset, const IR::Value& value) { | ||||
|     switch (bit_size) { | ||||
|     case 8: | ||||
|         Inst(Opcode::WriteSharedU8, offset, value); | ||||
|         break; | ||||
|     case 16: | ||||
|         Inst(Opcode::WriteSharedU16, offset, value); | ||||
|         break; | ||||
|     case 32: | ||||
|         Inst(Opcode::WriteSharedU32, offset, value); | ||||
|         break; | ||||
|     case 64: | ||||
|         Inst(Opcode::WriteSharedU64, offset, value); | ||||
|         break; | ||||
|     case 128: | ||||
|         Inst(Opcode::WriteSharedU128, offset, value); | ||||
|         break; | ||||
|     default: | ||||
|         throw InvalidArgument("Invalid bit size {}", bit_size); | ||||
|     } | ||||
| } | ||||
|  | ||||
| U1 IREmitter::GetZeroFromOp(const Value& op) { | ||||
|     return Inst<U1>(Opcode::GetZeroFromOp, op); | ||||
| } | ||||
|   | ||||
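The new IREmitter helpers give translators a width-generic interface: sub-32-bit shared loads come back zero- or sign-extended inside a U32, and the 64/128-bit variants use U32x2/U32x4 vectors. A hypothetical call site (illustrative values; ir is an IR::IREmitter):

    // Shared offsets are byte addresses; LoadLocal/WriteLocal take offsets
    // in 32-bit words, matching the LDL/STL translators further down.
    const IR::U32 offset{ir.Imm32(16)};
    const IR::Value s8_value{ir.LoadShared(8, true, offset)};   // U32, sign-extended
    const IR::Value pair{ir.LoadShared(64, false, offset)};     // U32x2
    ir.WriteShared(32, offset, ir.Imm32(0xCAFE));
    ir.WriteLocal(ir.Imm32(1), ir.Imm32(0));                    // word offset, value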
| @@ -99,6 +99,12 @@ public: | ||||
|     void WriteGlobal64(const U64& address, const IR::Value& vector); | ||||
|     void WriteGlobal128(const U64& address, const IR::Value& vector); | ||||
|  | ||||
|     [[nodiscard]] U32 LoadLocal(const U32& word_offset); | ||||
|     void WriteLocal(const U32& word_offset, const U32& value); | ||||
|  | ||||
|     [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); | ||||
|     void WriteShared(int bit_size, const U32& offset, const Value& value); | ||||
|  | ||||
|     [[nodiscard]] U1 GetZeroFromOp(const Value& op); | ||||
|     [[nodiscard]] U1 GetSignFromOp(const Value& op); | ||||
|     [[nodiscard]] U1 GetCarryFromOp(const Value& op); | ||||
|   | ||||
| @@ -76,6 +76,12 @@ bool Inst::MayHaveSideEffects() const noexcept { | ||||
|     case Opcode::WriteStorage32: | ||||
|     case Opcode::WriteStorage64: | ||||
|     case Opcode::WriteStorage128: | ||||
|     case Opcode::WriteLocal: | ||||
|     case Opcode::WriteSharedU8: | ||||
|     case Opcode::WriteSharedU16: | ||||
|     case Opcode::WriteSharedU32: | ||||
|     case Opcode::WriteSharedU64: | ||||
|     case Opcode::WriteSharedU128: | ||||
|         return true; | ||||
|     default: | ||||
|         return false; | ||||
|   | ||||
| @@ -89,6 +89,24 @@ OPCODE(WriteStorage32,                                      Void,           U32, | ||||
| OPCODE(WriteStorage64,                                      Void,           U32,            U32,            U32x2,                                          ) | ||||
| OPCODE(WriteStorage128,                                     Void,           U32,            U32,            U32x4,                                          ) | ||||
|  | ||||
| // Local memory operations | ||||
| OPCODE(LoadLocal,                                           U32,            U32,                                                                            ) | ||||
| OPCODE(WriteLocal,                                          Void,           U32,            U32,                                                            ) | ||||
|  | ||||
| // Shared memory operations | ||||
| OPCODE(LoadSharedU8,                                        U32,            U32,                                                                            ) | ||||
| OPCODE(LoadSharedS8,                                        U32,            U32,                                                                            ) | ||||
| OPCODE(LoadSharedU16,                                       U32,            U32,                                                                            ) | ||||
| OPCODE(LoadSharedS16,                                       U32,            U32,                                                                            ) | ||||
| OPCODE(LoadSharedU32,                                       U32,            U32,                                                                            ) | ||||
| OPCODE(LoadSharedU64,                                       U32x2,          U32,                                                                            ) | ||||
| OPCODE(LoadSharedU128,                                      U32x4,          U32,                                                                            ) | ||||
| OPCODE(WriteSharedU8,                                       Void,           U32,            U32,                                                            ) | ||||
| OPCODE(WriteSharedU16,                                      Void,           U32,            U32,                                                            ) | ||||
| OPCODE(WriteSharedU32,                                      Void,           U32,            U32,                                                            ) | ||||
| OPCODE(WriteSharedU64,                                      Void,           U32,            U32x2,                                                          ) | ||||
| OPCODE(WriteSharedU128,                                     Void,           U32,            U32x4,                                                          ) | ||||
|  | ||||
| // Vector utility | ||||
| OPCODE(CompositeConstructU32x2,                             U32x2,          U32,            U32,                                                            ) | ||||
| OPCODE(CompositeConstructU32x3,                             U32x3,          U32,            U32,            U32,                                            ) | ||||
|   | ||||
| @@ -21,6 +21,8 @@ struct Program { | ||||
|     Info info; | ||||
|     Stage stage{}; | ||||
|     std::array<u32, 3> workgroup_size{}; | ||||
|     u32 local_memory_size{}; | ||||
|     u32 shared_memory_size{}; | ||||
| }; | ||||
|  | ||||
| [[nodiscard]] std::string DumpProgram(const Program& program); | ||||
|   | ||||
| @@ -67,8 +67,10 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo | ||||
|     program.blocks = VisitAST(inst_pool, block_pool, env, cfg); | ||||
|     program.post_order_blocks = PostOrder(program.blocks); | ||||
|     program.stage = env.ShaderStage(); | ||||
|     program.local_memory_size = env.LocalMemorySize(); | ||||
|     if (program.stage == Stage::Compute) { | ||||
|         program.workgroup_size = env.WorkgroupSize(); | ||||
|         program.shared_memory_size = env.SharedMemorySize(); | ||||
|     } | ||||
|     RemoveUnreachableBlocks(program); | ||||
|  | ||||
|   | ||||
| @@ -0,0 +1,197 @@ | ||||
| // Copyright 2021 yuzu Emulator Project | ||||
| // Licensed under GPLv2 or any later version | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #include "common/bit_field.h" | ||||
| #include "common/common_types.h" | ||||
| #include "shader_recompiler/frontend/maxwell/translate/impl/impl.h" | ||||
|  | ||||
| namespace Shader::Maxwell { | ||||
| namespace { | ||||
| enum class Size : u64 { | ||||
|     U8, | ||||
|     S8, | ||||
|     U16, | ||||
|     S16, | ||||
|     B32, | ||||
|     B64, | ||||
|     B128, | ||||
| }; | ||||
|  | ||||
| IR::U32 Offset(TranslatorVisitor& v, u64 insn) { | ||||
|     union { | ||||
|         u64 raw; | ||||
|         BitField<8, 8, IR::Reg> offset_reg; | ||||
|         BitField<20, 24, u64> absolute_offset; | ||||
|         BitField<20, 24, s64> relative_offset; | ||||
|     } const encoding{insn}; | ||||
|  | ||||
|     if (encoding.offset_reg == IR::Reg::RZ) { | ||||
|         return v.ir.Imm32(static_cast<u32>(encoding.absolute_offset)); | ||||
|     } else { | ||||
|         const s32 relative{static_cast<s32>(encoding.relative_offset.Value())}; | ||||
|         return v.ir.IAdd(v.X(encoding.offset_reg), v.ir.Imm32(relative)); | ||||
|     } | ||||
| } | ||||
|  | ||||
| std::pair<int, bool> GetSize(u64 insn) { | ||||
|     union { | ||||
|         u64 raw; | ||||
|         BitField<48, 3, Size> size; | ||||
|     } const encoding{insn}; | ||||
|  | ||||
|     switch (encoding.size) { | ||||
|     case Size::U8: | ||||
|         return {8, false}; | ||||
|     case Size::S8: | ||||
|         return {8, true}; | ||||
|     case Size::U16: | ||||
|         return {16, false}; | ||||
|     case Size::S16: | ||||
|         return {16, true}; | ||||
|     case Size::B32: | ||||
|         return {32, false}; | ||||
|     case Size::B64: | ||||
|         return {64, false}; | ||||
|     case Size::B128: | ||||
|         return {128, false}; | ||||
|     default: | ||||
|         throw NotImplementedException("Invalid size {}", encoding.size.Value()); | ||||
|     } | ||||
| } | ||||
|  | ||||
| IR::Reg Reg(u64 insn) { | ||||
|     union { | ||||
|         u64 raw; | ||||
|         BitField<0, 8, IR::Reg> reg; | ||||
|     } const encoding{insn}; | ||||
|  | ||||
|     return encoding.reg; | ||||
| } | ||||
|  | ||||
| IR::U32 ByteOffset(IR::IREmitter& ir, const IR::U32& offset) { | ||||
|     return ir.BitwiseAnd(ir.ShiftLeftLogical(offset, ir.Imm32(3)), ir.Imm32(24)); | ||||
| } | ||||
|  | ||||
| IR::U32 ShortOffset(IR::IREmitter& ir, const IR::U32& offset) { | ||||
|     return ir.BitwiseAnd(ir.ShiftLeftLogical(offset, ir.Imm32(3)), ir.Imm32(16)); | ||||
| } | ||||
| } // Anonymous namespace | ||||
|  | ||||
| void TranslatorVisitor::LDL(u64 insn) { | ||||
|     const IR::U32 offset{Offset(*this, insn)}; | ||||
|     const IR::U32 word_offset{ir.ShiftRightArithmetic(offset, ir.Imm32(2))}; | ||||
|  | ||||
|     const IR::Reg dest{Reg(insn)}; | ||||
|     const auto [bit_size, is_signed]{GetSize(insn)}; | ||||
|     switch (bit_size) { | ||||
|     case 8: { | ||||
|         const IR::U32 bit{ByteOffset(ir, offset)}; | ||||
|         X(dest, ir.BitFieldExtract(ir.LoadLocal(word_offset), bit, ir.Imm32(8), is_signed)); | ||||
|         break; | ||||
|     } | ||||
|     case 16: { | ||||
|         const IR::U32 bit{ShortOffset(ir, offset)}; | ||||
|         X(dest, ir.BitFieldExtract(ir.LoadLocal(word_offset), bit, ir.Imm32(16), is_signed)); | ||||
|         break; | ||||
|     } | ||||
|     case 32: | ||||
|     case 64: | ||||
|     case 128: | ||||
|         if (!IR::IsAligned(dest, bit_size / 32)) { | ||||
|             throw NotImplementedException("Unaligned destination register {}", dest); | ||||
|         } | ||||
|         X(dest, ir.LoadLocal(word_offset)); | ||||
|         for (int i = 1; i < bit_size / 32; ++i) { | ||||
|             X(dest + i, ir.LoadLocal(ir.IAdd(word_offset, ir.Imm32(i)))); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::LDS(u64 insn) { | ||||
|     const IR::U32 offset{Offset(*this, insn)}; | ||||
|     const IR::Reg dest{Reg(insn)}; | ||||
|     const auto [bit_size, is_signed]{GetSize(insn)}; | ||||
|     const IR::Value value{ir.LoadShared(bit_size, is_signed, offset)}; | ||||
|     switch (bit_size) { | ||||
|     case 8: | ||||
|     case 16: | ||||
|     case 32: | ||||
|         X(dest, IR::U32{value}); | ||||
|         break; | ||||
|     case 64: | ||||
|     case 128: | ||||
|         if (!IR::IsAligned(dest, bit_size / 32)) { | ||||
|             throw NotImplementedException("Unaligned destination register {}", dest); | ||||
|         } | ||||
|         for (int element = 0; element < bit_size / 32; ++element) { | ||||
|             X(dest + element, IR::U32{ir.CompositeExtract(value, element)}); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::STL(u64 insn) { | ||||
|     const IR::U32 offset{Offset(*this, insn)}; | ||||
|     const IR::U32 word_offset{ir.ShiftRightArithmetic(offset, ir.Imm32(2))}; | ||||
|  | ||||
|     const IR::Reg reg{Reg(insn)}; | ||||
|     const IR::U32 src{X(reg)}; | ||||
|     const int bit_size{GetSize(insn).first}; | ||||
|     switch (bit_size) { | ||||
|     case 8: { | ||||
|         const IR::U32 bit{ByteOffset(ir, offset)}; | ||||
|         const IR::U32 value{ir.BitFieldInsert(ir.LoadLocal(word_offset), src, bit, ir.Imm32(8))}; | ||||
|         ir.WriteLocal(word_offset, value); | ||||
|         break; | ||||
|     } | ||||
|     case 16: { | ||||
|         const IR::U32 bit{ShortOffset(ir, offset)}; | ||||
|         const IR::U32 value{ir.BitFieldInsert(ir.LoadLocal(word_offset), src, bit, ir.Imm32(16))}; | ||||
|         ir.WriteLocal(word_offset, value); | ||||
|         break; | ||||
|     } | ||||
|     case 32: | ||||
|     case 64: | ||||
|     case 128: | ||||
|         if (!IR::IsAligned(reg, bit_size / 32)) { | ||||
|             throw NotImplementedException("Unaligned source register"); | ||||
|         } | ||||
|         ir.WriteLocal(word_offset, src); | ||||
|         for (int i = 1; i < bit_size / 32; ++i) { | ||||
|             ir.WriteLocal(ir.IAdd(word_offset, ir.Imm32(i)), X(reg + i)); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::STS(u64 insn) { | ||||
|     const IR::U32 offset{Offset(*this, insn)}; | ||||
|     const IR::Reg reg{Reg(insn)}; | ||||
|     const int bit_size{GetSize(insn).first}; | ||||
|     switch (bit_size) { | ||||
|     case 8: | ||||
|     case 16: | ||||
|     case 32: | ||||
|         ir.WriteShared(bit_size, offset, X(reg)); | ||||
|         break; | ||||
|     case 64: | ||||
|         if (!IR::IsAligned(reg, 2)) { | ||||
|             throw NotImplementedException("Unaligned source register {}", reg); | ||||
|         } | ||||
|         ir.WriteShared(64, offset, ir.CompositeConstruct(X(reg), X(reg + 1))); | ||||
|         break; | ||||
|     case 128: { | ||||
|         if (!IR::IsAligned(reg, 4)) { | ||||
|             throw NotImplementedException("Unaligned source register {}", reg); | ||||
|         } | ||||
|         const IR::Value vector{ir.CompositeConstruct(X(reg), X(reg + 1), X(reg + 2), X(reg + 3))}; | ||||
|         ir.WriteShared(128, offset, vector); | ||||
|         break; | ||||
|     } | ||||
|     } | ||||
| } | ||||
|  | ||||
| } // namespace Shader::Maxwell | ||||
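A quick sanity check of the ByteOffset arithmetic used by the LDL/STL cases above (a throwaway example, not repository code):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t offset{7};                // byte 7 of local memory
        const uint32_t word_offset{offset >> 2}; // LDL word index: 1
        const uint32_t bit{(offset << 3) & 24};  // ByteOffset: 56 & 24 = 24
        assert(word_offset == 1 && bit == 24);   // bits [24, 32) of word 1,
                                                 // the top byte, as expected
    }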
| @@ -193,14 +193,6 @@ void TranslatorVisitor::LD(u64) { | ||||
|     ThrowNotImplemented(Opcode::LD); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::LDL(u64) { | ||||
|     ThrowNotImplemented(Opcode::LDL); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::LDS(u64) { | ||||
|     ThrowNotImplemented(Opcode::LDS); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::LEPC(u64) { | ||||
|     ThrowNotImplemented(Opcode::LEPC); | ||||
| } | ||||
| @@ -309,18 +301,10 @@ void TranslatorVisitor::ST(u64) { | ||||
|     ThrowNotImplemented(Opcode::ST); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::STL(u64) { | ||||
|     ThrowNotImplemented(Opcode::STL); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::STP(u64) { | ||||
|     ThrowNotImplemented(Opcode::STP); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::STS(u64) { | ||||
|     ThrowNotImplemented(Opcode::STS); | ||||
| } | ||||
|  | ||||
| void TranslatorVisitor::SUATOM_cas(u64) { | ||||
|     ThrowNotImplemented(Opcode::SUATOM_cas); | ||||
| } | ||||
|   | ||||
| @@ -200,6 +200,9 @@ void VisitUsages(Info& info, IR::Inst& inst) { | ||||
|     case IR::Opcode::LoadStorageS8: | ||||
|     case IR::Opcode::WriteStorageU8: | ||||
|     case IR::Opcode::WriteStorageS8: | ||||
|     case IR::Opcode::LoadSharedU8: | ||||
|     case IR::Opcode::LoadSharedS8: | ||||
|     case IR::Opcode::WriteSharedU8: | ||||
|     case IR::Opcode::SelectU8: | ||||
|     case IR::Opcode::ConvertF16S8: | ||||
|     case IR::Opcode::ConvertF16U8: | ||||
| @@ -224,6 +227,9 @@ void VisitUsages(Info& info, IR::Inst& inst) { | ||||
|     case IR::Opcode::LoadStorageS16: | ||||
|     case IR::Opcode::WriteStorageU16: | ||||
|     case IR::Opcode::WriteStorageS16: | ||||
|     case IR::Opcode::LoadSharedU16: | ||||
|     case IR::Opcode::LoadSharedS16: | ||||
|     case IR::Opcode::WriteSharedU16: | ||||
|     case IR::Opcode::SelectU16: | ||||
|     case IR::Opcode::BitCastU16F16: | ||||
|     case IR::Opcode::BitCastF16U16: | ||||
|   | ||||
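These cases feed Info::uses_int8 and Info::uses_int16, which DefineSharedMemory consults both to request the 8/16-bit explicit-layout capabilities and to decide whether the CAS-based store helpers need to be generated at all.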
| @@ -18,6 +18,8 @@ enum class AttributeType : u8 { | ||||
| }; | ||||
|  | ||||
| struct Profile { | ||||
|     u32 supported_spirv{0x00010000}; | ||||
|  | ||||
|     bool unified_descriptor_binding{}; | ||||
|     bool support_vertex_instance_id{}; | ||||
|     bool support_float_controls{}; | ||||
| @@ -30,6 +32,7 @@ struct Profile { | ||||
|     bool support_fp16_signed_zero_nan_preserve{}; | ||||
|     bool support_fp32_signed_zero_nan_preserve{}; | ||||
|     bool support_fp64_signed_zero_nan_preserve{}; | ||||
|     bool support_explicit_workgroup_layout{}; | ||||
|     bool support_vote{}; | ||||
|     bool warp_size_potentially_larger_than_guest{}; | ||||
|  | ||||
|   | ||||
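supported_spirv is the version word handed to Sirit::Module, packed as 0x00MMmm00 with the major version in bits 16-23 and the minor in bits 8-15. An illustrative helper (not from the codebase):

    #include <cstdint>

    constexpr uint32_t MakeSpirvVersion(uint32_t major, uint32_t minor) {
        return (major << 16) | (minor << 8);
    }
    static_assert(MakeSpirvVersion(1, 0) == 0x00010000); // baseline
    static_assert(MakeSpirvVersion(1, 4) == 0x00010400); // with VK_KHR_spirv_1_4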
| @@ -114,10 +114,12 @@ public: | ||||
|         gpu_memory->ReadBlock(program_base + read_lowest, data.get(), code_size); | ||||
|  | ||||
|         const u64 num_texture_types{static_cast<u64>(texture_types.size())}; | ||||
|         const u32 local_memory_size{LocalMemorySize()}; | ||||
|         const u32 texture_bound{TextureBoundBuffer()}; | ||||
|  | ||||
|         file.write(reinterpret_cast<const char*>(&code_size), sizeof(code_size)) | ||||
|             .write(reinterpret_cast<const char*>(&num_texture_types), sizeof(num_texture_types)) | ||||
|             .write(reinterpret_cast<const char*>(&local_memory_size), sizeof(local_memory_size)) | ||||
|             .write(reinterpret_cast<const char*>(&texture_bound), sizeof(texture_bound)) | ||||
|             .write(reinterpret_cast<const char*>(&start_address), sizeof(start_address)) | ||||
|             .write(reinterpret_cast<const char*>(&read_lowest), sizeof(read_lowest)) | ||||
| @@ -132,7 +134,10 @@ public: | ||||
|         file.flush(); | ||||
|         if (stage == Shader::Stage::Compute) { | ||||
|             const std::array<u32, 3> workgroup_size{WorkgroupSize()}; | ||||
|             file.write(reinterpret_cast<const char*>(&workgroup_size), sizeof(workgroup_size)); | ||||
|             const u32 shared_memory_size{SharedMemorySize()}; | ||||
|             file.write(reinterpret_cast<const char*>(&workgroup_size), sizeof(workgroup_size)) | ||||
|                 .write(reinterpret_cast<const char*>(&shared_memory_size), | ||||
|                        sizeof(shared_memory_size)); | ||||
|         } else { | ||||
|             file.write(reinterpret_cast<const char*>(&sph), sizeof(sph)); | ||||
|         } | ||||
| @@ -278,6 +283,16 @@ public: | ||||
|         return maxwell3d->regs.tex_cb_index; | ||||
|     } | ||||
|  | ||||
|     u32 LocalMemorySize() const override { | ||||
|         const u64 size{sph.LocalMemorySize()}; | ||||
|         ASSERT(size <= std::numeric_limits<u32>::max()); | ||||
|         return static_cast<u32>(size); | ||||
|     } | ||||
|  | ||||
|     u32 SharedMemorySize() const override { | ||||
|         throw Shader::LogicError("Requesting shared memory size in graphics stage"); | ||||
|     } | ||||
|  | ||||
|     std::array<u32, 3> WorkgroupSize() const override { | ||||
|         throw Shader::LogicError("Requesting workgroup size in a graphics stage"); | ||||
|     } | ||||
| @@ -313,6 +328,16 @@ public: | ||||
|         return kepler_compute->regs.tex_cb_index; | ||||
|     } | ||||
|  | ||||
|     u32 LocalMemorySize() const override { | ||||
|         const auto& qmd{kepler_compute->launch_description}; | ||||
|         return qmd.local_pos_alloc; | ||||
|     } | ||||
|  | ||||
|     u32 SharedMemorySize() const override { | ||||
|         const auto& qmd{kepler_compute->launch_description}; | ||||
|         return qmd.shared_alloc; | ||||
|     } | ||||
|  | ||||
|     std::array<u32, 3> WorkgroupSize() const override { | ||||
|         const auto& qmd{kepler_compute->launch_description}; | ||||
|         return {qmd.block_dim_x, qmd.block_dim_y, qmd.block_dim_z}; | ||||
| @@ -366,6 +391,7 @@ public: | ||||
|         u64 num_texture_types{}; | ||||
|         file.read(reinterpret_cast<char*>(&code_size), sizeof(code_size)) | ||||
|             .read(reinterpret_cast<char*>(&num_texture_types), sizeof(num_texture_types)) | ||||
|             .read(reinterpret_cast<char*>(&local_memory_size), sizeof(local_memory_size)) | ||||
|             .read(reinterpret_cast<char*>(&texture_bound), sizeof(texture_bound)) | ||||
|             .read(reinterpret_cast<char*>(&start_address), sizeof(start_address)) | ||||
|             .read(reinterpret_cast<char*>(&read_lowest), sizeof(read_lowest)) | ||||
| @@ -381,7 +407,8 @@ public: | ||||
|             texture_types.emplace(key, type); | ||||
|         } | ||||
|         if (stage == Shader::Stage::Compute) { | ||||
|             file.read(reinterpret_cast<char*>(&workgroup_size), sizeof(workgroup_size)); | ||||
|             file.read(reinterpret_cast<char*>(&workgroup_size), sizeof(workgroup_size)) | ||||
|                 .read(reinterpret_cast<char*>(&shared_memory_size), sizeof(shared_memory_size)); | ||||
|         } else { | ||||
|             file.read(reinterpret_cast<char*>(&sph), sizeof(sph)); | ||||
|         } | ||||
| @@ -402,6 +429,14 @@ public: | ||||
|         return it->second; | ||||
|     } | ||||
|  | ||||
|     u32 LocalMemorySize() const override { | ||||
|         return local_memory_size; | ||||
|     } | ||||
|  | ||||
|     u32 SharedMemorySize() const override { | ||||
|         return shared_memory_size; | ||||
|     } | ||||
|  | ||||
|     u32 TextureBoundBuffer() const override { | ||||
|         return texture_bound; | ||||
|     } | ||||
| @@ -414,6 +449,8 @@ private: | ||||
|     std::unique_ptr<u64[]> code; | ||||
|     std::unordered_map<u64, Shader::TextureType> texture_types; | ||||
|     std::array<u32, 3> workgroup_size{}; | ||||
|     u32 local_memory_size{}; | ||||
|     u32 shared_memory_size{}; | ||||
|     u32 texture_bound{}; | ||||
|     u32 read_lowest{}; | ||||
|     u32 read_highest{}; | ||||
| @@ -541,6 +578,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::GPU& gpu_, | ||||
|     const auto& float_control{device.FloatControlProperties()}; | ||||
|     const VkDriverIdKHR driver_id{device.GetDriverID()}; | ||||
|     base_profile = Shader::Profile{ | ||||
|         .supported_spirv = device.IsKhrSpirv1_4Supported() ? 0x00010400U : 0x00010000U, | ||||
|         .unified_descriptor_binding = true, | ||||
|         .support_vertex_instance_id = false, | ||||
|         .support_float_controls = true, | ||||
| @@ -558,6 +596,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::GPU& gpu_, | ||||
|             float_control.shaderSignedZeroInfNanPreserveFloat32 != VK_FALSE, | ||||
|         .support_fp64_signed_zero_nan_preserve = | ||||
|             float_control.shaderSignedZeroInfNanPreserveFloat64 != VK_FALSE, | ||||
|         .support_explicit_workgroup_layout = device.IsKhrWorkgroupMemoryExplicitLayoutSupported(), | ||||
|         .support_vote = true, | ||||
|         .warp_size_potentially_larger_than_guest = device.IsWarpSizePotentiallyBiggerThanGuest(), | ||||
|         .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS_KHR, | ||||
| @@ -600,8 +639,8 @@ ComputePipeline* PipelineCache::CurrentComputePipeline() { | ||||
|         shader = MakeShaderInfo(env, *cpu_shader_addr); | ||||
|     } | ||||
|     const ComputePipelineCacheKey key{ | ||||
|         .unique_hash = shader->unique_hash, | ||||
|         .shared_memory_size = qmd.shared_alloc, | ||||
|         .unique_hash{shader->unique_hash}, | ||||
|         .shared_memory_size{qmd.shared_alloc}, | ||||
|         .workgroup_size{qmd.block_dim_x, qmd.block_dim_y, qmd.block_dim_z}, | ||||
|     }; | ||||
|     const auto [pair, is_new]{compute_cache.try_emplace(key)}; | ||||
|   | ||||
| @@ -399,6 +399,20 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR | ||||
|         LOG_INFO(Render_Vulkan, "Device doesn't support extended dynamic state"); | ||||
|     } | ||||
|  | ||||
|     VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR workgroup_layout; | ||||
|     if (khr_workgroup_memory_explicit_layout) { | ||||
|         workgroup_layout = { | ||||
|             .sType = | ||||
|                 VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR, | ||||
|             .pNext = nullptr, | ||||
|             .workgroupMemoryExplicitLayout = VK_TRUE, | ||||
|             .workgroupMemoryExplicitLayoutScalarBlockLayout = VK_TRUE, | ||||
|             .workgroupMemoryExplicitLayout8BitAccess = VK_TRUE, | ||||
|             .workgroupMemoryExplicitLayout16BitAccess = VK_TRUE, | ||||
|         }; | ||||
|         SetNext(next, workgroup_layout); | ||||
|     } | ||||
|  | ||||
|     if (!ext_depth_range_unrestricted) { | ||||
|         LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); | ||||
|     } | ||||
| @@ -662,6 +676,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { | ||||
|     } | ||||
|  | ||||
|     bool has_khr_shader_float16_int8{}; | ||||
|     bool has_khr_workgroup_memory_explicit_layout{}; | ||||
|     bool has_ext_subgroup_size_control{}; | ||||
|     bool has_ext_transform_feedback{}; | ||||
|     bool has_ext_custom_border_color{}; | ||||
| @@ -682,6 +697,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { | ||||
|         test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); | ||||
|         test(khr_uniform_buffer_standard_layout, | ||||
|              VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); | ||||
|         test(khr_spirv_1_4, VK_KHR_SPIRV_1_4_EXTENSION_NAME, true); | ||||
|         test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); | ||||
|         test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); | ||||
|         test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); | ||||
| @@ -694,6 +710,8 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { | ||||
|         test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); | ||||
|         test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); | ||||
|         test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); | ||||
|         test(has_khr_workgroup_memory_explicit_layout, | ||||
|              VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME, false); | ||||
|         if (Settings::values.renderer_debug) { | ||||
|             test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, | ||||
|                  true); | ||||
| @@ -787,6 +805,22 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { | ||||
|             ext_extended_dynamic_state = true; | ||||
|         } | ||||
|     } | ||||
|     if (has_khr_workgroup_memory_explicit_layout) { | ||||
|         VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR layout; | ||||
|         layout.sType = | ||||
|             VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR; | ||||
|         layout.pNext = nullptr; | ||||
|         features.pNext = &layout; | ||||
|         physical.GetFeatures2KHR(features); | ||||
|  | ||||
|         if (layout.workgroupMemoryExplicitLayout && | ||||
|             layout.workgroupMemoryExplicitLayout8BitAccess && | ||||
|             layout.workgroupMemoryExplicitLayout16BitAccess && | ||||
|             layout.workgroupMemoryExplicitLayoutScalarBlockLayout) { | ||||
|             extensions.push_back(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); | ||||
|             khr_workgroup_memory_explicit_layout = true; | ||||
|         } | ||||
|     } | ||||
|     return extensions; | ||||
| } | ||||
|  | ||||
|   | ||||
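The extension handling above follows the usual two-phase Vulkan pattern: chain the feature struct into a features2 query, and only if every required member is supported, advertise the extension and later chain an enabled copy into device creation. The query phase in isolation (a sketch assuming a valid physical_device; vkGetPhysicalDeviceFeatures2 is core in Vulkan 1.1):

    #include <vulkan/vulkan.h>

    bool SupportsExplicitWorkgroupLayout(VkPhysicalDevice physical_device) {
        VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR layout{};
        layout.sType =
            VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR;

        VkPhysicalDeviceFeatures2 features{};
        features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
        features.pNext = &layout;
        vkGetPhysicalDeviceFeatures2(physical_device, &features);

        return layout.workgroupMemoryExplicitLayout == VK_TRUE &&
               layout.workgroupMemoryExplicitLayout8BitAccess == VK_TRUE &&
               layout.workgroupMemoryExplicitLayout16BitAccess == VK_TRUE &&
               layout.workgroupMemoryExplicitLayoutScalarBlockLayout == VK_TRUE;
    }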
| @@ -168,11 +168,21 @@ public: | ||||
|         return nv_viewport_swizzle; | ||||
|     } | ||||
|  | ||||
|     /// Returns true if the device supports VK_EXT_scalar_block_layout. | ||||
|     /// Returns true if the device supports VK_KHR_uniform_buffer_standard_layout. | ||||
|     bool IsKhrUniformBufferStandardLayoutSupported() const { | ||||
|         return khr_uniform_buffer_standard_layout; | ||||
|     } | ||||
|  | ||||
|     /// Returns true if the device supports VK_KHR_spirv_1_4. | ||||
|     bool IsKhrSpirv1_4Supported() const { | ||||
|         return khr_spirv_1_4; | ||||
|     } | ||||
|  | ||||
|     /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout. | ||||
|     bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const { | ||||
|         return khr_workgroup_memory_explicit_layout; | ||||
|     } | ||||
|  | ||||
|     /// Returns true if the device supports VK_EXT_index_type_uint8. | ||||
|     bool IsExtIndexTypeUint8Supported() const { | ||||
|         return ext_index_type_uint8; | ||||
| @@ -296,7 +306,9 @@ private: | ||||
|     bool is_shader_storage_image_multisample{}; ///< Support for image operations on MSAA images. | ||||
|     bool is_blit_depth_stencil_supported{};     ///< Support for blitting from and to depth stencil. | ||||
|     bool nv_viewport_swizzle{};                 ///< Support for VK_NV_viewport_swizzle. | ||||
|     bool khr_uniform_buffer_standard_layout{};  ///< Support for std430 on UBOs. | ||||
|     bool khr_uniform_buffer_standard_layout{};  ///< Support for scalar uniform buffer layouts. | ||||
|     bool khr_spirv_1_4{};                       ///< Support for VK_KHR_spirv_1_4. | ||||
|     bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts. | ||||
|     bool ext_index_type_uint8{};                 ///< Support for VK_EXT_index_type_uint8. | ||||
|     bool ext_sampler_filter_minmax{};            ///< Support for VK_EXT_sampler_filter_minmax. | ||||
|     bool ext_depth_range_unrestricted{};         ///< Support for VK_EXT_depth_range_unrestricted. | ||||
|   | ||||