shader_recompiler: translate f64 to f32 when unsupported on host
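When the host GPU cannot execute 64-bit float operations, the recompiler now lowers every f64 IR instruction to an f32 counterpart before the SSA rewrite, mirroring the existing fp16 and int64 lowering passes. Most opcodes are renamed one-for-one; only PackDouble2x32 and UnpackDouble2x32 need real bit manipulation, converting between an f64 bit pattern held as a u32x2 and a plain f32. The exponent rebias involved is simple arithmetic: f64 exponents are biased by 1023 and f32 exponents by 127, so narrowing adds 127 - 1023 = -896 and widening adds 1023 - 127 = +896, which is what the two constants at the top of the new pass encode.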
@@ -223,6 +223,7 @@ add_library(shader_recompiler STATIC
     ir_opt/identity_removal_pass.cpp
     ir_opt/layer_pass.cpp
     ir_opt/lower_fp16_to_fp32.cpp
+    ir_opt/lower_fp64_to_fp32.cpp
     ir_opt/lower_int64_to_int32.cpp
     ir_opt/passes.h
     ir_opt/position_pass.cpp
@@ -280,6 +280,9 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
     RemoveUnreachableBlocks(program);

     // Replace instructions before the SSA rewrite
+    if (!host_info.support_float64) {
+        Optimization::LowerFp64ToFp32(program);
+    }
     if (!host_info.support_float16) {
         Optimization::LowerFp16ToFp32(program);
     }
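Running the lowering here, before the SSA rewrite, means every later stage (constant propagation, dead code elimination, backend code generation) only ever sees f32 opcodes on hosts without f64 support. The order relative to the fp16 lowering is immaterial, since the two passes touch disjoint opcode sets.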
@@ -10,6 +10,7 @@ namespace Shader {

 /// Misc information about the host
 struct HostTranslateInfo {
+    bool support_float64{};      ///< True when the device supports 64-bit floats
     bool support_float16{};      ///< True when the device supports 16-bit floats
     bool support_int64{};        ///< True when the device supports 64-bit integers
     bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered

src/shader_recompiler/ir_opt/lower_fp64_to_fp32.cpp (new file, 185 lines)
@@ -0,0 +1,185 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/opcodes.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+
+constexpr s32 F64ToF32Exp = +127 - 1023;
+constexpr s32 F32ToF64Exp = +1023 - 127;
+
+IR::F32 PackedF64ToF32(IR::IREmitter& ir, const IR::Value& packed) {
+    const IR::U32 lo{ir.CompositeExtract(packed, 0)};
+    const IR::U32 hi{ir.CompositeExtract(packed, 1)};
+    const IR::U32 sign{ir.BitFieldExtract(hi, ir.Imm32(31), ir.Imm32(1))};
+    const IR::U32 exp{ir.BitFieldExtract(hi, ir.Imm32(20), ir.Imm32(11))};
+    const IR::U32 mantissa_hi{ir.BitFieldExtract(hi, ir.Imm32(0), ir.Imm32(20))};
+    const IR::U32 mantissa_lo{ir.BitFieldExtract(lo, ir.Imm32(29), ir.Imm32(3))};
+    const IR::U32 mantissa{
+        ir.BitwiseOr(ir.ShiftLeftLogical(mantissa_hi, ir.Imm32(3)), mantissa_lo)};
+    const IR::U32 exp_if_subnorm{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0)), ir.Imm32(0), ir.IAdd(exp, ir.Imm32(F64ToF32Exp)))};
+    const IR::U32 exp_if_infnan{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0x7ff)), ir.Imm32(0xff), exp_if_subnorm)};
+    const IR::U32 result{
+        ir.BitwiseOr(ir.ShiftLeftLogical(sign, ir.Imm32(31)),
+                     ir.BitwiseOr(ir.ShiftLeftLogical(exp_if_infnan, ir.Imm32(23)), mantissa))};
+    return ir.BitCast<IR::F32>(result);
+}
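The same bit surgery is easier to check on the CPU. Below is a minimal host-side sketch of the narrowing path; it is illustrative only (the function name and the use of std::bit_cast are mine, not part of the commit) and, like the IR above, it truncates the low 29 mantissa bits toward zero and does not clamp exponents that fall outside f32 range:

    #include <bit>
    #include <cstdint>

    // Rebuild the nearest f32 bit pattern from the two u32 halves of an f64.
    float PackedF64ToF32(std::uint32_t lo, std::uint32_t hi) {
        const std::uint32_t sign = (hi >> 31) & 1;          // 1 sign bit
        const std::uint32_t exp = (hi >> 20) & 0x7ff;       // 11 exponent bits, bias 1023
        const std::uint32_t mantissa_hi = hi & 0xfffff;     // top 20 mantissa bits
        const std::uint32_t mantissa_lo = (lo >> 29) & 0x7; // next 3 bits; low 29 are dropped
        const std::uint32_t mantissa = (mantissa_hi << 3) | mantissa_lo; // 23 f32 bits
        std::uint32_t new_exp;
        if (exp == 0) {
            new_exp = 0;    // zero and subnormals keep a zero exponent
        } else if (exp == 0x7ff) {
            new_exp = 0xff; // infinities and NaNs map to f32 inf/NaN
        } else {
            new_exp = exp + (127 - 1023); // rebias; no underflow/overflow clamp
        }
        return std::bit_cast<float>((sign << 31) | (new_exp << 23) | mantissa);
    }

The widening direction follows the same pattern in reverse: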
+
+IR::Value F32ToPackedF64(IR::IREmitter& ir, const IR::Value& raw) {
+    const IR::U32 value{ir.BitCast<IR::U32>(IR::F32(raw))};
+    const IR::U32 sign{ir.BitFieldExtract(value, ir.Imm32(31), ir.Imm32(1))};
+    const IR::U32 exp{ir.BitFieldExtract(value, ir.Imm32(23), ir.Imm32(8))};
+    const IR::U32 mantissa{ir.BitFieldExtract(value, ir.Imm32(0), ir.Imm32(23))};
+    const IR::U32 mantissa_hi{ir.BitFieldExtract(mantissa, ir.Imm32(3), ir.Imm32(20))};
+    const IR::U32 mantissa_lo{ir.BitFieldExtract(mantissa, ir.Imm32(0), ir.Imm32(3))};
+    const IR::U32 exp_if_subnorm{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0)), ir.Imm32(0), ir.IAdd(exp, ir.Imm32(F32ToF64Exp)))};
+    const IR::U32 exp_if_infnan{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0xff)), ir.Imm32(0x7ff), exp_if_subnorm)};
+    const IR::U32 lo{ir.ShiftLeftLogical(mantissa_lo, ir.Imm32(29))};
+    const IR::U32 hi{
+        ir.BitwiseOr(ir.ShiftLeftLogical(sign, ir.Imm32(31)),
+                     ir.BitwiseOr(ir.ShiftLeftLogical(exp_if_infnan, ir.Imm32(20)), mantissa_hi))};
+    return ir.CompositeConstruct(lo, hi);
+}
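Round-tripping shows what the lowering preserves: any double exactly representable as an f32 survives an unpack/pack cycle bit-for-bit, while extra mantissa precision is silently zeroed. A hypothetical host-side check, reusing the narrowing sketch above (again, the names are mine and mirror, not reproduce, the IR helpers):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    // Widen an f32 bit pattern into the two u32 halves of an f64.
    void F32ToPackedF64(float f, std::uint32_t& lo, std::uint32_t& hi) {
        const std::uint32_t v = std::bit_cast<std::uint32_t>(f);
        const std::uint32_t sign = (v >> 31) & 1;
        const std::uint32_t exp = (v >> 23) & 0xff;
        const std::uint32_t mantissa = v & 0x7fffff;
        std::uint32_t new_exp;
        if (exp == 0) {
            new_exp = 0;
        } else if (exp == 0xff) {
            new_exp = 0x7ff;
        } else {
            new_exp = exp + (1023 - 127); // rebias toward the f64 encoding
        }
        lo = (mantissa & 0x7) << 29;                           // low 3 mantissa bits
        hi = (sign << 31) | (new_exp << 20) | (mantissa >> 3); // sign, exp, top 20 bits
    }

    int main() {
        std::uint32_t lo{}, hi{};
        F32ToPackedF64(1.5f, lo, hi);
        const double d = std::bit_cast<double>((std::uint64_t{hi} << 32) | lo);
        assert(d == 1.5);                       // exactly representable: survives
        assert(PackedF64ToF32(lo, hi) == 1.5f); // and narrows back losslessly
        return 0;
    }

The bulk of the pass is then a mechanical opcode rename: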
+
+IR::Opcode Replace(IR::Opcode op) {
+    switch (op) {
+    case IR::Opcode::FPAbs64:
+        return IR::Opcode::FPAbs32;
+    case IR::Opcode::FPAdd64:
+        return IR::Opcode::FPAdd32;
+    case IR::Opcode::FPCeil64:
+        return IR::Opcode::FPCeil32;
+    case IR::Opcode::FPFloor64:
+        return IR::Opcode::FPFloor32;
+    case IR::Opcode::FPFma64:
+        return IR::Opcode::FPFma32;
+    case IR::Opcode::FPMul64:
+        return IR::Opcode::FPMul32;
+    case IR::Opcode::FPNeg64:
+        return IR::Opcode::FPNeg32;
+    case IR::Opcode::FPRoundEven64:
+        return IR::Opcode::FPRoundEven32;
+    case IR::Opcode::FPSaturate64:
+        return IR::Opcode::FPSaturate32;
+    case IR::Opcode::FPClamp64:
+        return IR::Opcode::FPClamp32;
+    case IR::Opcode::FPTrunc64:
+        return IR::Opcode::FPTrunc32;
+    case IR::Opcode::CompositeConstructF64x2:
+        return IR::Opcode::CompositeConstructF32x2;
+    case IR::Opcode::CompositeConstructF64x3:
+        return IR::Opcode::CompositeConstructF32x3;
+    case IR::Opcode::CompositeConstructF64x4:
+        return IR::Opcode::CompositeConstructF32x4;
+    case IR::Opcode::CompositeExtractF64x2:
+        return IR::Opcode::CompositeExtractF32x2;
+    case IR::Opcode::CompositeExtractF64x3:
+        return IR::Opcode::CompositeExtractF32x3;
+    case IR::Opcode::CompositeExtractF64x4:
+        return IR::Opcode::CompositeExtractF32x4;
+    case IR::Opcode::CompositeInsertF64x2:
+        return IR::Opcode::CompositeInsertF32x2;
+    case IR::Opcode::CompositeInsertF64x3:
+        return IR::Opcode::CompositeInsertF32x3;
+    case IR::Opcode::CompositeInsertF64x4:
+        return IR::Opcode::CompositeInsertF32x4;
+    case IR::Opcode::FPOrdEqual64:
+        return IR::Opcode::FPOrdEqual32;
+    case IR::Opcode::FPUnordEqual64:
+        return IR::Opcode::FPUnordEqual32;
+    case IR::Opcode::FPOrdNotEqual64:
+        return IR::Opcode::FPOrdNotEqual32;
+    case IR::Opcode::FPUnordNotEqual64:
+        return IR::Opcode::FPUnordNotEqual32;
+    case IR::Opcode::FPOrdLessThan64:
+        return IR::Opcode::FPOrdLessThan32;
+    case IR::Opcode::FPUnordLessThan64:
+        return IR::Opcode::FPUnordLessThan32;
+    case IR::Opcode::FPOrdGreaterThan64:
+        return IR::Opcode::FPOrdGreaterThan32;
+    case IR::Opcode::FPUnordGreaterThan64:
+        return IR::Opcode::FPUnordGreaterThan32;
+    case IR::Opcode::FPOrdLessThanEqual64:
+        return IR::Opcode::FPOrdLessThanEqual32;
+    case IR::Opcode::FPUnordLessThanEqual64:
+        return IR::Opcode::FPUnordLessThanEqual32;
+    case IR::Opcode::FPOrdGreaterThanEqual64:
+        return IR::Opcode::FPOrdGreaterThanEqual32;
+    case IR::Opcode::FPUnordGreaterThanEqual64:
+        return IR::Opcode::FPUnordGreaterThanEqual32;
+    case IR::Opcode::FPIsNan64:
+        return IR::Opcode::FPIsNan32;
+    case IR::Opcode::ConvertS16F64:
+        return IR::Opcode::ConvertS16F32;
+    case IR::Opcode::ConvertS32F64:
+        return IR::Opcode::ConvertS32F32;
+    case IR::Opcode::ConvertS64F64:
+        return IR::Opcode::ConvertS64F32;
+    case IR::Opcode::ConvertU16F64:
+        return IR::Opcode::ConvertU16F32;
+    case IR::Opcode::ConvertU32F64:
+        return IR::Opcode::ConvertU32F32;
+    case IR::Opcode::ConvertU64F64:
+        return IR::Opcode::ConvertU64F32;
+    case IR::Opcode::ConvertF32F64:
+        return IR::Opcode::Identity;
+    case IR::Opcode::ConvertF64F32:
+        return IR::Opcode::Identity;
+    case IR::Opcode::ConvertF64S8:
+        return IR::Opcode::ConvertF32S8;
+    case IR::Opcode::ConvertF64S16:
+        return IR::Opcode::ConvertF32S16;
+    case IR::Opcode::ConvertF64S32:
+        return IR::Opcode::ConvertF32S32;
+    case IR::Opcode::ConvertF64S64:
+        return IR::Opcode::ConvertF32S64;
+    case IR::Opcode::ConvertF64U8:
+        return IR::Opcode::ConvertF32U8;
+    case IR::Opcode::ConvertF64U16:
+        return IR::Opcode::ConvertF32U16;
+    case IR::Opcode::ConvertF64U32:
+        return IR::Opcode::ConvertF32U32;
+    case IR::Opcode::ConvertF64U64:
+        return IR::Opcode::ConvertF32U64;
+    default:
+        return op;
+    }
+}
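Two table entries deserve a second look: ConvertF32F64 and ConvertF64F32 both collapse to Identity, since after lowering the source and destination are the same f32 value, and the existing IdentityRemovalPass folds them away. Integer conversions simply switch to their F32 forms. The only opcodes that need more than a rename, PackDouble2x32 and UnpackDouble2x32, are dispatched in Lower: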
+
+void Lower(IR::Block& block, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::PackDouble2x32: {
+        IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+        inst.ReplaceUsesWith(PackedF64ToF32(ir, inst.Arg(0)));
+        break;
+    }
+    case IR::Opcode::UnpackDouble2x32: {
+        IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+        inst.ReplaceUsesWith(F32ToPackedF64(ir, inst.Arg(0)));
+        break;
+    }
+    default:
+        inst.ReplaceOpcode(Replace(inst.GetOpcode()));
+        break;
+    }
+}
+
+} // Anonymous namespace
+
+void LowerFp64ToFp32(IR::Program& program) {
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            Lower(*block, inst);
+        }
+    }
+}
+
+} // namespace Shader::Optimization
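Note which way the pack/unpack helpers are wired: PackDouble2x32, which built an f64 from two u32 register halves, now produces an f32 carrying approximately the same value, while UnpackDouble2x32 re-synthesizes a plausible f64 bit pattern from that f32 stand-in. The costs are the expected ones for this kind of emulation: the low 29 mantissa bits are gone, and because the narrowing path does not clamp, exponents outside f32 range wrap rather than saturate to infinity. For the guest shaders that actually hit this path, that is presumably a fair price for running at all on hosts without shaderFloat64.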
@@ -17,6 +17,7 @@ void ConstantPropagationPass(Environment& env, IR::Program& program);
 void DeadCodeEliminationPass(IR::Program& program);
 void GlobalMemoryToStorageBufferPass(IR::Program& program);
 void IdentityRemovalPass(IR::Program& program);
+void LowerFp64ToFp32(IR::Program& program);
 void LowerFp16ToFp32(IR::Program& program);
 void LowerInt64ToInt32(IR::Program& program);
 void RescalingPass(IR::Program& program);
@@ -232,6 +232,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .gl_max_compute_smem_size = device.GetMaxComputeSharedMemorySize(),
       },
       host_info{
+          .support_float64 = true,
           .support_float16 = false,
           .support_int64 = device.HasShaderInt64(),
           .needs_demote_reorder = device.IsAmd(),
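The OpenGL backend hard-codes support_float64 = true, which looks safe: fp64 shading (ARB_gpu_shader_fp64) has been core since OpenGL 4.0, below the GL renderer's minimum required version, so the new pass should never run on that backend.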
@@ -350,6 +350,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .has_broken_spirv_subgroup_mask_vector_extract_dynamic =
             driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY};
     host_info = Shader::HostTranslateInfo{
+        .support_float64 = device.IsFloat64Supported(),
         .support_float16 = device.IsFloat16Supported(),
         .support_int64 = device.IsShaderInt64Supported(),
         .needs_demote_reorder =
@@ -300,6 +300,11 @@ public:
         return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY;
     }

+    /// Returns true if the device supports float64 natively.
+    bool IsFloat64Supported() const {
+        return features.features.shaderFloat64;
+    }
+
     /// Returns true if the device supports float16 natively.
     bool IsFloat16Supported() const {
         return features.shader_float16_int8.shaderFloat16;
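On Vulkan, shaderFloat64 lives in the core VkPhysicalDeviceFeatures, so no extension machinery is involved. A standalone sketch of the underlying query (the function name is illustrative; yuzu caches the result in its Device wrapper, and the feature must also be enabled at device creation to be usable):

    #include <vulkan/vulkan.h>

    // True when the physical device can execute SPIR-V that declares Float64.
    bool SupportsShaderFloat64(VkPhysicalDevice physical_device) {
        VkPhysicalDeviceFeatures features{};
        vkGetPhysicalDeviceFeatures(physical_device, &features);
        return features.shaderFloat64 == VK_TRUE;
    }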