Merge pull request #10916 from ameerj/lolmem
OpenGL: Add Local Memory warmup shader for Nvidia
This commit is contained in:
		| @@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I | |||||||
|         header += fmt::format("R{},", index); |         header += fmt::format("R{},", index); | ||||||
|     } |     } | ||||||
|     if (program.local_memory_size > 0) { |     if (program.local_memory_size > 0) { | ||||||
|         header += fmt::format("lmem[{}],", program.local_memory_size); |         header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U)); | ||||||
|     } |     } | ||||||
|     if (program.info.uses_fswzadd) { |     if (program.info.uses_fswzadd) { | ||||||
|         header += "FSWZA[4],FSWZB[4],"; |         header += "FSWZA[4],FSWZB[4],"; | ||||||
|   | |||||||
| @@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) { | |||||||
|         info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; |         info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; | ||||||
|         info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; |         info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; | ||||||
|         break; |         break; | ||||||
|  |     case IR::Opcode::LoadLocal: | ||||||
|  |     case IR::Opcode::WriteLocal: | ||||||
|  |         info.uses_local_memory = true; | ||||||
|  |         break; | ||||||
|     default: |     default: | ||||||
|         break; |         break; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -172,6 +172,7 @@ struct Info { | |||||||
|     bool stores_indexed_attributes{}; |     bool stores_indexed_attributes{}; | ||||||
|  |  | ||||||
|     bool stores_global_memory{}; |     bool stores_global_memory{}; | ||||||
|  |     bool uses_local_memory{}; | ||||||
|  |  | ||||||
|     bool uses_fp16{}; |     bool uses_fp16{}; | ||||||
|     bool uses_fp64{}; |     bool uses_fp64{}; | ||||||
|   | |||||||
| @@ -33,6 +33,7 @@ set(SHADER_FILES | |||||||
|     opengl_fidelityfx_fsr.frag |     opengl_fidelityfx_fsr.frag | ||||||
|     opengl_fidelityfx_fsr_easu.frag |     opengl_fidelityfx_fsr_easu.frag | ||||||
|     opengl_fidelityfx_fsr_rcas.frag |     opengl_fidelityfx_fsr_rcas.frag | ||||||
|  |     opengl_lmem_warmup.comp | ||||||
|     opengl_present.frag |     opengl_present.frag | ||||||
|     opengl_present.vert |     opengl_present.vert | ||||||
|     opengl_present_scaleforce.frag |     opengl_present_scaleforce.frag | ||||||
|   | |||||||
							
								
								
									
										47
									
								
								src/video_core/host_shaders/opengl_lmem_warmup.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								src/video_core/host_shaders/opengl_lmem_warmup.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,47 @@ | |||||||
|  | // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project | ||||||
|  | // SPDX-License-Identifier: GPL-2.0-or-later | ||||||
|  |  | ||||||
|  | // This shader is a workaround for a quirk in NVIDIA OpenGL drivers. | ||||||
|  | // Shaders using local memory see a great performance benefit if a shader that was dispatched | ||||||
|  | // before them had more local memory allocated. | ||||||
|  | // This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that | ||||||
|  | // subsequent shaders see the performance boost. | ||||||
|  |  | ||||||
|  | // NOTE: This shader does no actual meaningful work and returns immediately; | ||||||
|  | // it is simply a means to have the driver expect a shader using lots of local memory. | ||||||
|  |  | ||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout(location = 0) uniform uint uniform_data; | ||||||
|  |  | ||||||
|  | layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image; | ||||||
|  |  | ||||||
|  | #define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler | ||||||
|  | #define NUM_LMEM_CONSTANTS 1 | ||||||
|  | #define ARRAY_SIZE (MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS) // Parenthesized so the expansion is always a single operand | ||||||
|  |  | ||||||
|  | uint lmem_0[ARRAY_SIZE]; | ||||||
|  | const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0)); | ||||||
|  |  | ||||||
|  | void main() { | ||||||
|  |     const uint global_id = gl_GlobalInvocationID.x; | ||||||
|  |     if (global_id <= 128) { | ||||||
|  |         // Since the shader is dispatched as 1x1x1, this condition always holds, | ||||||
|  |         // so the shader returns here and the code below never actually executes. | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     for (uint t = 0; t < uniform_data; t++) { | ||||||
|  |         const uint offset = (t * uniform_data); | ||||||
|  |         lmem_0[offset] = t; | ||||||
|  |     } | ||||||
|  |     const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x); | ||||||
|  |     const uint value = lmem_0[offset]; | ||||||
|  |     const uint const_value = constant_values[offset / 4][offset % 4]; | ||||||
|  |     const uvec4 color = uvec4(value + const_value); | ||||||
|  |  | ||||||
|  |     // A "side-effect" is needed so the variables don't get optimized out, | ||||||
|  |     // but this should never execute so there should be no clobbering of previously bound state. | ||||||
|  |     imageStore(dest_image, ivec3(gl_GlobalInvocationID), color); | ||||||
|  | } | ||||||
| @@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac | |||||||
|     writes_global_memory = !use_storage_buffers && |     writes_global_memory = !use_storage_buffers && | ||||||
|                            std::ranges::any_of(info.storage_buffers_descriptors, |                            std::ranges::any_of(info.storage_buffers_descriptors, | ||||||
|                                                [](const auto& desc) { return desc.is_written; }); |                                                [](const auto& desc) { return desc.is_written; }); | ||||||
|  |     uses_local_memory = info.uses_local_memory; | ||||||
|     if (force_context_flush) { |     if (force_context_flush) { | ||||||
|         std::scoped_lock lock{built_mutex}; |         std::scoped_lock lock{built_mutex}; | ||||||
|         built_fence.Create(); |         built_fence.Create(); | ||||||
|   | |||||||
| @@ -59,6 +59,10 @@ public: | |||||||
|         return writes_global_memory; |         return writes_global_memory; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     [[nodiscard]] bool UsesLocalMemory() const noexcept { | ||||||
|  |         return uses_local_memory; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, |     void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, | ||||||
|                    Tegra::MemoryManager* gpu_memory_) { |                    Tegra::MemoryManager* gpu_memory_) { | ||||||
|         kepler_compute = kepler_compute_; |         kepler_compute = kepler_compute_; | ||||||
| @@ -84,6 +88,7 @@ private: | |||||||
|  |  | ||||||
|     bool use_storage_buffers{}; |     bool use_storage_buffers{}; | ||||||
|     bool writes_global_memory{}; |     bool writes_global_memory{}; | ||||||
|  |     bool uses_local_memory{}; | ||||||
|  |  | ||||||
|     std::mutex built_mutex; |     std::mutex built_mutex; | ||||||
|     std::condition_variable built_condvar; |     std::condition_variable built_condvar; | ||||||
|   | |||||||
| @@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { | |||||||
|             has_bool_ref_bug = true; |             has_bool_ref_bug = true; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |     has_lmem_perf_bug = is_nvidia; | ||||||
|  |  | ||||||
|     strict_context_required = emu_window.StrictContextRequired(); |     strict_context_required = emu_window.StrictContextRequired(); | ||||||
|     // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. |     // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. | ||||||
|   | |||||||
| @@ -192,6 +192,10 @@ public: | |||||||
|         return supports_conditional_barriers; |         return supports_conditional_barriers; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     bool HasLmemPerfBug() const { | ||||||
|  |         return has_lmem_perf_bug; | ||||||
|  |     } | ||||||
|  |  | ||||||
| private: | private: | ||||||
|     static bool TestVariableAoffi(); |     static bool TestVariableAoffi(); | ||||||
|     static bool TestPreciseBug(); |     static bool TestPreciseBug(); | ||||||
| @@ -238,6 +242,7 @@ private: | |||||||
|     bool can_report_memory{}; |     bool can_report_memory{}; | ||||||
|     bool strict_context_required{}; |     bool strict_context_required{}; | ||||||
|     bool supports_conditional_barriers{}; |     bool supports_conditional_barriers{}; | ||||||
|  |     bool has_lmem_perf_bug{}; | ||||||
|  |  | ||||||
|     std::string vendor_name; |     std::string vendor_name; | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c | |||||||
|  |  | ||||||
|         writes_global_memory |= std::ranges::any_of( |         writes_global_memory |= std::ranges::any_of( | ||||||
|             info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); |             info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); | ||||||
|  |         uses_local_memory |= info.uses_local_memory; | ||||||
|     } |     } | ||||||
|     ASSERT(num_textures <= MAX_TEXTURES); |     ASSERT(num_textures <= MAX_TEXTURES); | ||||||
|     ASSERT(num_images <= MAX_IMAGES); |     ASSERT(num_images <= MAX_IMAGES); | ||||||
|   | |||||||
| @@ -98,6 +98,10 @@ public: | |||||||
|         return writes_global_memory; |         return writes_global_memory; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     [[nodiscard]] bool UsesLocalMemory() const noexcept { | ||||||
|  |         return uses_local_memory; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     [[nodiscard]] bool IsBuilt() noexcept; |     [[nodiscard]] bool IsBuilt() noexcept; | ||||||
|  |  | ||||||
|     template <typename Spec> |     template <typename Spec> | ||||||
| @@ -146,6 +150,7 @@ private: | |||||||
|  |  | ||||||
|     bool use_storage_buffers{}; |     bool use_storage_buffers{}; | ||||||
|     bool writes_global_memory{}; |     bool writes_global_memory{}; | ||||||
|  |     bool uses_local_memory{}; | ||||||
|  |  | ||||||
|     static constexpr std::size_t XFB_ENTRY_STRIDE = 3; |     static constexpr std::size_t XFB_ENTRY_STRIDE = 3; | ||||||
|     GLsizei num_xfb_attribs{}; |     GLsizei num_xfb_attribs{}; | ||||||
|   | |||||||
| @@ -222,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { | |||||||
|     gpu.TickWork(); |     gpu.TickWork(); | ||||||
|  |  | ||||||
|     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; |     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; | ||||||
|  |     if (pipeline->UsesLocalMemory()) { | ||||||
|  |         program_manager.LocalMemoryWarmup(); | ||||||
|  |     } | ||||||
|     pipeline->SetEngine(maxwell3d, gpu_memory); |     pipeline->SetEngine(maxwell3d, gpu_memory); | ||||||
|     pipeline->Configure(is_indexed); |     pipeline->Configure(is_indexed); | ||||||
|  |  | ||||||
| @@ -371,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { | |||||||
|     if (!pipeline) { |     if (!pipeline) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |     if (pipeline->UsesLocalMemory()) { | ||||||
|  |         program_manager.LocalMemoryWarmup(); | ||||||
|  |     } | ||||||
|     pipeline->SetEngine(kepler_compute, gpu_memory); |     pipeline->SetEngine(kepler_compute, gpu_memory); | ||||||
|     pipeline->Configure(); |     pipeline->Configure(); | ||||||
|     const auto& qmd{kepler_compute->launch_description}; |     const auto& qmd{kepler_compute->launch_description}; | ||||||
|   | |||||||
| @@ -3,7 +3,9 @@ | |||||||
|  |  | ||||||
| #include <glad/glad.h> | #include <glad/glad.h> | ||||||
|  |  | ||||||
|  | #include "video_core/host_shaders/opengl_lmem_warmup_comp.h" | ||||||
| #include "video_core/renderer_opengl/gl_shader_manager.h" | #include "video_core/renderer_opengl/gl_shader_manager.h" | ||||||
|  | #include "video_core/renderer_opengl/gl_shader_util.h" | ||||||
|  |  | ||||||
| namespace OpenGL { | namespace OpenGL { | ||||||
|  |  | ||||||
| @@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) { | |||||||
|     if (device.UseAssemblyShaders()) { |     if (device.UseAssemblyShaders()) { | ||||||
|         glEnable(GL_COMPUTE_PROGRAM_NV); |         glEnable(GL_COMPUTE_PROGRAM_NV); | ||||||
|     } |     } | ||||||
|  |     if (device.HasLmemPerfBug()) { | ||||||
|  |         lmem_warmup_program = | ||||||
|  |             CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void ProgramManager::BindComputeProgram(GLuint program) { | void ProgramManager::BindComputeProgram(GLuint program) { | ||||||
| @@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU | |||||||
|  |  | ||||||
| void ProgramManager::RestoreGuestCompute() {} | void ProgramManager::RestoreGuestCompute() {} | ||||||
|  |  | ||||||
|  | void ProgramManager::LocalMemoryWarmup() { | ||||||
|  |     if (lmem_warmup_program.handle != 0) { | ||||||
|  |         BindComputeProgram(lmem_warmup_program.handle); | ||||||
|  |         glDispatchCompute(1, 1, 1); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| void ProgramManager::BindPipeline() { | void ProgramManager::BindPipeline() { | ||||||
|     if (!is_pipeline_bound) { |     if (!is_pipeline_bound) { | ||||||
|         is_pipeline_bound = true; |         is_pipeline_bound = true; | ||||||
|   | |||||||
| @@ -30,6 +30,8 @@ public: | |||||||
|  |  | ||||||
|     void RestoreGuestCompute(); |     void RestoreGuestCompute(); | ||||||
|  |  | ||||||
|  |     void LocalMemoryWarmup(); | ||||||
|  |  | ||||||
| private: | private: | ||||||
|     void BindPipeline(); |     void BindPipeline(); | ||||||
|  |  | ||||||
| @@ -44,6 +46,7 @@ private: | |||||||
|     u32 current_stage_mask = 0; |     u32 current_stage_mask = 0; | ||||||
|     std::array<GLuint, NUM_STAGES> current_programs{}; |     std::array<GLuint, NUM_STAGES> current_programs{}; | ||||||
|     GLuint current_assembly_compute_program = 0; |     GLuint current_assembly_compute_program = 0; | ||||||
|  |     OGLProgram lmem_warmup_program; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } // namespace OpenGL | } // namespace OpenGL | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user