diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 7f6f29f35..532baab08 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -82,6 +82,8 @@ add_library(video_core STATIC renderer_vulkan/vk_rasterizer.cpp renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_shader_state.h + renderer_vulkan/vk_shader_gen.cpp + renderer_vulkan/vk_shader_gen.h renderer_vulkan/vk_state.cpp renderer_vulkan/vk_state.h renderer_vulkan/vk_surface_params.cpp diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 03bf154d8..b2909b11e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -29,24 +29,12 @@ #include "core/tracer/recorder.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/rasterizer_interface.h" -#include "video_core/renderer_opengl/gl_state.h" -#include "video_core/renderer_opengl/gl_vars.h" -#include "video_core/renderer_opengl/post_processing_opengl.h" -#include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/renderer_vulkan/vk_state.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/video_core.h" namespace Vulkan { -// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have -// to wait on available presentation frames. There doesn't seem to be much of a downside to a larger -// number but 9 swap textures at 60FPS presentation allows for 800% speed so thats probably fine -#ifdef ANDROID -// Reduce the size of swap_chain, since the UI only allows upto 200% speed. 
-constexpr std::size_t SWAP_CHAIN_SIZE = 6; -#else -constexpr std::size_t SWAP_CHAIN_SIZE = 9; -#endif - class OGLTextureMailboxException : public std::runtime_error { public: using std::runtime_error::runtime_error; @@ -371,13 +359,13 @@ RendererVulkan::RendererVulkan(Frontend::EmuWindow& window) frame_dumper.mailbox = std::make_unique(); } -RendererOpenGL::~RendererOpenGL() = default; +RendererVulkan::~RendererVulkan() = default; MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); /// Swap buffers (render frame) -void RendererOpenGL::SwapBuffers() { +void RendererVulkan::SwapBuffers() { // Maintain the rasterizer's state as a priority OpenGLState prev_state = OpenGLState::GetCurState(); state.Apply(); @@ -415,7 +403,7 @@ void RendererOpenGL::SwapBuffers() { } } -void RendererOpenGL::RenderScreenshot() { +void RendererVulkan::RenderScreenshot() { if (VideoCore::g_renderer_screenshot_requested) { // Draw this frame to the screenshot framebuffer screenshot_framebuffer.Create(); @@ -449,7 +437,7 @@ void RendererOpenGL::RenderScreenshot() { } } -void RendererOpenGL::PrepareRendertarget() { +void RendererVulkan::PrepareRendertarget() { for (int i : {0, 1, 2}) { int fb_id = i == 2 ? 1 : 0; const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; @@ -486,7 +474,7 @@ void RendererOpenGL::PrepareRendertarget() { } } -void RendererOpenGL::RenderToMailbox(const Layout::FramebufferLayout& layout, +void RendererVulkan::RenderToMailbox(const Layout::FramebufferLayout& layout, std::unique_ptr& mailbox, bool flipped) { @@ -540,7 +528,7 @@ void RendererOpenGL::RenderToMailbox(const Layout::FramebufferLayout& layout, /** * Loads framebuffer from emulated memory into the active OpenGL texture. 
*/ -void RendererOpenGL::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& framebuffer, +void RendererVulkan::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& framebuffer, ScreenInfo& screen_info, bool right_eye) { if (framebuffer.address_right1 == 0 || framebuffer.address_right2 == 0) @@ -601,7 +589,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& fram * Fills active OpenGL texture with the given RGB color. Since the color is solid, the texture can * be 1x1 but will stretch across whatever it's rendered on. */ -void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, +void RendererVulkan::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const TextureInfo& texture) { state.texture_units[0].texture_2d = texture.resource.handle; state.Apply(); @@ -619,7 +607,7 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color /** * Initializes the OpenGL state and creates persistent objects. */ -void RendererOpenGL::InitOpenGLObjects() { +void RendererVulkan::InitOpenGLObjects() { glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, 0.0f); @@ -672,7 +660,7 @@ void RendererOpenGL::InitOpenGLObjects() { state.Apply(); } -void RendererOpenGL::ReloadSampler() { +void RendererVulkan::ReloadSampler() { glSamplerParameteri(filter_sampler.handle, GL_TEXTURE_MIN_FILTER, Settings::values.filter_mode ? 
GL_LINEAR : GL_NEAREST); glSamplerParameteri(filter_sampler.handle, GL_TEXTURE_MAG_FILTER, @@ -681,7 +669,7 @@ void RendererOpenGL::ReloadSampler() { glSamplerParameteri(filter_sampler.handle, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); } -void RendererOpenGL::ReloadShader() { +void RendererVulkan::ReloadShader() { // Link shaders and get variable locations std::string shader_data; if (GLES) { @@ -754,7 +742,7 @@ void RendererOpenGL::ReloadShader() { attrib_tex_coord = glGetAttribLocation(shader.handle, "vert_tex_coord"); } -void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, +void RendererVulkan::ConfigureFramebufferTexture(TextureInfo& texture, const GPU::Regs::FramebufferConfig& framebuffer) { GPU::Regs::PixelFormat format = framebuffer.color_format; GLint internal_format; @@ -819,7 +807,7 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, * Draws a single texture to the emulator window, rotating the texture to correct for the 3DS's LCD * rotation. */ -void RendererOpenGL::DrawSingleScreenRotated(const ScreenInfo& screen_info, float x, float y, +void RendererVulkan::DrawSingleScreenRotated(const ScreenInfo& screen_info, float x, float y, float w, float h) { const auto& texcoords = screen_info.display_texcoords; @@ -851,7 +839,7 @@ void RendererOpenGL::DrawSingleScreenRotated(const ScreenInfo& screen_info, floa state.Apply(); } -void RendererOpenGL::DrawSingleScreen(const ScreenInfo& screen_info, float x, float y, float w, +void RendererVulkan::DrawSingleScreen(const ScreenInfo& screen_info, float x, float y, float w, float h) { const auto& texcoords = screen_info.display_texcoords; @@ -884,7 +872,7 @@ void RendererOpenGL::DrawSingleScreen(const ScreenInfo& screen_info, float x, fl * Draws a single texture to the emulator window, rotating the texture to correct for the 3DS's LCD * rotation. 
*/ -void RendererOpenGL::DrawSingleScreenStereoRotated(const ScreenInfo& screen_info_l, +void RendererVulkan::DrawSingleScreenStereoRotated(const ScreenInfo& screen_info_l, const ScreenInfo& screen_info_r, float x, float y, float w, float h) { const auto& texcoords = screen_info_l.display_texcoords; @@ -919,7 +907,7 @@ void RendererOpenGL::DrawSingleScreenStereoRotated(const ScreenInfo& screen_info state.Apply(); } -void RendererOpenGL::DrawSingleScreenStereo(const ScreenInfo& screen_info_l, +void RendererVulkan::DrawSingleScreenStereo(const ScreenInfo& screen_info_l, const ScreenInfo& screen_info_r, float x, float y, float w, float h) { const auto& texcoords = screen_info_l.display_texcoords; @@ -957,7 +945,7 @@ void RendererOpenGL::DrawSingleScreenStereo(const ScreenInfo& screen_info_l, /** * Draws the emulated screens to the emulator window. */ -void RendererOpenGL::DrawScreens(const Layout::FramebufferLayout& layout, bool flipped) { +void RendererVulkan::DrawScreens(const Layout::FramebufferLayout& layout, bool flipped) { if (VideoCore::g_renderer_bg_color_update_requested.exchange(false)) { // Update background color before drawing glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, @@ -1123,7 +1111,7 @@ void RendererOpenGL::DrawScreens(const Layout::FramebufferLayout& layout, bool f } } -void RendererOpenGL::TryPresent(int timeout_ms) { +void RendererVulkan::TryPresent(int timeout_ms) { const auto& layout = render_window.GetFramebufferLayout(); auto frame = render_window.mailbox->TryGetPresentFrame(timeout_ms); if (!frame) { @@ -1164,9 +1152,9 @@ void RendererOpenGL::TryPresent(int timeout_ms) { } /// Updates the framerate -void RendererOpenGL::UpdateFramerate() {} +void RendererVulkan::UpdateFramerate() {} -void RendererOpenGL::PrepareVideoDumping() { +void RendererVulkan::PrepareVideoDumping() { auto* mailbox = static_cast(frame_dumper.mailbox.get()); { std::unique_lock lock(mailbox->swap_chain_lock); @@ -1175,7 
+1163,7 @@ void RendererOpenGL::PrepareVideoDumping() { frame_dumper.StartDumping(); } -void RendererOpenGL::CleanupVideoDumping() { +void RendererVulkan::CleanupVideoDumping() { frame_dumper.StopDumping(); auto* mailbox = static_cast(frame_dumper.mailbox.get()); { @@ -1240,7 +1228,7 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum } /// Initialize the renderer -VideoCore::ResultStatus RendererOpenGL::Init() { +VideoCore::ResultStatus RendererVulkan::Init() { #ifndef ANDROID if (!gladLoadGL()) { return VideoCore::ResultStatus::ErrorBelowGL33; @@ -1284,6 +1272,6 @@ VideoCore::ResultStatus RendererOpenGL::Init() { } /// Shutdown the renderer -void RendererOpenGL::ShutDown() {} +void RendererVulkan::ShutDown() {} } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index ec2ad37fc..04123b736 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -62,8 +62,7 @@ private: void RenderScreenshot(); void RenderToMailbox(const Layout::FramebufferLayout& layout, std::unique_ptr& mailbox, bool flipped); - void ConfigureFramebufferTexture(TextureInfo& texture, - const GPU::Regs::FramebufferConfig& framebuffer); + void ConfigureFramebufferTexture(ScreenInfo& screen, const GPU::Regs::FramebufferConfig& framebuffer); void DrawScreens(const Layout::FramebufferLayout& layout, bool flipped); void DrawSingleScreenRotated(const ScreenInfo& screen_info, float x, float y, float w, float h); void DrawSingleScreen(const ScreenInfo& screen_info, float x, float y, float w, float h); @@ -78,7 +77,7 @@ private: void LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& framebuffer, ScreenInfo& screen_info, bool right_eye); // Fills active OpenGL texture with the given RGB color. 
- void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const TextureInfo& texture); + void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const ScreenInfo& screen); VulkanState state; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 39b94c546..a02aa6091 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -113,55 +113,67 @@ bool VKInstance::CreateDevice(vk::SurfaceKHR surface, bool validation_enabled) { bool VKInstance::FindFeatures() { - auto available_features = physical_device.getFeatures(); + auto available = physical_device.getFeatures(); // Not having geometry shaders or wide lines will cause issues with rendering. - if (!available_features.geometryShader && !available_features.wideLines) { + if (!available.geometryShader && !available.wideLines) { LOG_WARNING(Render_Vulkan, "Geometry shaders not availabe! Rendering will be limited"); } // Enable some common features other emulators like Dolphin use - device_features.dualSrcBlend = available_features.dualSrcBlend; - device_features.geometryShader = available_features.geometryShader; - device_features.samplerAnisotropy = available_features.samplerAnisotropy; - device_features.logicOp = available_features.logicOp; - device_features.fragmentStoresAndAtomics = available_features.fragmentStoresAndAtomics; - device_features.sampleRateShading = available_features.sampleRateShading; - device_features.largePoints = available_features.largePoints; - device_features.shaderStorageImageMultisample = available_features.shaderStorageImageMultisample; - device_features.occlusionQueryPrecise = available_features.occlusionQueryPrecise; - device_features.shaderClipDistance = available_features.shaderClipDistance; - device_features.depthClamp = available_features.depthClamp; - device_features.textureCompressionBC = available_features.textureCompressionBC; + 
vk_features.dualSrcBlend = available.dualSrcBlend; + vk_features.geometryShader = available.geometryShader; + vk_features.samplerAnisotropy = available.samplerAnisotropy; + vk_features.logicOp = available.logicOp; + vk_features.fragmentStoresAndAtomics = available.fragmentStoresAndAtomics; + vk_features.sampleRateShading = available.sampleRateShading; + vk_features.largePoints = available.largePoints; + vk_features.shaderStorageImageMultisample = available.shaderStorageImageMultisample; + vk_features.occlusionQueryPrecise = available.occlusionQueryPrecise; + vk_features.shaderClipDistance = available.shaderClipDistance; + vk_features.depthClamp = available.depthClamp; + vk_features.textureCompressionBC = available.textureCompressionBC; - // Enable timeline semaphore support - new_features.timelineSemaphore = true; + // Enable newer Vulkan features + vk12_features.timelineSemaphore = true; + vk13_features.dynamicRendering = true; + dynamic_state.extendedDynamicState = true; + dynamic_state2.extendedDynamicState2 = true; + dynamic_state2.extendedDynamicState2LogicOp = true; + dynamic_state2.extendedDynamicState2PatchControlPoints = true; + + // Include features in device creation + features.setFeatures(vk_features); + features.setPNext(&vk12_features); + vk12_features.setPNext(&vk13_features); + vk13_features.setPNext(&dynamic_state); + dynamic_state.setPNext(&dynamic_state2); return true; } bool VKInstance::FindExtensions() { - auto extensions = physical_device.enumerateDeviceExtensionProperties(); - if (extensions.empty()) { + auto available = physical_device.enumerateDeviceExtensionProperties(); + if (available.empty()) { LOG_CRITICAL(Render_Vulkan, "No extensions supported by device."); return false; } // List available device extensions - for (const auto& prop : extensions) { + for (const auto& prop : available) { LOG_INFO(Render_Vulkan, "Vulkan extension: {}", prop.extensionName); } // Helper lambda for adding extensions auto AddExtension = [&](const char* 
name, bool required) { - auto result = std::find_if(extensions.begin(), extensions.end(), [&](const auto& prop) { + auto result = std::find_if(available.begin(), available.end(), [&](const auto& prop) { return !std::strcmp(name, prop.extensionName); }); - if (result != extensions.end()) { + if (result != available.end()) { LOG_INFO(Render_Vulkan, "Enabling extension: {}", name); - device_extensions.push_back(name); + extensions.push_back(name); return true; } @@ -172,13 +184,14 @@ bool VKInstance::FindExtensions() return false; }; - // The swapchain extension is required - if (!AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME, true)) { + // Add required extensions + if (!AddExtension(VK_KHR_SWAPCHAIN_EXTENSION_NAME, true) || + !AddExtension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME, true) || + !AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, true) || + !AddExtension(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME, true)) { return false; } - // Add more extensions in the future... - return true; } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 3576c0999..9e7812099 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -52,10 +52,16 @@ public: vk::UniqueDevice device; // Extensions and features - std::vector device_extensions; - vk::PhysicalDeviceFeatures device_features{}; - vk::PhysicalDeviceVulkan12Features new_features{}; + std::vector extensions; + vk::PhysicalDeviceFeatures2 features{}; vk::PhysicalDeviceLimits device_limits; + + // Features per vulkan version + vk::PhysicalDeviceFeatures vk_features{}; + vk::PhysicalDeviceVulkan12Features vk12_features{}; + vk::PhysicalDeviceVulkan13Features vk13_features{}; + vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state{}; + vk::PhysicalDeviceExtendedDynamicState2FeaturesEXT dynamic_state2{}; }; extern std::unique_ptr g_vk_instace; diff --git 
a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 30b3b6a7b..f09f4ce11 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -314,13 +314,6 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { uniform_block_data.dirty = true; } - bool need_duplicate_texture = false; - auto CheckBarrier = [&need_duplicate_texture, &color_surface](VKTexture* handle) { - if (color_surface && &color_surface->texture == handle) { - need_duplicate_texture = true; - } - }; - // Sync and bind the texture surfaces const auto pica_textures = regs.texturing.GetTextures(); for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { @@ -367,11 +360,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { vk::Extent2D(draw_rect.GetHeight(), draw_rect.GetHeight())); state.SetScissor(scissor); - // Draw the vertex batch - bool succeeded = true; - shader_program_manager->UseTrivialVertexShader(); - shader_program_manager->UseTrivialGeometryShader(); - shader_program_manager->ApplyTo(state); + // Apply pending state state.Apply(); std::size_t max_vertices = 3 * (VERTEX_BUFFER_SIZE / (3 * sizeof(HardwareVertex))); @@ -431,7 +420,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { depth_surface); } - return succeeded; + return true; } void RasterizerVulkan::NotifyPicaRegisterChanged(u32 id) { @@ -1141,11 +1130,11 @@ bool RasterizerVulkan::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con void RasterizerVulkan::SetShader() { - shader_program_manager->UseFragmentShader(Pica::g_state.regs); + state.SetFragmentShader(Pica::g_state.regs); } void RasterizerVulkan::SyncClipEnabled() { - state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0; + //state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0; } void RasterizerVulkan::SyncClipCoef() { diff --git 
a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4483799ad..30f51a582 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -31,7 +31,6 @@ class EmuWindow; } namespace Vulkan { -class ShaderProgramManager; enum class UniformBindings : u32 { Common, VS, GS }; @@ -112,6 +111,65 @@ static_assert(sizeof(VSUniformData) < 16384, struct ScreenInfo; +struct VertexBase { + VertexBase() = default; + VertexBase(const Pica::Shader::OutputVertex& v, bool flip_quaternion) { + position[0] = v.pos.x.ToFloat32(); + position[1] = v.pos.y.ToFloat32(); + position[2] = v.pos.z.ToFloat32(); + position[3] = v.pos.w.ToFloat32(); + color[0] = v.color.x.ToFloat32(); + color[1] = v.color.y.ToFloat32(); + color[2] = v.color.z.ToFloat32(); + color[3] = v.color.w.ToFloat32(); + tex_coord0[0] = v.tc0.x.ToFloat32(); + tex_coord0[1] = v.tc0.y.ToFloat32(); + tex_coord1[0] = v.tc1.x.ToFloat32(); + tex_coord1[1] = v.tc1.y.ToFloat32(); + tex_coord2[0] = v.tc2.x.ToFloat32(); + tex_coord2[1] = v.tc2.y.ToFloat32(); + tex_coord0_w = v.tc0_w.ToFloat32(); + normquat[0] = v.quat.x.ToFloat32(); + normquat[1] = v.quat.y.ToFloat32(); + normquat[2] = v.quat.z.ToFloat32(); + normquat[3] = v.quat.w.ToFloat32(); + view[0] = v.view.x.ToFloat32(); + view[1] = v.view.y.ToFloat32(); + view[2] = v.view.z.ToFloat32(); + + if (flip_quaternion) { + normquat = -normquat; + } + } + + glm::vec4 position; + glm::vec4 color; + glm::vec2 tex_coord0; + glm::vec2 tex_coord1; + glm::vec2 tex_coord2; + float tex_coord0_w; + glm::vec4 normquat; + glm::vec3 view; +}; + +/// Structure that the hardware rendered vertices are composed of +struct HardwareVertex : public VertexBase { + HardwareVertex() = default; + HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) : VertexBase(v, flip_quaternion) {}; + static constexpr auto binding_desc = vk::VertexInputBindingDescription(0, sizeof(VertexBase)); + 
static constexpr std::array attribute_desc = + { + vk::VertexInputAttributeDescription(0, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, position)), + vk::VertexInputAttributeDescription(1, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, color)), + vk::VertexInputAttributeDescription(2, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord0)), + vk::VertexInputAttributeDescription(3, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord1)), + vk::VertexInputAttributeDescription(4, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord2)), + vk::VertexInputAttributeDescription(5, 0, vk::Format::eR32Sfloat, offsetof(VertexBase, tex_coord0_w)), + vk::VertexInputAttributeDescription(6, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, normquat)), + vk::VertexInputAttributeDescription(7, 0, vk::Format::eR32G32B32Sfloat, offsetof(VertexBase, view)), + }; +}; + class RasterizerVulkan : public VideoCore::RasterizerInterface { public: explicit RasterizerVulkan(Frontend::EmuWindow& emu_window); @@ -140,68 +198,6 @@ public: void SyncEntireState() override; private: - struct VertexBase - { - VertexBase() = default; - VertexBase(const Pica::Shader::OutputVertex& v, bool flip_quaternion) { - position[0] = v.pos.x.ToFloat32(); - position[1] = v.pos.y.ToFloat32(); - position[2] = v.pos.z.ToFloat32(); - position[3] = v.pos.w.ToFloat32(); - color[0] = v.color.x.ToFloat32(); - color[1] = v.color.y.ToFloat32(); - color[2] = v.color.z.ToFloat32(); - color[3] = v.color.w.ToFloat32(); - tex_coord0[0] = v.tc0.x.ToFloat32(); - tex_coord0[1] = v.tc0.y.ToFloat32(); - tex_coord1[0] = v.tc1.x.ToFloat32(); - tex_coord1[1] = v.tc1.y.ToFloat32(); - tex_coord2[0] = v.tc2.x.ToFloat32(); - tex_coord2[1] = v.tc2.y.ToFloat32(); - tex_coord0_w = v.tc0_w.ToFloat32(); - normquat[0] = v.quat.x.ToFloat32(); - normquat[1] = v.quat.y.ToFloat32(); - normquat[2] = v.quat.z.ToFloat32(); - normquat[3] = v.quat.w.ToFloat32(); - view[0] = v.view.x.ToFloat32(); - 
view[1] = v.view.y.ToFloat32(); - view[2] = v.view.z.ToFloat32(); - - if (flip_quaternion) { - normquat = -normquat; - } - } - - glm::vec4 position; - glm::vec4 color; - glm::vec2 tex_coord0; - glm::vec2 tex_coord1; - glm::vec2 tex_coord2; - float tex_coord0_w; - glm::vec4 normquat; - glm::vec3 view; - }; - - /// Structure that the hardware rendered vertices are composed of - struct HardwareVertex : public VertexBase - { - HardwareVertex() = default; - HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) : VertexBase(v, flip_quaternion) {}; - static constexpr auto binding_desc = vk::VertexInputBindingDescription(0, sizeof(VertexBase)); - static constexpr std::array attribute_desc = - { - vk::VertexInputAttributeDescription(0, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, position)), - vk::VertexInputAttributeDescription(1, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, color)), - vk::VertexInputAttributeDescription(2, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord0)), - vk::VertexInputAttributeDescription(3, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord1)), - vk::VertexInputAttributeDescription(4, 0, vk::Format::eR32G32Sfloat, offsetof(VertexBase, tex_coord2)), - vk::VertexInputAttributeDescription(5, 0, vk::Format::eR32Sfloat, offsetof(VertexBase, tex_coord0_w)), - vk::VertexInputAttributeDescription(6, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(VertexBase, normquat)), - vk::VertexInputAttributeDescription(7, 0, vk::Format::eR32G32B32Sfloat, offsetof(VertexBase, view)), - }; - }; - - /// Syncs the clip enabled status to match the PICA register void SyncClipEnabled(); @@ -337,8 +333,6 @@ private: bool dirty; } uniform_block_data = {}; - std::unique_ptr shader_program_manager; - // They shall be big enough for about one frame. 
static constexpr std::size_t VERTEX_BUFFER_SIZE = 16 * 1024 * 1024; static constexpr std::size_t INDEX_BUFFER_SIZE = 1 * 1024 * 1024; diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.cpp b/src/video_core/renderer_vulkan/vk_shader_gen.cpp new file mode 100644 index 000000000..bf929b70b --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_gen.cpp @@ -0,0 +1,1616 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include +#include +#include +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/bit_set.h" +#include "common/logging/log.h" +#include "core/core.h" +#include "video_core/regs_framebuffer.h" +#include "video_core/regs_lighting.h" +#include "video_core/regs_rasterizer.h" +#include "video_core/regs_texturing.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_opengl/gl_shader_decompiler.h" +#include "video_core/renderer_vulkan/vk_shader_gen.h" +#include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/video_core.h" + +using Pica::FramebufferRegs; +using Pica::LightingRegs; +using Pica::RasterizerRegs; +using Pica::TexturingRegs; +using TevStageConfig = TexturingRegs::TevStageConfig; +using VSOutputAttributes = RasterizerRegs::VSOutputAttributes; + +namespace Vulkan { + +constexpr std::string_view UniformBlockDef = R"( +#define NUM_TEV_STAGES 6 +#define NUM_LIGHTS 8 +#define NUM_LIGHTING_SAMPLERS 24 + +struct LightSrc { + vec3 specular_0; + vec3 specular_1; + vec3 diffuse; + vec3 ambient; + vec3 position; + vec3 spot_direction; + float dist_atten_bias; + float dist_atten_scale; +}; + +layout (std140) uniform shader_data { + int framebuffer_scale; + int alphatest_ref; + float depth_scale; + float depth_offset; + float shadow_bias_constant; + float shadow_bias_linear; + int scissor_x1; + int scissor_y1; + int scissor_x2; + int scissor_y2; + int fog_lut_offset; + int 
proctex_noise_lut_offset; + int proctex_color_map_offset; + int proctex_alpha_map_offset; + int proctex_lut_offset; + int proctex_diff_lut_offset; + float proctex_bias; + int shadow_texture_bias; + ivec4 lighting_lut_offset[NUM_LIGHTING_SAMPLERS / 4]; + vec3 fog_color; + vec2 proctex_noise_f; + vec2 proctex_noise_a; + vec2 proctex_noise_p; + vec3 lighting_global_ambient; + LightSrc light_src[NUM_LIGHTS]; + vec4 const_color[NUM_TEV_STAGES]; + vec4 tev_combiner_buffer_color; + vec4 clip_coef; +}; +)"; + +static std::string GetVertexInterfaceDeclaration(bool is_output, bool separable_shader) { + std::string out; + + const auto append_variable = [&](std::string_view var, int location) { + if (separable_shader) { + out += fmt::format("layout (location={}) ", location); + } + out += fmt::format("{}{};\n", is_output ? "out " : "in ", var); + }; + + append_variable("vec4 primary_color", ATTRIBUTE_COLOR); + append_variable("vec2 texcoord0", ATTRIBUTE_TEXCOORD0); + append_variable("vec2 texcoord1", ATTRIBUTE_TEXCOORD1); + append_variable("vec2 texcoord2", ATTRIBUTE_TEXCOORD2); + append_variable("float texcoord0_w", ATTRIBUTE_TEXCOORD0_W); + append_variable("vec4 normquat", ATTRIBUTE_NORMQUAT); + append_variable("vec3 view", ATTRIBUTE_VIEW); + + if (is_output && separable_shader) { + // gl_PerVertex redeclaration is required for separate shader object + out += R"( + out gl_PerVertex { + vec4 gl_Position; + float gl_ClipDistance[2]; + }; + )"; + } + + return out; +} + +PicaFSConfig PicaFSConfig::BuildFromRegs(const Pica::Regs& regs) { + PicaFSConfig res{}; + + auto& state = res.state; + + state.scissor_test_mode = regs.rasterizer.scissor_test.mode; + + state.depthmap_enable = regs.rasterizer.depthmap_enable; + + state.alpha_test_func = regs.framebuffer.output_merger.alpha_test.enable + ? 
regs.framebuffer.output_merger.alpha_test.func.Value() + : FramebufferRegs::CompareFunc::Always; + + state.texture0_type = regs.texturing.texture0.type; + + state.texture2_use_coord1 = regs.texturing.main_config.texture2_use_coord1 != 0; + + // We don't need these otherwise, reset them to avoid unnecessary shader generation + state.alphablend_enable = {}; + state.logic_op = {}; + + // Copy relevant tev stages fields. + // We don't sync const_color here because of the high variance, it is a + // shader uniform instead. + const auto& tev_stages = regs.texturing.GetTevStages(); + DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size()); + for (std::size_t i = 0; i < tev_stages.size(); i++) { + const auto& tev_stage = tev_stages[i]; + state.tev_stages[i].sources_raw = tev_stage.sources_raw; + state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw; + state.tev_stages[i].ops_raw = tev_stage.ops_raw; + state.tev_stages[i].scales_raw = tev_stage.scales_raw; + } + + state.fog_mode = regs.texturing.fog_mode; + state.fog_flip = regs.texturing.fog_flip != 0; + + state.combiner_buffer_input = regs.texturing.tev_combiner_buffer_input.update_mask_rgb.Value() | + regs.texturing.tev_combiner_buffer_input.update_mask_a.Value() + << 4; + + // Fragment lighting + + state.lighting.enable = !regs.lighting.disable; + state.lighting.src_num = regs.lighting.max_light_index + 1; + + for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) { + unsigned num = regs.lighting.light_enable.GetNum(light_index); + const auto& light = regs.lighting.light[num]; + state.lighting.light[light_index].num = num; + state.lighting.light[light_index].directional = light.config.directional != 0; + state.lighting.light[light_index].two_sided_diffuse = light.config.two_sided_diffuse != 0; + state.lighting.light[light_index].geometric_factor_0 = light.config.geometric_factor_0 != 0; + state.lighting.light[light_index].geometric_factor_1 = light.config.geometric_factor_1 != 0; + 
state.lighting.light[light_index].dist_atten_enable = + !regs.lighting.IsDistAttenDisabled(num); + state.lighting.light[light_index].spot_atten_enable = + !regs.lighting.IsSpotAttenDisabled(num); + state.lighting.light[light_index].shadow_enable = !regs.lighting.IsShadowDisabled(num); + } + + state.lighting.lut_d0.enable = regs.lighting.config1.disable_lut_d0 == 0; + state.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0; + state.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value(); + state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0); + + state.lighting.lut_d1.enable = regs.lighting.config1.disable_lut_d1 == 0; + state.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0; + state.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value(); + state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1); + + // this is a dummy field due to lack of the corresponding register + state.lighting.lut_sp.enable = true; + state.lighting.lut_sp.abs_input = regs.lighting.abs_lut_input.disable_sp == 0; + state.lighting.lut_sp.type = regs.lighting.lut_input.sp.Value(); + state.lighting.lut_sp.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.sp); + + state.lighting.lut_fr.enable = regs.lighting.config1.disable_lut_fr == 0; + state.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0; + state.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value(); + state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr); + + state.lighting.lut_rr.enable = regs.lighting.config1.disable_lut_rr == 0; + state.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0; + state.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value(); + state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr); + + state.lighting.lut_rg.enable = regs.lighting.config1.disable_lut_rg == 0; 
+ state.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0; + state.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value(); + state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg); + + state.lighting.lut_rb.enable = regs.lighting.config1.disable_lut_rb == 0; + state.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0; + state.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value(); + state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb); + + state.lighting.config = regs.lighting.config0.config; + state.lighting.enable_primary_alpha = regs.lighting.config0.enable_primary_alpha; + state.lighting.enable_secondary_alpha = regs.lighting.config0.enable_secondary_alpha; + state.lighting.bump_mode = regs.lighting.config0.bump_mode; + state.lighting.bump_selector = regs.lighting.config0.bump_selector; + state.lighting.bump_renorm = regs.lighting.config0.disable_bump_renorm == 0; + state.lighting.clamp_highlights = regs.lighting.config0.clamp_highlights != 0; + + state.lighting.enable_shadow = regs.lighting.config0.enable_shadow != 0; + state.lighting.shadow_primary = regs.lighting.config0.shadow_primary != 0; + state.lighting.shadow_secondary = regs.lighting.config0.shadow_secondary != 0; + state.lighting.shadow_invert = regs.lighting.config0.shadow_invert != 0; + state.lighting.shadow_alpha = regs.lighting.config0.shadow_alpha != 0; + state.lighting.shadow_selector = regs.lighting.config0.shadow_selector; + + state.proctex.enable = regs.texturing.main_config.texture3_enable; + if (state.proctex.enable) { + state.proctex.coord = regs.texturing.main_config.texture3_coordinates; + state.proctex.u_clamp = regs.texturing.proctex.u_clamp; + state.proctex.v_clamp = regs.texturing.proctex.v_clamp; + state.proctex.color_combiner = regs.texturing.proctex.color_combiner; + state.proctex.alpha_combiner = regs.texturing.proctex.alpha_combiner; + 
state.proctex.separate_alpha = regs.texturing.proctex.separate_alpha; + state.proctex.noise_enable = regs.texturing.proctex.noise_enable; + state.proctex.u_shift = regs.texturing.proctex.u_shift; + state.proctex.v_shift = regs.texturing.proctex.v_shift; + state.proctex.lut_width = regs.texturing.proctex_lut.width; + state.proctex.lut_offset0 = regs.texturing.proctex_lut_offset.level0; + state.proctex.lut_offset1 = regs.texturing.proctex_lut_offset.level1; + state.proctex.lut_offset2 = regs.texturing.proctex_lut_offset.level2; + state.proctex.lut_offset3 = regs.texturing.proctex_lut_offset.level3; + state.proctex.lod_min = regs.texturing.proctex_lut.lod_min; + state.proctex.lod_max = regs.texturing.proctex_lut.lod_max; + state.proctex.lut_filter = regs.texturing.proctex_lut.filter; + } + + state.shadow_rendering = regs.framebuffer.output_merger.fragment_operation_mode == + FramebufferRegs::FragmentOperationMode::Shadow; + + state.shadow_texture_orthographic = regs.texturing.shadow.orthographic != 0; + + return res; +} + +void PicaShaderConfigCommon::Init(const Pica::ShaderRegs& regs, Pica::Shader::ShaderSetup& setup) { + program_hash = setup.GetProgramCodeHash(); + swizzle_hash = setup.GetSwizzleDataHash(); + main_offset = regs.main_offset; + sanitize_mul = VideoCore::g_hw_shader_accurate_mul; + + num_outputs = 0; + output_map.fill(16); + + for (int reg : Common::BitSet(regs.output_mask)) { + output_map[reg] = num_outputs++; + } +} + +void PicaGSConfigCommonRaw::Init(const Pica::Regs& regs) { + vs_output_attributes = Common::BitSet(regs.vs.output_mask).Count(); + gs_output_attributes = vs_output_attributes; + + semantic_maps.fill({16, 0}); + for (u32 attrib = 0; attrib < regs.rasterizer.vs_output_total; ++attrib) { + const std::array semantics{ + regs.rasterizer.vs_output_attributes[attrib].map_x.Value(), + regs.rasterizer.vs_output_attributes[attrib].map_y.Value(), + regs.rasterizer.vs_output_attributes[attrib].map_z.Value(), + 
regs.rasterizer.vs_output_attributes[attrib].map_w.Value(), + }; + for (u32 comp = 0; comp < 4; ++comp) { + const auto semantic = semantics[comp]; + if (static_cast(semantic) < 24) { + semantic_maps[static_cast(semantic)] = {attrib, comp}; + } else if (semantic != VSOutputAttributes::INVALID) { + LOG_ERROR(Render_OpenGL, "Invalid/unknown semantic id: {}", semantic); + } + } + } +} + +/// Detects if a TEV stage is configured to be skipped (to avoid generating unnecessary code) +static bool IsPassThroughTevStage(const TevStageConfig& stage) { + return (stage.color_op == TevStageConfig::Operation::Replace && + stage.alpha_op == TevStageConfig::Operation::Replace && + stage.color_source1 == TevStageConfig::Source::Previous && + stage.alpha_source1 == TevStageConfig::Source::Previous && + stage.color_modifier1 == TevStageConfig::ColorModifier::SourceColor && + stage.alpha_modifier1 == TevStageConfig::AlphaModifier::SourceAlpha && + stage.GetColorMultiplier() == 1 && stage.GetAlphaMultiplier() == 1); +} + +static std::string SampleTexture(const PicaFSConfig& config, unsigned texture_unit) { + const auto& state = config.state; + switch (texture_unit) { + case 0: + // Only unit 0 respects the texturing type + switch (state.texture0_type) { + case TexturingRegs::TextureConfig::Texture2D: + return "textureLod(tex0, texcoord0, getLod(texcoord0 * vec2(textureSize(tex0, 0))))"; + case TexturingRegs::TextureConfig::Projection2D: + // TODO (wwylele): find the exact LOD formula for projection texture + return "textureProj(tex0, vec3(texcoord0, texcoord0_w))"; + case TexturingRegs::TextureConfig::TextureCube: + return "texture(tex_cube, vec3(texcoord0, texcoord0_w))"; + case TexturingRegs::TextureConfig::Shadow2D: + return "shadowTexture(texcoord0, texcoord0_w)"; + case TexturingRegs::TextureConfig::ShadowCube: + return "shadowTextureCube(texcoord0, texcoord0_w)"; + case TexturingRegs::TextureConfig::Disabled: + return "vec4(0.0)"; + default: + LOG_CRITICAL(HW_GPU, "Unhandled 
texture type {:x}", state.texture0_type); + UNIMPLEMENTED(); + return "texture(tex0, texcoord0)"; + } + case 1: + return "textureLod(tex1, texcoord1, getLod(texcoord1 * vec2(textureSize(tex1, 0))))"; + case 2: + if (state.texture2_use_coord1) + return "textureLod(tex2, texcoord1, getLod(texcoord1 * vec2(textureSize(tex2, 0))))"; + else + return "textureLod(tex2, texcoord2, getLod(texcoord2 * vec2(textureSize(tex2, 0))))"; + case 3: + if (state.proctex.enable) { + return "ProcTex()"; + } else { + LOG_DEBUG(Render_OpenGL, "Using Texture3 without enabling it"); + return "vec4(0.0)"; + } + default: + UNREACHABLE(); + return ""; + } +} + +/// Writes the specified TEV stage source component(s) +static void AppendSource(std::string& out, const PicaFSConfig& config, + TevStageConfig::Source source, std::string_view index_name) { + using Source = TevStageConfig::Source; + switch (source) { + case Source::PrimaryColor: + out += "rounded_primary_color"; + break; + case Source::PrimaryFragmentColor: + out += "primary_fragment_color"; + break; + case Source::SecondaryFragmentColor: + out += "secondary_fragment_color"; + break; + case Source::Texture0: + out += SampleTexture(config, 0); + break; + case Source::Texture1: + out += SampleTexture(config, 1); + break; + case Source::Texture2: + out += SampleTexture(config, 2); + break; + case Source::Texture3: + out += SampleTexture(config, 3); + break; + case Source::PreviousBuffer: + out += "combiner_buffer"; + break; + case Source::Constant: + out += "const_color["; + out += index_name; + out += ']'; + break; + case Source::Previous: + out += "last_tex_env_out"; + break; + default: + out += "vec4(0.0)"; + LOG_CRITICAL(Render_OpenGL, "Unknown source op {}", source); + break; + } +} + +/// Writes the color components to use for the specified TEV stage color modifier +static void AppendColorModifier(std::string& out, const PicaFSConfig& config, + TevStageConfig::ColorModifier modifier, + TevStageConfig::Source source, 
std::string_view index_name) { + using ColorModifier = TevStageConfig::ColorModifier; + switch (modifier) { + case ColorModifier::SourceColor: + AppendSource(out, config, source, index_name); + out += ".rgb"; + break; + case ColorModifier::OneMinusSourceColor: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".rgb"; + break; + case ColorModifier::SourceAlpha: + AppendSource(out, config, source, index_name); + out += ".aaa"; + break; + case ColorModifier::OneMinusSourceAlpha: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".aaa"; + break; + case ColorModifier::SourceRed: + AppendSource(out, config, source, index_name); + out += ".rrr"; + break; + case ColorModifier::OneMinusSourceRed: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".rrr"; + break; + case ColorModifier::SourceGreen: + AppendSource(out, config, source, index_name); + out += ".ggg"; + break; + case ColorModifier::OneMinusSourceGreen: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".ggg"; + break; + case ColorModifier::SourceBlue: + AppendSource(out, config, source, index_name); + out += ".bbb"; + break; + case ColorModifier::OneMinusSourceBlue: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".bbb"; + break; + default: + out += "vec3(0.0)"; + LOG_CRITICAL(Render_OpenGL, "Unknown color modifier op {}", modifier); + break; + } +}
+
+/// Writes the alpha component to use for the specified TEV stage alpha modifier: one
+/// channel of `source`, optionally inverted as `1.0 - channel`.
+///
+/// @param out        Shader source being built; the expression is appended to it.
+/// @param config     Fragment shader configuration, forwarded to AppendSource.
+/// @param modifier   Selects which channel is read and whether it is inverted.
+/// @param source     TEV input the channel is read from.
+/// @param index_name Textual TEV stage index, forwarded to AppendSource. Taken as a
+///                   std::string_view to match AppendSource and AppendColorModifier.
+static void AppendAlphaModifier(std::string& out, const PicaFSConfig& config,
+                                TevStageConfig::AlphaModifier modifier,
+                                TevStageConfig::Source source, std::string_view index_name) {
+    using AlphaModifier = TevStageConfig::AlphaModifier;
+
+    // Every known modifier is "<optional 1.0 - ><source><.channel>", so decode the two
+    // varying parts here and emit the expression once below.
+    bool invert = false;
+    std::string_view channel;
+    switch (modifier) {
+    case AlphaModifier::OneMinusSourceAlpha:
+        invert = true;
+        [[fallthrough]];
+    case AlphaModifier::SourceAlpha:
+        channel = ".a";
+        break;
+    case AlphaModifier::OneMinusSourceRed:
+        invert = true;
+        [[fallthrough]];
+    case AlphaModifier::SourceRed:
+        channel = ".r";
+        break;
+    case AlphaModifier::OneMinusSourceGreen:
+        invert = true;
+        [[fallthrough]];
+    case AlphaModifier::SourceGreen:
+        channel = ".g";
+        break;
+    case AlphaModifier::OneMinusSourceBlue:
+        invert = true;
+        [[fallthrough]];
+    case AlphaModifier::SourceBlue:
+        channel = ".b";
+        break;
+    default:
+        // Unknown modifier: report it and emit a constant instead of a source read.
+        out += "0.0";
+        LOG_CRITICAL(Render_OpenGL, "Unknown alpha modifier op {}", modifier);
+        return;
+    }
+
+    if (invert) {
+        out += "1.0 - ";
+    }
+    AppendSource(out, config, source, index_name);
+    out += channel;
+}
+
+/// Writes the combiner function for the color components for the specified TEV stage operation +static void AppendColorCombiner(std::string& out, TevStageConfig::Operation operation, + std::string_view variable_name) { + out += "clamp("; + using Operation = TevStageConfig::Operation; + switch (operation) { + case Operation::Replace: + out += fmt::format("{}[0]", variable_name); + break; + case Operation::Modulate: + out += fmt::format("{0}[0] * {0}[1]", variable_name); + break; + case Operation::Add: + out += fmt::format("{0}[0] + {0}[1]", variable_name); + break; + case Operation::AddSigned: + out += fmt::format("{0}[0] + {0}[1] - vec3(0.5)", variable_name); + break; + case Operation::Lerp: + out += fmt::format("{0}[0] * {0}[2] + {0}[1] * (vec3(1.0) - {0}[2])", variable_name); + break; + case Operation::Subtract: + out += fmt::format("{0}[0] - {0}[1]", variable_name); + break; + case Operation::MultiplyThenAdd: + out += fmt::format("{0}[0] * {0}[1] + {0}[2]", variable_name); + break; + case Operation::AddThenMultiply: + out += fmt::format("min({0}[0] +
{0}[1], vec3(1.0)) * {0}[2]", variable_name); + break; + case Operation::Dot3_RGB: + case Operation::Dot3_RGBA: + out += + fmt::format("vec3(dot({0}[0] - vec3(0.5), {0}[1] - vec3(0.5)) * 4.0)", variable_name); + break; + default: + out += "vec3(0.0)"; + LOG_CRITICAL(Render_OpenGL, "Unknown color combiner operation: {}", operation); + break; + } + out += ", vec3(0.0), vec3(1.0))"; // Clamp result to 0.0, 1.0 +} + +/// Writes the combiner function for the alpha component for the specified TEV stage operation +static void AppendAlphaCombiner(std::string& out, TevStageConfig::Operation operation, + std::string_view variable_name) { + out += "clamp("; + using Operation = TevStageConfig::Operation; + switch (operation) { + case Operation::Replace: + out += fmt::format("{}[0]", variable_name); + break; + case Operation::Modulate: + out += fmt::format("{0}[0] * {0}[1]", variable_name); + break; + case Operation::Add: + out += fmt::format("{0}[0] + {0}[1]", variable_name); + break; + case Operation::AddSigned: + out += fmt::format("{0}[0] + {0}[1] - 0.5", variable_name); + break; + case Operation::Lerp: + out += fmt::format("{0}[0] * {0}[2] + {0}[1] * (1.0 - {0}[2])", variable_name); + break; + case Operation::Subtract: + out += fmt::format("{0}[0] - {0}[1]", variable_name); + break; + case Operation::MultiplyThenAdd: + out += fmt::format("{0}[0] * {0}[1] + {0}[2]", variable_name); + break; + case Operation::AddThenMultiply: + out += fmt::format("min({0}[0] + {0}[1], 1.0) * {0}[2]", variable_name); + break; + default: + out += "0.0"; + LOG_CRITICAL(Render_OpenGL, "Unknown alpha combiner operation: {}", operation); + break; + } + out += ", 0.0, 1.0)"; +} + +/// Writes the if-statement condition used to evaluate alpha testing +static void AppendAlphaTestCondition(std::string& out, FramebufferRegs::CompareFunc func) { + using CompareFunc = FramebufferRegs::CompareFunc; + switch (func) { + case CompareFunc::Never: + out += "true"; + break; + case CompareFunc::Always: + out += 
"false"; + break; + case CompareFunc::Equal: + case CompareFunc::NotEqual: + case CompareFunc::LessThan: + case CompareFunc::LessThanOrEqual: + case CompareFunc::GreaterThan: + case CompareFunc::GreaterThanOrEqual: { + static constexpr std::array op{"!=", "==", ">=", ">", "<=", "<"}; + const auto index = static_cast(func) - static_cast(CompareFunc::Equal); + out += fmt::format("int(last_tex_env_out.a * 255.0) {} alphatest_ref", op[index]); + break; + } + + default: + out += "false"; + LOG_CRITICAL(Render_OpenGL, "Unknown alpha test condition {}", func); + break; + } +} + +/// Writes the code to emulate the specified TEV stage +static void WriteTevStage(std::string& out, const PicaFSConfig& config, unsigned index) { + const auto stage = + static_cast(config.state.tev_stages[index]); + if (!IsPassThroughTevStage(stage)) { + const std::string index_name = std::to_string(index); + + out += fmt::format("vec3 color_results_{}_1 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name); + out += fmt::format(";\nvec3 color_results_{}_2 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name); + out += fmt::format(";\nvec3 color_results_{}_3 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name); + out += fmt::format(";\nvec3 color_results_{}[3] = vec3[3](color_results_{}_1, " + "color_results_{}_2, color_results_{}_3);\n", + index_name, index_name, index_name, index_name); + + // Round the output of each TEV stage to maintain the PICA's 8 bits of precision + out += fmt::format("vec3 color_output_{} = byteround(", index_name); + AppendColorCombiner(out, stage.color_op, "color_results_" + index_name); + out += ");\n"; + + if (stage.color_op == TevStageConfig::Operation::Dot3_RGBA) { + // result of Dot3_RGBA operation is also placed to the alpha component + out += fmt::format("float alpha_output_{0} = 
color_output_{0}[0];\n", index_name); + } else { + out += fmt::format("float alpha_results_{}[3] = float[3](", index_name); + AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, + index_name); + out += ", "; + AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, + index_name); + out += ", "; + AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, + index_name); + out += ");\n"; + + out += fmt::format("float alpha_output_{} = byteround(", index_name); + AppendAlphaCombiner(out, stage.alpha_op, "alpha_results_" + index_name); + out += ");\n"; + } + + out += fmt::format("last_tex_env_out = vec4(" + "clamp(color_output_{} * {}.0, vec3(0.0), vec3(1.0)), " + "clamp(alpha_output_{} * {}.0, 0.0, 1.0));\n", + index_name, stage.GetColorMultiplier(), index_name, + stage.GetAlphaMultiplier()); + } + + out += "combiner_buffer = next_combiner_buffer;\n"; + + if (config.TevStageUpdatesCombinerBufferColor(index)) + out += "next_combiner_buffer.rgb = last_tex_env_out.rgb;\n"; + + if (config.TevStageUpdatesCombinerBufferAlpha(index)) + out += "next_combiner_buffer.a = last_tex_env_out.a;\n"; +} + +/// Writes the code to emulate fragment lighting +static void WriteLighting(std::string& out, const PicaFSConfig& config) { + const auto& lighting = config.state.lighting; + + // Define lighting globals + out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n" + "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n" + "vec3 light_vector = vec3(0.0);\n" + "vec3 refl_value = vec3(0.0);\n" + "vec3 spot_dir = vec3(0.0);\n" + "vec3 half_vector = vec3(0.0);\n" + "float dot_product = 0.0;\n" + "float clamp_highlights = 1.0;\n" + "float geo_factor = 1.0;\n"; + + // Compute fragment normals and tangents + const auto Perturbation = [&] { + return fmt::format("2.0 * ({}).rgb - 1.0", SampleTexture(config, lighting.bump_selector)); + }; + if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) { + // Bump mapping is enabled 
using a normal map + out += fmt::format("vec3 surface_normal = {};\n", Perturbation()); + + // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher + // precision result + if (lighting.bump_renorm) { + constexpr std::string_view val = + "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))"; + out += fmt::format("surface_normal.z = sqrt(max({}, 0.0));\n", val); + } + + // The tangent vector is not perturbed by the normal map and is just a unit vector. + out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n"; + } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) { + // Bump mapping is enabled using a tangent map + out += fmt::format("vec3 surface_tangent = {};\n", Perturbation()); + // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant + // computation below, which is also confirmed on 3DS. So we don't bother recomputing here + // even if 'renorm' is enabled. + + // The normal vector is not perturbed by the tangent map and is just a unit vector. + out += "vec3 surface_normal = vec3(0.0, 0.0, 1.0);\n"; + } else { + // No bump mapping - surface local normal and tangent are just unit vectors + out += "vec3 surface_normal = vec3(0.0, 0.0, 1.0);\n" + "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n"; + } + + // Rotate the surface-local normal by the interpolated normal quaternion to convert it to + // eyespace. 
+ out += "vec4 normalized_normquat = normalize(normquat);\n" + "vec3 normal = quaternion_rotate(normalized_normquat, surface_normal);\n" + "vec3 tangent = quaternion_rotate(normalized_normquat, surface_tangent);\n"; + + if (lighting.enable_shadow) { + std::string shadow_texture = SampleTexture(config, lighting.shadow_selector); + if (lighting.shadow_invert) { + out += fmt::format("vec4 shadow = vec4(1.0) - {};\n", shadow_texture); + } else { + out += fmt::format("vec4 shadow = {};\n", shadow_texture); + } + } else { + out += "vec4 shadow = vec4(1.0);\n"; + } + + // Samples the specified lookup table for specular lighting + auto GetLutValue = [&lighting](LightingRegs::LightingSampler sampler, unsigned light_num, + LightingRegs::LightingLutInput input, bool abs) { + std::string index; + switch (input) { + case LightingRegs::LightingLutInput::NH: + index = "dot(normal, normalize(half_vector))"; + break; + + case LightingRegs::LightingLutInput::VH: + index = "dot(normalize(view), normalize(half_vector))"; + break; + + case LightingRegs::LightingLutInput::NV: + index = "dot(normal, normalize(view))"; + break; + + case LightingRegs::LightingLutInput::LN: + index = "dot(light_vector, normal)"; + break; + + case LightingRegs::LightingLutInput::SP: + index = "dot(light_vector, spot_dir)"; + break; + + case LightingRegs::LightingLutInput::CP: + // CP input is only available with configuration 7 + if (lighting.config == LightingRegs::LightingConfig::Config7) { + // Note: even if the normal vector is modified by normal map, which is not the + // normal of the tangent plane anymore, the half angle vector is still projected + // using the modified normal vector. + constexpr std::string_view half_angle_proj = + "normalize(half_vector) - normal * dot(normal, normalize(half_vector))"; + // Note: the half angle vector projection is confirmed not normalized before the dot + // product. The result is in fact not cos(phi) as the name suggested. 
+ index = fmt::format("dot({}, tangent)", half_angle_proj); + } else { + index = "0.0"; + } + break; + + default: + LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input {}", (int)input); + UNIMPLEMENTED(); + index = "0.0"; + break; + } + + const auto sampler_index = static_cast(sampler); + + if (abs) { + // LUT index is in the range of (0.0, 1.0) + index = lighting.light[light_num].two_sided_diffuse + ? fmt::format("abs({})", index) + : fmt::format("max({}, 0.0)", index); + return fmt::format("LookupLightingLUTUnsigned({}, {})", sampler_index, index); + } else { + // LUT index is in the range of (-1.0, 1.0) + return fmt::format("LookupLightingLUTSigned({}, {})", sampler_index, index); + } + }; + + // Write the code to emulate each enabled light + for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) { + const auto& light_config = lighting.light[light_index]; + const std::string light_src = fmt::format("light_src[{}]", light_config.num); + + // Compute light vector (directional or positional) + if (light_config.directional) { + out += fmt::format("light_vector = normalize({}.position);\n", light_src); + } else { + out += fmt::format("light_vector = normalize({}.position + view);\n", light_src); + } + + out += fmt::format("spot_dir = {}.spot_direction;\n", light_src); + out += "half_vector = normalize(view) + light_vector;\n"; + + // Compute dot product of light_vector and normal, adjust if lighting is one-sided or + // two-sided + out += std::string("dot_product = ") + (light_config.two_sided_diffuse + ? 
"abs(dot(light_vector, normal));\n" + : "max(dot(light_vector, normal), 0.0);\n"); + + // If enabled, clamp specular component if lighting result is zero + if (lighting.clamp_highlights) { + out += "clamp_highlights = sign(dot_product);\n"; + } + + // If enabled, compute spot light attenuation value + std::string spot_atten = "1.0"; + if (light_config.spot_atten_enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::SpotlightAttenuation)) { + const std::string value = + GetLutValue(LightingRegs::SpotlightAttenuationSampler(light_config.num), + light_config.num, lighting.lut_sp.type, lighting.lut_sp.abs_input); + spot_atten = fmt::format("({:#} * {})", lighting.lut_sp.scale, value); + } + + // If enabled, compute distance attenuation value + std::string dist_atten = "1.0"; + if (light_config.dist_atten_enable) { + const std::string index = fmt::format("clamp({}.dist_atten_scale * length(-view - " + "{}.position) + {}.dist_atten_bias, 0.0, 1.0)", + light_src, light_src, light_src); + const auto sampler = LightingRegs::DistanceAttenuationSampler(light_config.num); + dist_atten = fmt::format("LookupLightingLUTUnsigned({}, {})", sampler, index); + } + + if (light_config.geometric_factor_0 || light_config.geometric_factor_1) { + out += "geo_factor = dot(half_vector, half_vector);\n" + "geo_factor = geo_factor == 0.0 ? 
0.0 : min(" + "dot_product / geo_factor, 1.0);\n"; + } + + // Specular 0 component + std::string d0_lut_value = "1.0"; + if (lighting.lut_d0.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution0)) { + // Lookup specular "distribution 0" LUT value + const std::string value = + GetLutValue(LightingRegs::LightingSampler::Distribution0, light_config.num, + lighting.lut_d0.type, lighting.lut_d0.abs_input); + d0_lut_value = fmt::format("({:#} * {})", lighting.lut_d0.scale, value); + } + std::string specular_0 = fmt::format("({} * {}.specular_0)", d0_lut_value, light_src); + if (light_config.geometric_factor_0) { + specular_0 = fmt::format("({} * geo_factor)", specular_0); + } + + // If enabled, lookup ReflectRed value, otherwise, 1.0 is used + if (lighting.lut_rr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectRed)) { + std::string value = + GetLutValue(LightingRegs::LightingSampler::ReflectRed, light_config.num, + lighting.lut_rr.type, lighting.lut_rr.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rr.scale, value); + out += fmt::format("refl_value.r = {};\n", value); + } else { + out += "refl_value.r = 1.0;\n"; + } + + // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used + if (lighting.lut_rg.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectGreen)) { + std::string value = + GetLutValue(LightingRegs::LightingSampler::ReflectGreen, light_config.num, + lighting.lut_rg.type, lighting.lut_rg.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rg.scale, value); + out += fmt::format("refl_value.g = {};\n", value); + } else { + out += "refl_value.g = refl_value.r;\n"; + } + + // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used + if (lighting.lut_rb.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + 
LightingRegs::LightingSampler::ReflectBlue)) { + std::string value = + GetLutValue(LightingRegs::LightingSampler::ReflectBlue, light_config.num, + lighting.lut_rb.type, lighting.lut_rb.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rb.scale, value); + out += fmt::format("refl_value.b = {};\n", value); + } else { + out += "refl_value.b = refl_value.r;\n"; + } + + // Specular 1 component + std::string d1_lut_value = "1.0"; + if (lighting.lut_d1.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution1)) { + // Lookup specular "distribution 1" LUT value + const std::string value = + GetLutValue(LightingRegs::LightingSampler::Distribution1, light_config.num, + lighting.lut_d1.type, lighting.lut_d1.abs_input); + d1_lut_value = fmt::format("({:#} * {})", lighting.lut_d1.scale, value); + } + std::string specular_1 = + fmt::format("({} * refl_value * {}.specular_1)", d1_lut_value, light_src); + if (light_config.geometric_factor_1) { + specular_1 = fmt::format("({} * geo_factor)", specular_1); + } + + // Fresnel + // Note: only the last entry in the light slots applies the Fresnel factor + if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::Fresnel)) { + // Lookup fresnel LUT value + std::string value = + GetLutValue(LightingRegs::LightingSampler::Fresnel, light_config.num, + lighting.lut_fr.type, lighting.lut_fr.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_fr.scale, value); + + // Enabled for diffuse lighting alpha component + if (lighting.enable_primary_alpha) { + out += fmt::format("diffuse_sum.a = {};\n", value); + } + + // Enabled for the specular lighting alpha component + if (lighting.enable_secondary_alpha) { + out += fmt::format("specular_sum.a = {};\n", value); + } + } + + bool shadow_primary_enable = lighting.shadow_primary && light_config.shadow_enable; + bool 
shadow_secondary_enable = lighting.shadow_secondary && light_config.shadow_enable; + std::string shadow_primary = shadow_primary_enable ? " * shadow.rgb" : ""; + std::string shadow_secondary = shadow_secondary_enable ? " * shadow.rgb" : ""; + + // Compute primary fragment color (diffuse lighting) function + out += fmt::format( + "diffuse_sum.rgb += (({}.diffuse * dot_product) + {}.ambient) * {} * {}{};\n", + light_src, light_src, dist_atten, spot_atten, shadow_primary); + + // Compute secondary fragment color (specular lighting) function + out += fmt::format("specular_sum.rgb += ({} + {}) * clamp_highlights * {} * {}{};\n", + specular_0, specular_1, dist_atten, spot_atten, shadow_secondary); + } + + // Apply shadow attenuation to alpha components if enabled + if (lighting.shadow_alpha) { + if (lighting.enable_primary_alpha) { + out += "diffuse_sum.a *= shadow.a;\n"; + } + if (lighting.enable_secondary_alpha) { + out += "specular_sum.a *= shadow.a;\n"; + } + } + + // Sum final lighting result + out += "diffuse_sum.rgb += lighting_global_ambient;\n" + "primary_fragment_color = clamp(diffuse_sum, vec4(0.0), vec4(1.0));\n" + "secondary_fragment_color = clamp(specular_sum, vec4(0.0), vec4(1.0));\n"; +} + +using ProcTexClamp = TexturingRegs::ProcTexClamp; +using ProcTexShift = TexturingRegs::ProcTexShift; +using ProcTexCombiner = TexturingRegs::ProcTexCombiner; +using ProcTexFilter = TexturingRegs::ProcTexFilter; + +static void AppendProcTexShiftOffset(std::string& out, std::string_view v, ProcTexShift mode, + ProcTexClamp clamp_mode) { + const std::string_view offset = (clamp_mode == ProcTexClamp::MirroredRepeat) ? 
"1.0" : "0.5"; + switch (mode) { + case ProcTexShift::None: + out += "0.0"; + break; + case ProcTexShift::Odd: + out += fmt::format("{} * float((int({}) / 2) % 2)", offset, v); + break; + case ProcTexShift::Even: + out += fmt::format("{} * float(((int({}) + 1) / 2) % 2)", offset, v); + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown shift mode {}", mode); + out += "0.0"; + break; + } +} + +static void AppendProcTexClamp(std::string& out, std::string_view var, ProcTexClamp mode) { + switch (mode) { + case ProcTexClamp::ToZero: + out += fmt::format("{0} = {0} > 1.0 ? 0 : {0};\n", var); + break; + case ProcTexClamp::ToEdge: + out += fmt::format("{0} = min({0}, 1.0);\n", var); + break; + case ProcTexClamp::SymmetricalRepeat: + out += fmt::format("{0} = fract({0});\n", var); + break; + case ProcTexClamp::MirroredRepeat: { + out += fmt::format("{0} = int({0}) % 2 == 0 ? fract({0}) : 1.0 - fract({0});\n", var); + break; + } + case ProcTexClamp::Pulse: + out += fmt::format("{0} = {0} > 0.5 ? 1.0 : 0.0;\n", var); + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown clamp mode {}", mode); + out += fmt::format("{0} = min({0}, 1.0);\n", var); + break; + } +} + +static void AppendProcTexCombineAndMap(std::string& out, ProcTexCombiner combiner, + std::string_view offset) { + const auto combined = [combiner]() -> std::string_view { + switch (combiner) { + case ProcTexCombiner::U: + return "u"; + case ProcTexCombiner::U2: + return "(u * u)"; + case TexturingRegs::ProcTexCombiner::V: + return "v"; + case TexturingRegs::ProcTexCombiner::V2: + return "(v * v)"; + case TexturingRegs::ProcTexCombiner::Add: + return "((u + v) * 0.5)"; + case TexturingRegs::ProcTexCombiner::Add2: + return "((u * u + v * v) * 0.5)"; + case TexturingRegs::ProcTexCombiner::SqrtAdd2: + return "min(sqrt(u * u + v * v), 1.0)"; + case TexturingRegs::ProcTexCombiner::Min: + return "min(u, v)"; + case TexturingRegs::ProcTexCombiner::Max: + return "max(u, v)"; + case TexturingRegs::ProcTexCombiner::RMax: + return 
"min(((u + v) * 0.5 + sqrt(u * u + v * v)) * 0.5, 1.0)"; + default: + LOG_CRITICAL(HW_GPU, "Unknown combiner {}", combiner); + return "0.0"; + } + }(); + + out += fmt::format("ProcTexLookupLUT({}, {})", offset, combined); +} + +static void AppendProcTexSampler(std::string& out, const PicaFSConfig& config) { + // LUT sampling uitlity + // For NoiseLUT/ColorMap/AlphaMap, coord=0.0 is lut[0], coord=127.0/128.0 is lut[127] and + // coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using + // value entries and difference entries. + out += R"( +float ProcTexLookupLUT(int offset, float coord) { + coord *= 128.0; + float index_i = clamp(floor(coord), 0.0, 127.0); + float index_f = coord - index_i; // fract() cannot be used here because 128.0 needs to be + // extracted as index_i = 127.0 and index_f = 1.0 + vec2 entry = texelFetch(texture_buffer_lut_rg, int(index_i) + offset).rg; + return clamp(entry.r + entry.g * index_f, 0.0, 1.0); +} + )"; + + // Noise utility + if (config.state.proctex.noise_enable) { + // See swrasterizer/proctex.cpp for more information about these functions + out += R"( +int ProcTexNoiseRand1D(int v) { + const int table[] = int[](0,4,10,8,4,9,7,12,5,15,13,14,11,15,2,11); + return ((v % 9 + 2) * 3 & 0xF) ^ table[(v / 9) & 0xF]; +} + +float ProcTexNoiseRand2D(vec2 point) { + const int table[] = int[](10,2,15,8,0,7,4,5,5,13,2,6,13,9,3,14); + int u2 = ProcTexNoiseRand1D(int(point.x)); + int v2 = ProcTexNoiseRand1D(int(point.y)); + v2 += ((u2 & 3) == 1) ? 
4 : 0; + v2 ^= (u2 & 1) * 6; + v2 += 10 + u2; + v2 &= 0xF; + v2 ^= table[u2]; + return -1.0 + float(v2) * 2.0/ 15.0; +} + +float ProcTexNoiseCoef(vec2 x) { + vec2 grid = 9.0 * proctex_noise_f * abs(x + proctex_noise_p); + vec2 point = floor(grid); + vec2 frac = grid - point; + + float g0 = ProcTexNoiseRand2D(point) * (frac.x + frac.y); + float g1 = ProcTexNoiseRand2D(point + vec2(1.0, 0.0)) * (frac.x + frac.y - 1.0); + float g2 = ProcTexNoiseRand2D(point + vec2(0.0, 1.0)) * (frac.x + frac.y - 1.0); + float g3 = ProcTexNoiseRand2D(point + vec2(1.0, 1.0)) * (frac.x + frac.y - 2.0); + + float x_noise = ProcTexLookupLUT(proctex_noise_lut_offset, frac.x); + float y_noise = ProcTexLookupLUT(proctex_noise_lut_offset, frac.y); + float x0 = mix(g0, g1, x_noise); + float x1 = mix(g2, g3, x_noise); + return mix(x0, x1, y_noise); +} + )"; + } + + out += "vec4 SampleProcTexColor(float lut_coord, int level) {\n"; + out += fmt::format("int lut_width = {} >> level;\n", config.state.proctex.lut_width); + // Offsets for level 4-7 seem to be hardcoded + out += fmt::format("int lut_offsets[8] = int[]({}, {}, {}, {}, 0xF0, 0xF8, 0xFC, 0xFE);\n", + config.state.proctex.lut_offset0, config.state.proctex.lut_offset1, + config.state.proctex.lut_offset2, config.state.proctex.lut_offset3); + out += "int lut_offset = lut_offsets[level];\n"; + // For the color lut, coord=0.0 is lut[offset] and coord=1.0 is lut[offset+width-1] + out += "lut_coord *= float(lut_width - 1);\n"; + + switch (config.state.proctex.lut_filter) { + case ProcTexFilter::Linear: + case ProcTexFilter::LinearMipmapLinear: + case ProcTexFilter::LinearMipmapNearest: + out += "int lut_index_i = int(lut_coord) + lut_offset;\n"; + out += "float lut_index_f = fract(lut_coord);\n"; + out += "return texelFetch(texture_buffer_lut_rgba, lut_index_i + " + "proctex_lut_offset) + " + "lut_index_f * " + "texelFetch(texture_buffer_lut_rgba, lut_index_i + proctex_diff_lut_offset);\n"; + break; + case ProcTexFilter::Nearest: + case 
ProcTexFilter::NearestMipmapLinear: + case ProcTexFilter::NearestMipmapNearest: + out += "lut_coord += float(lut_offset);\n"; + out += "return texelFetch(texture_buffer_lut_rgba, int(round(lut_coord)) + " + "proctex_lut_offset);\n"; + break; + } + + out += "}\n"; + + out += "vec4 ProcTex() {\n"; + if (config.state.proctex.coord < 3) { + out += fmt::format("vec2 uv = abs(texcoord{});\n", config.state.proctex.coord); + } else { + LOG_CRITICAL(Render_OpenGL, "Unexpected proctex.coord >= 3"); + out += "vec2 uv = abs(texcoord0);\n"; + } + + // This LOD formula is the same as the LOD upper limit defined in OpenGL. + // f(x, y) <= m_u + m_v + m_w + // (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail) + // Note: this is different from the one normal 2D textures use. + out += "vec2 duv = max(abs(dFdx(uv)), abs(dFdy(uv)));\n"; + // unlike normal texture, the bias is inside the log2 + out += fmt::format("float lod = log2(abs(float({}) * proctex_bias) * (duv.x + duv.y));\n", + config.state.proctex.lut_width); + out += "if (proctex_bias == 0.0) lod = 0.0;\n"; + out += fmt::format("lod = clamp(lod, {:#}, {:#});\n", + std::max(0.0f, static_cast(config.state.proctex.lod_min)), + std::min(7.0f, static_cast(config.state.proctex.lod_max))); + // Get shift offset before noise generation + out += "float u_shift = "; + AppendProcTexShiftOffset(out, "uv.y", config.state.proctex.u_shift, + config.state.proctex.u_clamp); + out += ";\n"; + out += "float v_shift = "; + AppendProcTexShiftOffset(out, "uv.x", config.state.proctex.v_shift, + config.state.proctex.v_clamp); + out += ";\n"; + + // Generate noise + if (config.state.proctex.noise_enable) { + out += "uv += proctex_noise_a * ProcTexNoiseCoef(uv);\n" + "uv = abs(uv);\n"; + } + + // Shift + out += "float u = uv.x + u_shift;\n" + "float v = uv.y + v_shift;\n"; + + // Clamp + AppendProcTexClamp(out, "u", config.state.proctex.u_clamp); + AppendProcTexClamp(out, "v", config.state.proctex.v_clamp); + + // Combine and map + out 
+= "float lut_coord = "; + AppendProcTexCombineAndMap(out, config.state.proctex.color_combiner, + "proctex_color_map_offset"); + out += ";\n"; + + switch (config.state.proctex.lut_filter) { + case ProcTexFilter::Linear: + case ProcTexFilter::Nearest: + out += "vec4 final_color = SampleProcTexColor(lut_coord, 0);\n"; + break; + case ProcTexFilter::NearestMipmapNearest: + case ProcTexFilter::LinearMipmapNearest: + out += "vec4 final_color = SampleProcTexColor(lut_coord, int(round(lod)));\n"; + break; + case ProcTexFilter::NearestMipmapLinear: + case ProcTexFilter::LinearMipmapLinear: + out += "int lod_i = int(lod);\n" + "float lod_f = fract(lod);\n" + "vec4 final_color = mix(SampleProcTexColor(lut_coord, lod_i), " + "SampleProcTexColor(lut_coord, lod_i + 1), lod_f);\n"; + break; + } + + if (config.state.proctex.separate_alpha) { + // Note: in separate alpha mode, the alpha channel skips the color LUT look up stage. It + // uses the output of CombineAndMap directly instead. + out += "float final_alpha = "; + AppendProcTexCombineAndMap(out, config.state.proctex.alpha_combiner, + "proctex_alpha_map_offset"); + out += ";\n"; + out += "return vec4(final_color.xyz, final_alpha);\n}\n"; + } else { + out += "return final_color;\n}\n"; + } +} + +ShaderDecompiler::ProgramResult GenerateFragmentShader(const PicaFSConfig& config) { + const auto& state = config.state; + std::string out; + + out += R"( + #extension GL_ARB_shader_image_load_store : enable + #extension GL_ARB_shader_image_size : enable + #define ALLOW_SHADOW 0 + )"; + out += "#extension GL_ARB_separate_shader_objects : enable\n"; + + out += GetVertexInterfaceDeclaration(false, true); + + out += R"( + #ifndef CITRA_GLES + in vec4 gl_FragCoord; + #endif // CITRA_GLES + + out vec4 color; + + layout(set = 1, binding = 0) uniform sampler2D tex0; + layout(set = 1, binding = 1) uniform sampler2D tex1; + layout(set = 1, binding = 2) uniform sampler2D tex2; + layout(set = 1, binding = 3) uniform samplerCube tex_cube; + 
layout(set = 2, binding = 0) uniform samplerBuffer texture_buffer_lut_lf; + layout(set = 2, binding = 1) uniform samplerBuffer texture_buffer_lut_rg; + layout(set = 2, binding = 2) uniform samplerBuffer texture_buffer_lut_rgba; + + #if ALLOW_SHADOW + layout(r32ui) uniform readonly uimage2D shadow_texture_px; + layout(r32ui) uniform readonly uimage2D shadow_texture_nx; + layout(r32ui) uniform readonly uimage2D shadow_texture_py; + layout(r32ui) uniform readonly uimage2D shadow_texture_ny; + layout(r32ui) uniform readonly uimage2D shadow_texture_pz; + layout(r32ui) uniform readonly uimage2D shadow_texture_nz; + layout(r32ui) uniform uimage2D shadow_buffer; + #endif + )"; + + out += UniformBlockDef; + + out += R"( + // Rotate the vector v by the quaternion q + vec3 quaternion_rotate(vec4 q, vec3 v) { + return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v); + } + + float LookupLightingLUT(int lut_index, int index, float delta) { + vec2 entry = texelFetch(texture_buffer_lut_lf, lighting_lut_offset[lut_index >> 2][lut_index & 3] + index).rg; + return entry.r + entry.g * delta; + } + + float LookupLightingLUTUnsigned(int lut_index, float pos) { + int index = clamp(int(pos * 256.0), 0, 255); + float delta = pos * 256.0 - float(index); + return LookupLightingLUT(lut_index, index, delta); + } + + float LookupLightingLUTSigned(int lut_index, float pos) { + int index = clamp(int(pos * 128.0), -128, 127); + float delta = pos * 128.0 - float(index); + if (index < 0) index += 256; + return LookupLightingLUT(lut_index, index, delta); + } + + float byteround(float x) { + return round(x * 255.0) * (1.0 / 255.0); + } + + vec2 byteround(vec2 x) { + return round(x * 255.0) * (1.0 / 255.0); + } + + vec3 byteround(vec3 x) { + return round(x * 255.0) * (1.0 / 255.0); + } + + vec4 byteround(vec4 x) { + return round(x * 255.0) * (1.0 / 255.0); + } + + // PICA's LOD formula for 2D textures. + // This LOD formula is the same as the LOD lower limit defined in OpenGL. 
+ // f(x, y) >= max{m_u, m_v, m_w} + // (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail) + float getLod(vec2 coord) { + vec2 d = max(abs(dFdx(coord)), abs(dFdy(coord))); + return log2(max(d.x, d.y)); + } + + #if ALLOW_SHADOW + + uvec2 DecodeShadow(uint pixel) { + return uvec2(pixel >> 8, pixel & 0xFFu); + } + + uint EncodeShadow(uvec2 pixel) { + return (pixel.x << 8) | pixel.y; + } + + float CompareShadow(uint pixel, uint z) { + uvec2 p = DecodeShadow(pixel); + return mix(float(p.y) * (1.0 / 255.0), 0.0, p.x <= z); + } + + float SampleShadow2D(ivec2 uv, uint z) { + if (any(bvec4( lessThan(uv, ivec2(0)), greaterThanEqual(uv, imageSize(shadow_texture_px)) ))) + return 1.0; + return CompareShadow(imageLoad(shadow_texture_px, uv).x, z); + } + + float mix2(vec4 s, vec2 a) { + vec2 t = mix(s.xy, s.zw, a.yy); + return mix(t.x, t.y, a.x); + } + + vec4 shadowTexture(vec2 uv, float w) { + )"; + + if (!config.state.shadow_texture_orthographic) { + out += "uv /= w;"; + } + + out += "uint z = uint(max(0, int(min(abs(w), 1.0) * float(0xFFFFFF)) - shadow_texture_bias));"; + out += R"( + vec2 coord = vec2(imageSize(shadow_texture_px)) * uv - vec2(0.5); + vec2 coord_floor = floor(coord); + vec2 f = coord - coord_floor; + ivec2 i = ivec2(coord_floor); + vec4 s = vec4( + SampleShadow2D(i , z), + SampleShadow2D(i + ivec2(1, 0), z), + SampleShadow2D(i + ivec2(0, 1), z), + SampleShadow2D(i + ivec2(1, 1), z)); + return vec4(mix2(s, f)); + } + + vec4 shadowTextureCube(vec2 uv, float w) { + ivec2 size = imageSize(shadow_texture_px); + vec3 c = vec3(uv, w); + vec3 a = abs(c); + if (a.x > a.y && a.x > a.z) { + w = a.x; + uv = -c.zy; + if (c.x < 0.0) uv.x = -uv.x; + } else if (a.y > a.z) { + w = a.y; + uv = c.xz; + if (c.y < 0.0) uv.y = -uv.y; + } else { + w = a.z; + uv = -c.xy; + if (c.z > 0.0) uv.x = -uv.x; + } + )"; + out += "uint z = uint(max(0, int(min(w, 1.0) * float(0xFFFFFF)) - shadow_texture_bias));"; + out += R"( + vec2 coord = vec2(size) * (uv / w * vec2(0.5) + 
vec2(0.5)) - vec2(0.5); + vec2 coord_floor = floor(coord); + vec2 f = coord - coord_floor; + ivec2 i00 = ivec2(coord_floor); + ivec2 i10 = i00 + ivec2(1, 0); + ivec2 i01 = i00 + ivec2(0, 1); + ivec2 i11 = i00 + ivec2(1, 1); + ivec2 cmin = ivec2(0), cmax = size - ivec2(1, 1); + i00 = clamp(i00, cmin, cmax); + i10 = clamp(i10, cmin, cmax); + i01 = clamp(i01, cmin, cmax); + i11 = clamp(i11, cmin, cmax); + uvec4 pixels; + // This part should have been refactored into functions, + // but many drivers don't like passing uimage2D as parameters + if (a.x > a.y && a.x > a.z) { + if (c.x > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_px, i00).r, + imageLoad(shadow_texture_px, i10).r, + imageLoad(shadow_texture_px, i01).r, + imageLoad(shadow_texture_px, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_nx, i00).r, + imageLoad(shadow_texture_nx, i10).r, + imageLoad(shadow_texture_nx, i01).r, + imageLoad(shadow_texture_nx, i11).r); + } else if (a.y > a.z) { + if (c.y > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_py, i00).r, + imageLoad(shadow_texture_py, i10).r, + imageLoad(shadow_texture_py, i01).r, + imageLoad(shadow_texture_py, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_ny, i00).r, + imageLoad(shadow_texture_ny, i10).r, + imageLoad(shadow_texture_ny, i01).r, + imageLoad(shadow_texture_ny, i11).r); + } else { + if (c.z > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_pz, i00).r, + imageLoad(shadow_texture_pz, i10).r, + imageLoad(shadow_texture_pz, i01).r, + imageLoad(shadow_texture_pz, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_nz, i00).r, + imageLoad(shadow_texture_nz, i10).r, + imageLoad(shadow_texture_nz, i01).r, + imageLoad(shadow_texture_nz, i11).r); + } + vec4 s = vec4( + CompareShadow(pixels.x, z), + CompareShadow(pixels.y, z), + CompareShadow(pixels.z, z), + CompareShadow(pixels.w, z)); + return vec4(mix2(s, f)); +} + +#else + +vec4 shadowTexture(vec2 uv, float w) { + return vec4(1.0); +} + +vec4 
shadowTextureCube(vec2 uv, float w) { + return vec4(1.0); +} + +#endif +)"; + + if (config.state.proctex.enable) + AppendProcTexSampler(out, config); + + // We round the interpolated primary color to the nearest 1/255th + // This maintains the PICA's 8 bits of precision + out += R"( +void main() { +vec4 rounded_primary_color = byteround(primary_color); +vec4 primary_fragment_color = vec4(0.0); +vec4 secondary_fragment_color = vec4(0.0); +)"; + + // Do not do any sort of processing if it's obvious we're not going to pass the alpha test + if (state.alpha_test_func == FramebufferRegs::CompareFunc::Never) { + out += "discard; }"; + return {std::move(out)}; + } + + // Append the scissor test + if (state.scissor_test_mode != RasterizerRegs::ScissorMode::Disabled) { + out += "if ("; + // Negate the condition if we have to keep only the pixels outside the scissor box + if (state.scissor_test_mode == RasterizerRegs::ScissorMode::Include) { + out += '!'; + } + out += "(gl_FragCoord.x >= float(scissor_x1) && " + "gl_FragCoord.y >= float(scissor_y1) && " + "gl_FragCoord.x < float(scissor_x2) && " + "gl_FragCoord.y < float(scissor_y2))) discard;\n"; + } + + // After perspective divide, OpenGL transform z_over_w from [-1, 1] to [near, far]. Here we use + // default near = 0 and far = 1, and undo the transformation to get the original z_over_w, then + // do our own transformation according to PICA specification. 
+ out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n" + "float depth = z_over_w * depth_scale + depth_offset;\n"; + if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) { + out += "depth /= gl_FragCoord.w;\n"; + } + + if (state.lighting.enable) + WriteLighting(out, config); + + out += "vec4 combiner_buffer = vec4(0.0);\n" + "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n" + "vec4 last_tex_env_out = vec4(0.0);\n"; + + for (std::size_t index = 0; index < state.tev_stages.size(); ++index) { + WriteTevStage(out, config, static_cast(index)); + } + + if (state.alpha_test_func != FramebufferRegs::CompareFunc::Always) { + out += "if ("; + AppendAlphaTestCondition(out, state.alpha_test_func); + out += ") discard;\n"; + } + + // Append fog combiner + if (state.fog_mode == TexturingRegs::FogMode::Fog) { + // Get index into fog LUT + if (state.fog_flip) { + out += "float fog_index = (1.0 - float(depth)) * 128.0;\n"; + } else { + out += "float fog_index = depth * 128.0;\n"; + } + + // Generate clamped fog factor from LUT for given fog index + out += "float fog_i = clamp(floor(fog_index), 0.0, 127.0);\n" + "float fog_f = fog_index - fog_i;\n" + "vec2 fog_lut_entry = texelFetch(texture_buffer_lut_lf, int(fog_i) + " + "fog_lut_offset).rg;\n" + "float fog_factor = fog_lut_entry.r + fog_lut_entry.g * fog_f;\n" + "fog_factor = clamp(fog_factor, 0.0, 1.0);\n"; + + // Blend the fog + out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n"; + } else if (state.fog_mode == TexturingRegs::FogMode::Gas) { + Core::System::GetInstance().TelemetrySession().AddField( + Common::Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode", true); + LOG_CRITICAL(Render_OpenGL, "Unimplemented gas mode"); + out += "discard; }"; + return {std::move(out)}; + } + + if (state.shadow_rendering) { + out += R"( +#if ALLOW_SHADOW +uint d = uint(clamp(depth, 0.0, 1.0) * float(0xFFFFFF)); +uint s = uint(last_tex_env_out.g * float(0xFF)); +ivec2 
image_coord = ivec2(gl_FragCoord.xy); + +uint old = imageLoad(shadow_buffer, image_coord).x; +uint new; +uint old2; +do { + old2 = old; + + uvec2 ref = DecodeShadow(old); + if (d < ref.x) { + if (s == 0u) { + ref.x = d; + } else { + s = uint(float(s) / (shadow_bias_constant + shadow_bias_linear * float(d) / float(ref.x))); + ref.y = min(s, ref.y); + } + } + new = EncodeShadow(ref); + +} while ((old = imageAtomicCompSwap(shadow_buffer, image_coord, old, new)) != old2); +#endif // ALLOW_SHADOW +)"; + } else { + out += "gl_FragDepth = depth;\n"; + // Round the final fragment color to maintain the PICA's 8 bits of precision + out += "color = byteround(last_tex_env_out);\n"; + } + + out += '}'; + + return {std::move(out)}; +} + +ShaderDecompiler::ProgramResult GenerateTrivialVertexShader(bool separable_shader) { + std::string out; + out += "#extension GL_ARB_separate_shader_objects : enable\n"; + out += + fmt::format("layout(location = {}) in vec4 vert_position;\n" + "layout(location = {}) in vec4 vert_color;\n" + "layout(location = {}) in vec2 vert_texcoord0;\n" + "layout(location = {}) in vec2 vert_texcoord1;\n" + "layout(location = {}) in vec2 vert_texcoord2;\n" + "layout(location = {}) in float vert_texcoord0_w;\n" + "layout(location = {}) in vec4 vert_normquat;\n" + "layout(location = {}) in vec3 vert_view;\n", + ATTRIBUTE_POSITION, ATTRIBUTE_COLOR, ATTRIBUTE_TEXCOORD0, ATTRIBUTE_TEXCOORD1, + ATTRIBUTE_TEXCOORD2, ATTRIBUTE_TEXCOORD0_W, ATTRIBUTE_NORMQUAT, ATTRIBUTE_VIEW); + + out += GetVertexInterfaceDeclaration(true, separable_shader); + + out += UniformBlockDef; + + out += R"( + +void main() { + primary_color = vert_color; + texcoord0 = vert_texcoord0; + texcoord1 = vert_texcoord1; + texcoord2 = vert_texcoord2; + texcoord0_w = vert_texcoord0_w; + normquat = vert_normquat; + view = vert_view; + gl_Position = vert_position; +#if !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) + gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0 + 
gl_ClipDistance[1] = dot(clip_coef, vert_position); +#endif // !defined(CITRA_GLES) || defined(GL_EXT_clip_cull_distance) +} +)"; + + return {std::move(out)}; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.h b/src/video_core/renderer_vulkan/vk_shader_gen.h new file mode 100644 index 000000000..e37c11b17 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_gen.h @@ -0,0 +1,43 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "common/hash.h" +#include "video_core/regs.h" +#include "video_core/shader/shader.h" +#include "video_core/renderer_vulkan/vk_shader_state.h" + +namespace Vulkan { + +namespace ShaderDecompiler { +struct ProgramResult { + std::string code; +}; +} + +/** + * Generates the GLSL vertex shader program source code that accepts vertices from software shader + * and directly passes them to the fragment shader. + * @param separable_shader generates shader that can be used for separate shader object + * @returns String of the shader source code + */ +ShaderDecompiler::ProgramResult GenerateTrivialVertexShader(bool separable_shader); + +/** + * Generates the GLSL fragment shader program source code for the current Pica state + * @param config ShaderCacheKey object generated for the current Pica state, used for the shader + * configuration (NOTE: Use state in this struct only, not the Pica registers!) 
+ * @param separable_shader generates shader that can be used for separate shader object + * @returns String of the shader source code + */ +ShaderDecompiler::ProgramResult GenerateFragmentShader(const PicaFSConfig& config); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_state.h b/src/video_core/renderer_vulkan/vk_shader_state.h index 955edbf31..62572b1aa 100644 --- a/src/video_core/renderer_vulkan/vk_shader_state.h +++ b/src/video_core/renderer_vulkan/vk_shader_state.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "common/hash.h" #include "video_core/regs.h" #include "video_core/shader/shader.h" @@ -198,17 +199,14 @@ struct PicaFixedGSConfig : Common::HashableStruct { } }; -/** - * This struct combines the vertex and fragment states for a complete pipeline cache key - */ -struct VKPipelineCacheKey { - VKPipelineCacheKey(const Pica::Regs& regs, Pica::Shader::ShaderSetup& setup) : - vertex_config(regs.vs, setup), geometry_config(regs), - fragment_config(PicaFSConfig::BuildFromRegs(regs)) {} - - PicaVSConfig vertex_config; - PicaFixedGSConfig geometry_config; +struct PipelineCacheKey { + vk::PipelineColorBlendAttachmentState blend_config; PicaFSConfig fragment_config; + + u64 Hash() const { + const u64 hash = Common::CityHash64(reinterpret_cast(this), sizeof(PipelineCacheKey)); + return static_cast(hash); + } }; } // namespace Vulkan @@ -234,4 +232,11 @@ struct hash { return k.Hash(); } }; + +template <> +struct hash { + size_t operator()(const Vulkan::PipelineCacheKey& k) const noexcept { + return k.Hash(); + } +}; } // namespace std diff --git a/src/video_core/renderer_vulkan/vk_state.cpp b/src/video_core/renderer_vulkan/vk_state.cpp index 6729af9b3..39b696bb2 100644 --- a/src/video_core/renderer_vulkan/vk_state.cpp +++ b/src/video_core/renderer_vulkan/vk_state.cpp @@ -7,8 +7,8 @@ #include "video_core/renderer_vulkan/vk_state.h" #include "video_core/renderer_vulkan/vk_task_scheduler.h" #include 
"video_core/renderer_vulkan/vk_resource_cache.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_shader_gen.h" namespace Vulkan { @@ -67,20 +67,57 @@ void VulkanState::Create() { // Create texture sampler auto props = g_vk_instace->GetPhysicalDevice().getProperties(); - vk::SamplerCreateInfo sampler_info - ( + vk::SamplerCreateInfo sampler_info{ {}, vk::Filter::eNearest, vk::Filter::eNearest, vk::SamplerMipmapMode::eNearest, vk::SamplerAddressMode::eClampToEdge, vk::SamplerAddressMode::eClampToEdge, vk::SamplerAddressMode::eClampToEdge, {}, true, props.limits.maxSamplerAnisotropy, false, vk::CompareOp::eAlways, {}, {}, vk::BorderColor::eIntOpaqueBlack, false - ); - + }; sampler = g_vk_instace->GetDevice().createSamplerUnique(sampler_info); - dirty_flags |= DirtyFlags::All; - CompileTrivialShader(); + + // Define the descriptor sets we will be using + std::array ubo_set = {{ + { 0, vk::DescriptorType::eUniformBuffer, 1, vk::ShaderStageFlagBits::eVertex | + vk::ShaderStageFlagBits::eGeometry | vk::ShaderStageFlagBits::eFragment }, // shader_data + { 1, vk::DescriptorType::eUniformBuffer, 1, vk::ShaderStageFlagBits::eVertex } // pica_uniforms + }}; + + std::array texture_set = {{ + { 0, vk::DescriptorType::eCombinedImageSampler, 1, vk::ShaderStageFlagBits::eFragment }, // tex0 + { 1, vk::DescriptorType::eCombinedImageSampler, 1, vk::ShaderStageFlagBits::eFragment }, // tex1 + { 2, vk::DescriptorType::eCombinedImageSampler, 1, vk::ShaderStageFlagBits::eFragment }, // tex2 + { 3, vk::DescriptorType::eCombinedImageSampler, 1, vk::ShaderStageFlagBits::eFragment }, // tex_cube + }}; + + std::array lut_set = {{ + { 0, vk::DescriptorType::eStorageTexelBuffer, 1, vk::ShaderStageFlagBits::eFragment }, // texture_buffer_lut_lf + { 1, vk::DescriptorType::eStorageTexelBuffer, 1, vk::ShaderStageFlagBits::eFragment }, 
// texture_buffer_lut_rg + { 2, vk::DescriptorType::eStorageTexelBuffer, 1, vk::ShaderStageFlagBits::eFragment } // texture_buffer_lut_rgba + }}; + + // Create and store descriptor set layouts + std::array create_infos = {{ + { vk::DescriptorSetLayoutCreateFlags(), ubo_set }, + { vk::DescriptorSetLayoutCreateFlags(), texture_set }, + { vk::DescriptorSetLayoutCreateFlags(), lut_set } + }}; + + for (int i = 0; i < DESCRIPTOR_SET_LAYOUT_COUNT; i++) { + descriptor_layouts[i] = g_vk_instace->GetDevice().createDescriptorSetLayout(create_infos[i]); + } + + // Create the standard descriptor set layout + vk::PipelineLayoutCreateInfo layout_info({}, descriptor_layouts); + pipeline_layout = g_vk_instace->GetDevice().createPipelineLayoutUnique(layout_info); + + // Compile trivial vertex shader + auto source = GenerateTrivialVertexShader(true); + MakeShader(source.code, vk::ShaderStageFlagBits::eVertex); + + dirty_flags |= DirtyFlags::All; } void VulkanState::SetVertexBuffer(VKBuffer* buffer, vk::DeviceSize offset) { @@ -231,11 +268,11 @@ void VulkanState::SetLogicOp(vk::LogicOp new_logic_op) { void VulkanState::SetColorMask(bool red, bool green, bool blue, bool alpha) { auto mask = static_cast(red | (green << 1) | (blue << 2) | (alpha << 3)); - static_state.blend.setColorWriteMask(mask); + pipeline_key.blend_config.setColorWriteMask(mask); } void VulkanState::SetBlendEnable(bool enable) { - static_state.blend.setBlendEnable(enable); + pipeline_key.blend_config.setBlendEnable(enable); } void VulkanState::SetBlendCostants(float red, float green, float blue, float alpha) { @@ -286,7 +323,7 @@ void VulkanState::SetDepthTest(bool enable, vk::CompareOp compare) { void VulkanState::SetBlendOp(vk::BlendOp rgb_op, vk::BlendOp alpha_op, vk::BlendFactor src_color, vk::BlendFactor dst_color, vk::BlendFactor src_alpha, vk::BlendFactor dst_alpha) { - auto& blend = static_state.blend; + auto& blend = pipeline_key.blend_config; blend.setColorBlendOp(rgb_op); blend.setAlphaBlendOp(alpha_op); 
blend.setSrcColorBlendFactor(src_color); @@ -295,6 +332,136 @@ void VulkanState::SetBlendOp(vk::BlendOp rgb_op, vk::BlendOp alpha_op, vk::Blend blend.setDstAlphaBlendFactor(dst_alpha); } +void VulkanState::SetFragmentShader(const Pica::Regs& regs) { + vk::Pipeline pipeline; + pipeline_key.fragment_config = PicaFSConfig::BuildFromRegs(regs); + auto it1 = pipelines.find(pipeline_key); + + do { + // Try to use an already complete pipeline + if (it1 != pipelines.end()) { + pipeline = it1->second.get(); + break; + } + + // Maybe the shader has been compiled but the pipeline state changed? + auto shader = fragment_shaders.find(pipeline_key.fragment_config); + if (shader != fragment_shaders.end()) { + pipeline = MakePipeline(shader->second.get()); + break; + } + + // Re-compile shader module and create new pipeline + auto result = GenerateFragmentShader(pipeline_key.fragment_config); + auto module = MakeShader(result.code, vk::ShaderStageFlagBits::eFragment); + pipeline = MakePipeline(module); + } while (false); + + // Bind the pipeline + auto command_buffer = g_vk_task_scheduler->GetCommandBuffer(); + command_buffer.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline); +} + +vk::ShaderModule VulkanState::MakeShader(const std::string& source, vk::ShaderStageFlagBits stage) { + shaderc::Compiler compiler; + shaderc::CompileOptions options; + options.SetOptimizationLevel(shaderc_optimization_level_performance); + options.SetTargetEnvironment(shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_2); + + shaderc_shader_kind kind{}; + std::string name{}; + switch (stage) { + case vk::ShaderStageFlagBits::eVertex: + kind = shaderc_glsl_vertex_shader; + name = "vertex shader"; + break; + case vk::ShaderStageFlagBits::eFragment: + kind = shaderc_glsl_fragment_shader; + name = "fragment shader"; + break; + default: + LOG_CRITICAL(Render_Vulkan, "Unknown shader stage"); + UNREACHABLE(); + } + + auto shader_module = compiler.CompileGlslToSpv(source.data(), kind, name.c_str(), 
options); + if (shader_module.GetCompilationStatus() != shaderc_compilation_status_success) { + LOG_CRITICAL(Render_Vulkan, shader_module.GetErrorMessage().c_str()); + } + + auto shader_code = std::vector{shader_module.cbegin(), shader_module.cend()}; + vk::ShaderModuleCreateInfo shader_info{{}, shader_code}; + + auto& device = g_vk_instace->GetDevice(); + auto shader = device.createShaderModuleUnique(shader_info); + + if (stage == vk::ShaderStageFlagBits::eFragment) { + auto handle = shader.get(); + fragment_shaders[pipeline_key.fragment_config] = std::move(shader); + return handle; + } + else if (stage == vk::ShaderStageFlagBits::eVertex) { + trivial_vertex_shader = std::move(shader); + return trivial_vertex_shader.get(); + } + + UNREACHABLE(); +} + +vk::Pipeline VulkanState::MakePipeline(vk::ShaderModule fragment) { + std::array shader_stages {{ + { {}, vk::ShaderStageFlagBits::eVertex, trivial_vertex_shader.get(), "main" }, + { {}, vk::ShaderStageFlagBits::eFragment, fragment, "main" } + }}; + + vk::PipelineVertexInputStateCreateInfo vertex_input_info{ + {}, HardwareVertex::binding_desc, HardwareVertex::attribute_desc + }; + + vk::PipelineInputAssemblyStateCreateInfo input_assembly{{}, vk::PrimitiveTopology::eTriangleList, false}; + vk::PipelineRasterizationStateCreateInfo rasterizer{ + {}, false, false, vk::PolygonMode::eFill, vk::CullModeFlagBits::eNone, + vk::FrontFace::eClockwise, false + }; + + vk::PipelineMultisampleStateCreateInfo multisampling{{}, vk::SampleCountFlagBits::e1}; + vk::PipelineColorBlendStateCreateInfo color_blending{{}, false, vk::LogicOp::eCopy, pipeline_key.blend_config}; + + // Enable every required dynamic state + std::array dynamic_states{ + vk::DynamicState::eDepthCompareOp, vk::DynamicState::eLineWidth, + vk::DynamicState::eDepthTestEnable, vk::DynamicState::eColorWriteEnableEXT, + vk::DynamicState::eStencilTestEnable, vk::DynamicState::eStencilOp, + vk::DynamicState::eStencilCompareMask, vk::DynamicState::eStencilWriteMask, + 
vk::DynamicState::eCullMode, vk::DynamicState::eBlendConstants, + vk::DynamicState::eViewport, vk::DynamicState::eScissor, + vk::DynamicState::eLogicOpEXT, vk::DynamicState::eFrontFace + }; + + vk::PipelineDynamicStateCreateInfo dynamic_info{{}, dynamic_states}; + + vk::PipelineDepthStencilStateCreateInfo depth_info{ + {}, true, true, vk::CompareOp::eGreaterOrEqual, false, true + }; + + vk::GraphicsPipelineCreateInfo pipeline_info{ + {}, shader_stages, &vertex_input_info, &input_assembly, nullptr, nullptr, + &rasterizer, &multisampling, &depth_info, &color_blending, &dynamic_info, + pipeline_layout.get(), nullptr + }; + + auto& device = g_vk_instace->GetDevice(); + auto result = device.createGraphicsPipelineUnique(nullptr, pipeline_info); + + if (result.result == vk::Result::eSuccess) { + auto handle = result.value.get(); + pipelines[pipeline_key] = std::move(result.value); + return handle; + } + + return VK_NULL_HANDLE; +} + void VulkanState::Apply() { // Update resources in descriptor sets if changed UpdateDescriptorSet(); @@ -372,24 +539,4 @@ void VulkanState::UpdateDescriptorSet() { } } -void VulkanState::CompileTrivialShader() { - auto source = OpenGL::GenerateTrivialVertexShader(true); - - shaderc::Compiler compiler; - shaderc::CompileOptions options; - options.SetOptimizationLevel(shaderc_optimization_level_performance); - options.SetTargetEnvironment(shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_2); - - auto shader_module = compiler.CompileGlslToSpv(source.code, shaderc_glsl_vertex_shader, "vertex shader", options); - if (shader_module.GetCompilationStatus() != shaderc_compilation_status_success) { - LOG_CRITICAL(Render_Vulkan, shader_module.GetErrorMessage().c_str()); - } - - auto shader_code = std::vector{ shader_module.cbegin(), shader_module.cend() }; - vk::ShaderModuleCreateInfo shader_info { {}, shader_code }; - - auto& device = g_vk_instace->GetDevice(); - trivial_vertex_shader = device.createShaderModuleUnique(shader_info); -} - } // 
namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state.h b/src/video_core/renderer_vulkan/vk_state.h index 870ee6630..9a46f35ee 100644 --- a/src/video_core/renderer_vulkan/vk_state.h +++ b/src/video_core/renderer_vulkan/vk_state.h @@ -6,6 +6,9 @@ #include #include +#include +#include "video_core/regs.h" +#include "video_core/renderer_vulkan/vk_shader_state.h" #include "video_core/renderer_vulkan/vk_texture.h" namespace Vulkan { @@ -87,6 +90,7 @@ public: void PushRenderTargets(VKTexture* color, VKTexture* depth_stencil); void PopRenderTargets(); void SetRenderArea(vk::Rect2D render_area); + void SetFragmentShader(const Pica::Regs& config); void BeginRendering(); void EndRendering(); @@ -102,8 +106,8 @@ public: private: void UpdateDescriptorSet(); - void GetPipeline(); - void CompileTrivialShader(); + vk::Pipeline MakePipeline(vk::ShaderModule fragment); + vk::ShaderModule MakeShader(const std::string& source, vk::ShaderStageFlagBits stage); private: struct Binding { @@ -145,13 +149,14 @@ private: vk::StencilOp fail_op, pass_op, depth_fail_op; vk::CompareOp compare_op; - struct { - vk::PipelineColorBlendAttachmentState blend; - vk::PipelineDepthStencilStateCreateInfo depth_stencil; - } static_state; - // Pipeline cache vk::UniqueShaderModule trivial_vertex_shader; + vk::UniquePipelineLayout pipeline_layout; + std::vector descriptor_layouts; + PipelineCacheKey pipeline_key; + + std::unordered_map fragment_shaders; + std::unordered_map pipelines; }; } // namespace Vulkan