Compare commits

3 Commits

| Author | SHA1 | Date | |
|---|---|---|---|
|  | cfdb10a7ba | | |
|  | 8012b28b92 | | |
|  | 531d280461 | | |

| @@ -95,6 +95,8 @@ add_library(video_core STATIC | |||||||
|     renderer_software/sw_proctex.h |     renderer_software/sw_proctex.h | ||||||
|     renderer_software/sw_rasterizer.cpp |     renderer_software/sw_rasterizer.cpp | ||||||
|     renderer_software/sw_rasterizer.h |     renderer_software/sw_rasterizer.h | ||||||
|  |     renderer_software/sw_tev_jit.cpp | ||||||
|  |     renderer_software/sw_tev_jit.h | ||||||
|     renderer_software/sw_texturing.cpp |     renderer_software/sw_texturing.cpp | ||||||
|     renderer_software/sw_texturing.h |     renderer_software/sw_texturing.h | ||||||
|     renderer_vulkan/pica_to_vk.h |     renderer_vulkan/pica_to_vk.h | ||||||
|   | |||||||
| @@ -8,6 +8,7 @@ | |||||||
| #include "core/hw/hw.h" | #include "core/hw/hw.h" | ||||||
| #include "core/hw/lcd.h" | #include "core/hw/lcd.h" | ||||||
| #include "video_core/renderer_software/renderer_software.h" | #include "video_core/renderer_software/renderer_software.h" | ||||||
|  | #include "video_core/renderer_software/sw_rasterizer.h" | ||||||
|  |  | ||||||
| namespace SwRenderer { | namespace SwRenderer { | ||||||
|  |  | ||||||
| @@ -17,6 +18,10 @@ RendererSoftware::RendererSoftware(Core::System& system, Frontend::EmuWindow& wi | |||||||
|  |  | ||||||
| RendererSoftware::~RendererSoftware() = default; | RendererSoftware::~RendererSoftware() = default; | ||||||
|  |  | ||||||
|  | VideoCore::RasterizerInterface* RendererSoftware::Rasterizer() const { | ||||||
|  |     return rasterizer.get(); | ||||||
|  | } | ||||||
|  |  | ||||||
| void RendererSoftware::SwapBuffers() { | void RendererSoftware::SwapBuffers() { | ||||||
|     PrepareRenderTarget(); |     PrepareRenderTarget(); | ||||||
|     EndFrame(); |     EndFrame(); | ||||||
|   | |||||||
| @@ -5,7 +5,6 @@ | |||||||
| #pragma once | #pragma once | ||||||
|  |  | ||||||
| #include "video_core/renderer_base.h" | #include "video_core/renderer_base.h" | ||||||
| #include "video_core/renderer_software/sw_rasterizer.h" |  | ||||||
|  |  | ||||||
| namespace Core { | namespace Core { | ||||||
| class System; | class System; | ||||||
| @@ -19,19 +18,18 @@ struct ScreenInfo { | |||||||
|     std::vector<u8> pixels; |     std::vector<u8> pixels; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | class RasterizerSoftware; | ||||||
|  |  | ||||||
| class RendererSoftware : public VideoCore::RendererBase { | class RendererSoftware : public VideoCore::RendererBase { | ||||||
| public: | public: | ||||||
|     explicit RendererSoftware(Core::System& system, Frontend::EmuWindow& window); |     explicit RendererSoftware(Core::System& system, Frontend::EmuWindow& window); | ||||||
|     ~RendererSoftware() override; |     ~RendererSoftware() override; | ||||||
|  |  | ||||||
|     [[nodiscard]] VideoCore::RasterizerInterface* Rasterizer() const override { |  | ||||||
|         return rasterizer.get(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     [[nodiscard]] const ScreenInfo& Screen(VideoCore::ScreenId id) const noexcept { |     [[nodiscard]] const ScreenInfo& Screen(VideoCore::ScreenId id) const noexcept { | ||||||
|         return screen_infos[static_cast<u32>(id)]; |         return screen_infos[static_cast<u32>(id)]; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     VideoCore::RasterizerInterface* Rasterizer() const override; | ||||||
|     void SwapBuffers() override; |     void SwapBuffers() override; | ||||||
|     void TryPresent(int timeout_ms, bool is_secondary) override {} |     void TryPresent(int timeout_ms, bool is_secondary) override {} | ||||||
|     void Sync() override {} |     void Sync() override {} | ||||||
|   | |||||||
| @@ -41,10 +41,22 @@ Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferR | |||||||
|  |  | ||||||
| Framebuffer::~Framebuffer() = default; | Framebuffer::~Framebuffer() = default; | ||||||
|  |  | ||||||
| void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const { | void Framebuffer::Bind() { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     PAddr addr = regs.framebuffer.GetColorBufferPhysicalAddress(); | ||||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); |     if (color_addr != addr) [[unlikely]] { | ||||||
|  |         color_addr = addr; | ||||||
|  |         color_buffer = memory.GetPhysicalPointer(color_addr); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     addr = regs.framebuffer.GetDepthBufferPhysicalAddress(); | ||||||
|  |     if (depth_addr != addr) [[unlikely]] { | ||||||
|  |         depth_addr = addr; | ||||||
|  |         depth_buffer = memory.GetPhysicalPointer(depth_addr); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void Framebuffer::DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const { | ||||||
|  |     const auto& framebuffer = regs.framebuffer; | ||||||
|     // Similarly to textures, the render framebuffer is laid out from bottom to top, too. |     // Similarly to textures, the render framebuffer is laid out from bottom to top, too. | ||||||
|     // NOTE: The framebuffer height register contains the actual FB height minus one. |     // NOTE: The framebuffer height register contains the actual FB height minus one. | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
| @@ -54,8 +66,7 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const { | |||||||
|         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); |         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); | ||||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + |     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||||
|                            coarse_y * framebuffer.width * bytes_per_pixel; |                            coarse_y * framebuffer.width * bytes_per_pixel; | ||||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); |     u8* dst_pixel = color_buffer + dst_offset; | ||||||
|     u8* dst_pixel = depth_buffer + dst_offset; |  | ||||||
|  |  | ||||||
|     switch (framebuffer.color_format) { |     switch (framebuffer.color_format) { | ||||||
|     case FramebufferRegs::ColorFormat::RGBA8: |     case FramebufferRegs::ColorFormat::RGBA8: | ||||||
| @@ -80,10 +91,8 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const { | const Common::Vec4<u8> Framebuffer::GetPixel(u32 x, u32 y) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); |  | ||||||
|  |  | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
|  |  | ||||||
|     const u32 coarse_y = y & ~7; |     const u32 coarse_y = y & ~7; | ||||||
| @@ -91,7 +100,6 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const { | |||||||
|         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); |         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); | ||||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + |     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||||
|                            coarse_y * framebuffer.width * bytes_per_pixel; |                            coarse_y * framebuffer.width * bytes_per_pixel; | ||||||
|     const u8* color_buffer = memory.GetPhysicalPointer(addr); |  | ||||||
|     const u8* src_pixel = color_buffer + src_offset; |     const u8* src_pixel = color_buffer + src_offset; | ||||||
|  |  | ||||||
|     switch (framebuffer.color_format) { |     switch (framebuffer.color_format) { | ||||||
| @@ -114,10 +122,8 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const { | |||||||
|     return {0, 0, 0, 0}; |     return {0, 0, 0, 0}; | ||||||
| } | } | ||||||
|  |  | ||||||
| u32 Framebuffer::GetDepth(int x, int y) const { | u32 Framebuffer::GetDepth(u32 x, u32 y) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); |  | ||||||
|  |  | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
|  |  | ||||||
|     const u32 coarse_y = y & ~7; |     const u32 coarse_y = y & ~7; | ||||||
| @@ -125,7 +131,6 @@ u32 Framebuffer::GetDepth(int x, int y) const { | |||||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; |     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||||
|  |  | ||||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; |     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||||
|     const u8* depth_buffer = memory.GetPhysicalPointer(addr); |  | ||||||
|     const u8* src_pixel = depth_buffer + src_offset; |     const u8* src_pixel = depth_buffer + src_offset; | ||||||
|  |  | ||||||
|     switch (framebuffer.depth_format) { |     switch (framebuffer.depth_format) { | ||||||
| @@ -143,10 +148,8 @@ u32 Framebuffer::GetDepth(int x, int y) const { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| u8 Framebuffer::GetStencil(int x, int y) const { | u8 Framebuffer::GetStencil(u32 x, u32 y) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); |  | ||||||
|  |  | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
|  |  | ||||||
|     const u32 coarse_y = y & ~7; |     const u32 coarse_y = y & ~7; | ||||||
| @@ -154,7 +157,6 @@ u8 Framebuffer::GetStencil(int x, int y) const { | |||||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; |     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||||
|  |  | ||||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; |     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||||
|     const u8* depth_buffer = memory.GetPhysicalPointer(addr); |  | ||||||
|     const u8* src_pixel = depth_buffer + src_offset; |     const u8* src_pixel = depth_buffer + src_offset; | ||||||
|  |  | ||||||
|     switch (framebuffer.depth_format) { |     switch (framebuffer.depth_format) { | ||||||
| @@ -169,10 +171,8 @@ u8 Framebuffer::GetStencil(int x, int y) const { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void Framebuffer::SetDepth(int x, int y, u32 value) const { | void Framebuffer::SetDepth(u32 x, u32 y, u32 value) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); |  | ||||||
|  |  | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
|  |  | ||||||
|     const u32 coarse_y = y & ~7; |     const u32 coarse_y = y & ~7; | ||||||
| @@ -180,7 +180,6 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const { | |||||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; |     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||||
|  |  | ||||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; |     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); |  | ||||||
|     u8* dst_pixel = depth_buffer + dst_offset; |     u8* dst_pixel = depth_buffer + dst_offset; | ||||||
|  |  | ||||||
|     switch (framebuffer.depth_format) { |     switch (framebuffer.depth_format) { | ||||||
| @@ -201,10 +200,8 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void Framebuffer::SetStencil(int x, int y, u8 value) const { | void Framebuffer::SetStencil(u32 x, u32 y, u8 value) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); |  | ||||||
|  |  | ||||||
|     y = framebuffer.height - y; |     y = framebuffer.height - y; | ||||||
|  |  | ||||||
|     const u32 coarse_y = y & ~7; |     const u32 coarse_y = y & ~7; | ||||||
| @@ -212,7 +209,6 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const { | |||||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; |     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||||
|  |  | ||||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; |     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); |  | ||||||
|     u8* dst_pixel = depth_buffer + dst_offset; |     u8* dst_pixel = depth_buffer + dst_offset; | ||||||
|  |  | ||||||
|     switch (framebuffer.depth_format) { |     switch (framebuffer.depth_format) { | ||||||
| @@ -231,7 +227,7 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const { | void Framebuffer::DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const { | ||||||
|     const auto& framebuffer = regs.framebuffer; |     const auto& framebuffer = regs.framebuffer; | ||||||
|     const auto& shadow = regs.shadow; |     const auto& shadow = regs.shadow; | ||||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); |     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); | ||||||
|   | |||||||
| @@ -23,30 +23,37 @@ public: | |||||||
|     explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); |     explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); | ||||||
|     ~Framebuffer(); |     ~Framebuffer(); | ||||||
|  |  | ||||||
|  |     /// Updates the framebuffer addresses from the PICA registers. | ||||||
|  |     void Bind(); | ||||||
|  |  | ||||||
|     /// Draws a pixel at the specified coordinates. |     /// Draws a pixel at the specified coordinates. | ||||||
|     void DrawPixel(int x, int y, const Common::Vec4<u8>& color) const; |     void DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const; | ||||||
|  |  | ||||||
|     /// Returns the current color at the specified coordinates. |     /// Returns the current color at the specified coordinates. | ||||||
|     [[nodiscard]] const Common::Vec4<u8> GetPixel(int x, int y) const; |     [[nodiscard]] const Common::Vec4<u8> GetPixel(u32 x, u32 y) const; | ||||||
|  |  | ||||||
|     /// Returns the depth value at the specified coordinates. |     /// Returns the depth value at the specified coordinates. | ||||||
|     [[nodiscard]] u32 GetDepth(int x, int y) const; |     [[nodiscard]] u32 GetDepth(u32 x, u32 y) const; | ||||||
|  |  | ||||||
|     /// Returns the stencil value at the specified coordinates. |     /// Returns the stencil value at the specified coordinates. | ||||||
|     [[nodiscard]] u8 GetStencil(int x, int y) const; |     [[nodiscard]] u8 GetStencil(u32 x, u32 y) const; | ||||||
|  |  | ||||||
|     /// Stores the provided depth value at the specified coordinates. |     /// Stores the provided depth value at the specified coordinates. | ||||||
|     void SetDepth(int x, int y, u32 value) const; |     void SetDepth(u32 x, u32 y, u32 value) const; | ||||||
|  |  | ||||||
|     /// Stores the provided stencil value at the specified coordinates. |     /// Stores the provided stencil value at the specified coordinates. | ||||||
|     void SetStencil(int x, int y, u8 value) const; |     void SetStencil(u32 x, u32 y, u8 value) const; | ||||||
|  |  | ||||||
|     /// Draws a pixel to the shadow buffer. |     /// Draws a pixel to the shadow buffer. | ||||||
|     void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const; |     void DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const; | ||||||
|  |  | ||||||
| private: | private: | ||||||
|     Memory::MemorySystem& memory; |     Memory::MemorySystem& memory; | ||||||
|     const Pica::FramebufferRegs& regs; |     const Pica::FramebufferRegs& regs; | ||||||
|  |     PAddr color_addr; | ||||||
|  |     u8* color_buffer{}; | ||||||
|  |     PAddr depth_addr; | ||||||
|  |     u8* depth_buffer{}; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); | u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); | ||||||
|   | |||||||
| @@ -95,8 +95,14 @@ private: | |||||||
|  |  | ||||||
| } // Anonymous namespace | } // Anonymous namespace | ||||||
|  |  | ||||||
|  | // Kirby Blowout Blast relies on the combiner output of a previous draw | ||||||
|  | // in order to render the sky correctly. | ||||||
|  | static thread_local Common::Vec4<u8> combiner_output{}; | ||||||
|  |  | ||||||
| RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) | RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) | ||||||
|     : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {} |     : memory{memory_}, state{Pica::g_state}, regs{state.regs}, | ||||||
|  |       num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)}, | ||||||
|  |       sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {} | ||||||
|  |  | ||||||
| void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, | void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, | ||||||
|                                      const Pica::Shader::OutputVertex& v1, |                                      const Pica::Shader::OutputVertex& v1, | ||||||
| @@ -289,167 +295,194 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con | |||||||
|  |  | ||||||
|     const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); |     const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); | ||||||
|  |  | ||||||
|     auto textures = regs.texturing.GetTextures(); |     const auto textures = regs.texturing.GetTextures(); | ||||||
|     const auto tev_stages = regs.texturing.GetTevStages(); |     const auto tev_stages = regs.texturing.GetTevStages(); | ||||||
|  |     for (u32 i = 0; i < texture_data.size(); i++) { | ||||||
|  |         const PAddr addr = textures[i].config.GetPhysicalAddress(); | ||||||
|  |         if (addr) { | ||||||
|  |             texture_data[i] = memory.GetPhysicalPointer(addr); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fb.Bind(); | ||||||
|  |  | ||||||
|  |     if (use_jit) { | ||||||
|  |         const TevConfigKey key{regs.texturing}; | ||||||
|  |         auto [it, new_fun] = tev_cache.try_emplace(key.Hash()); | ||||||
|  |         if (new_fun) { | ||||||
|  |             it->second = std::make_unique<TevConfig>(regs, key); | ||||||
|  |         } | ||||||
|  |         tev_config = it->second.get(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // Enter rasterization loop, starting at the center of the topleft bounding box corner. |     // Enter rasterization loop, starting at the center of the topleft bounding box corner. | ||||||
|     // TODO: Not sure if looping through x first might be faster |     // TODO: Not sure if looping through x first might be faster | ||||||
|     for (u16 y = min_y + 8; y < max_y; y += 0x10) { |     for (u16 y = min_y + 8; y < max_y; y += 0x10) { | ||||||
|         for (u16 x = min_x + 8; x < max_x; x += 0x10) { |         const auto process_scanline = [&, y] { | ||||||
|             // Do not process the pixel if it's inside the scissor box and the scissor mode is set |             for (u16 x = min_x + 8; x < max_x; x += 0x10) { | ||||||
|             // to Exclude. |                 // Do not process the pixel if it's inside the scissor box and the scissor mode is | ||||||
|             if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { |                 // set to Exclude. | ||||||
|                 if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { |                 if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { | ||||||
|  |                     if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { | ||||||
|  |                         continue; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // Calculate the barycentric coordinates w0, w1 and w2 | ||||||
|  |                 const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); | ||||||
|  |                 const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); | ||||||
|  |                 const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); | ||||||
|  |                 const s32 wsum = w0 + w1 + w2; | ||||||
|  |  | ||||||
|  |                 // If current pixel is not covered by the current primitive | ||||||
|  |                 if (w0 < 0 || w1 < 0 || w2 < 0) { | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|             } |  | ||||||
|  |  | ||||||
|             // Calculate the barycentric coordinates w0, w1 and w2 |                 const auto baricentric_coordinates = Common::MakeVec( | ||||||
|             const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); |                     f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)), | ||||||
|             const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); |                     f24::FromFloat32(static_cast<f32>(w2))); | ||||||
|             const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); |                 const f24 interpolated_w_inverse = | ||||||
|             const s32 wsum = w0 + w1 + w2; |                     f24::One() / Common::Dot(w_inverse, baricentric_coordinates); | ||||||
|  |  | ||||||
|             // If current pixel is not covered by the current primitive |                 // interpolated_z = z / w | ||||||
|             if (w0 < 0 || w1 < 0 || w2 < 0) { |                 const float interpolated_z_over_w = | ||||||
|                 continue; |                     (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + | ||||||
|             } |                      v2.screenpos[2].ToFloat32() * w2) / | ||||||
|  |                     wsum; | ||||||
|  |  | ||||||
|             const auto baricentric_coordinates = Common::MakeVec( |                 // Not fully accurate. About 3 bits in precision are missing. | ||||||
|                 f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)), |                 // Z-Buffer (z / w * scale + offset) | ||||||
|                 f24::FromFloat32(static_cast<f32>(w2))); |                 const float depth_scale = | ||||||
|             const f24 interpolated_w_inverse = |                     f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); | ||||||
|                 f24::One() / Common::Dot(w_inverse, baricentric_coordinates); |                 const float depth_offset = | ||||||
|  |                     f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); | ||||||
|  |                 float depth = interpolated_z_over_w * depth_scale + depth_offset; | ||||||
|  |  | ||||||
|             // interpolated_z = z / w |                 // Potentially switch to W-Buffer | ||||||
|             const float interpolated_z_over_w = |                 if (regs.rasterizer.depthmap_enable == | ||||||
|                 (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + |                     Pica::RasterizerRegs::DepthBuffering::WBuffering) { | ||||||
|                  v2.screenpos[2].ToFloat32() * w2) / |                     // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) | ||||||
|                 wsum; |                     depth *= interpolated_w_inverse.ToFloat32() * wsum; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|             // Not fully accurate. About 3 bits in precision are missing. |                 // Clamp the result | ||||||
|             // Z-Buffer (z / w * scale + offset) |                 depth = std::clamp(depth, 0.0f, 1.0f); | ||||||
|             const float depth_scale = |  | ||||||
|                 f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); |  | ||||||
|             const float depth_offset = |  | ||||||
|                 f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); |  | ||||||
|             float depth = interpolated_z_over_w * depth_scale + depth_offset; |  | ||||||
|  |  | ||||||
|             // Potentially switch to W-Buffer |                 /** | ||||||
|             if (regs.rasterizer.depthmap_enable == |                  * Perspective correct attribute interpolation: | ||||||
|                 Pica::RasterizerRegs::DepthBuffering::WBuffering) { |                  * Attribute values cannot be calculated by simple linear interpolation since | ||||||
|                 // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) |                  * they are not linear in screen space. For example, when interpolating a | ||||||
|                 depth *= interpolated_w_inverse.ToFloat32() * wsum; |                  * texture coordinate across two vertices, something simple like | ||||||
|             } |                  *     u = (u0*w0 + u1*w1)/(w0+w1) | ||||||
|  |                  * will not work. However, the attribute value divided by the | ||||||
|             // Clamp the result |                  * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear | ||||||
|             depth = std::clamp(depth, 0.0f, 1.0f); |                  * in screenspace. Hence, we can linearly interpolate these two independently and | ||||||
|  |                  * calculate the interpolated attribute by dividing the results. | ||||||
|             /** |                  * I.e. | ||||||
|              * Perspective correct attribute interpolation: |                  *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) | ||||||
|              * Attribute values cannot be calculated by simple linear interpolation since |                  *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) | ||||||
|              * they are not linear in screen space. For example, when interpolating a |                  *     u = u_over_w / one_over_w | ||||||
|              * texture coordinate across two vertices, something simple like |                  * | ||||||
|              *     u = (u0*w0 + u1*w1)/(w0+w1) |                  * The generalization to three vertices is straightforward in baricentric | ||||||
|              * will not work. However, the attribute value divided by the |                  *coordinates. | ||||||
|              * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear |                  **/ | ||||||
|              * in screenspace. Hence, we can linearly interpolate these two independently and |                 const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { | ||||||
|              * calculate the interpolated attribute by dividing the results. |                     auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); | ||||||
|              * I.e. |                     f24 interpolated_attr_over_w = | ||||||
|              *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) |                         Common::Dot(attr_over_w, baricentric_coordinates); | ||||||
|              *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) |                     return interpolated_attr_over_w * interpolated_w_inverse; | ||||||
|              *     u = u_over_w / one_over_w |  | ||||||
|              * |  | ||||||
|              * The generalization to three vertices is straightforward in baricentric coordinates. |  | ||||||
|              **/ |  | ||||||
|             const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { |  | ||||||
|                 auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); |  | ||||||
|                 f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates); |  | ||||||
|                 return interpolated_attr_over_w * interpolated_w_inverse; |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             const Common::Vec4<u8> primary_color{ |  | ||||||
|                 static_cast<u8>( |  | ||||||
|                     round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) |  | ||||||
|                               .ToFloat32() * |  | ||||||
|                           255)), |  | ||||||
|                 static_cast<u8>( |  | ||||||
|                     round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) |  | ||||||
|                               .ToFloat32() * |  | ||||||
|                           255)), |  | ||||||
|                 static_cast<u8>( |  | ||||||
|                     round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) |  | ||||||
|                               .ToFloat32() * |  | ||||||
|                           255)), |  | ||||||
|                 static_cast<u8>( |  | ||||||
|                     round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) |  | ||||||
|                               .ToFloat32() * |  | ||||||
|                           255)), |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             std::array<Common::Vec2<f24>, 3> uv; |  | ||||||
|             uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); |  | ||||||
|             uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); |  | ||||||
|             uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); |  | ||||||
|             uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); |  | ||||||
|             uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); |  | ||||||
|             uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); |  | ||||||
|  |  | ||||||
|             // Sample bound texture units. |  | ||||||
|             const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); |  | ||||||
|             const auto texture_color = TextureColor(uv, textures, tc0_w); |  | ||||||
|  |  | ||||||
|             Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0}; |  | ||||||
|             Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0}; |  | ||||||
|  |  | ||||||
|             if (!regs.lighting.disable) { |  | ||||||
|                 const auto normquat = |  | ||||||
|                     Common::Quaternion<f32>{ |  | ||||||
|                         {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), |  | ||||||
|                          get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), |  | ||||||
|                          get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, |  | ||||||
|                         get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), |  | ||||||
|                     } |  | ||||||
|                         .Normalized(); |  | ||||||
|  |  | ||||||
|                 const Common::Vec3f view{ |  | ||||||
|                     get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), |  | ||||||
|                     get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), |  | ||||||
|                     get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), |  | ||||||
|                 }; |                 }; | ||||||
|                 std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( |  | ||||||
|                     regs.lighting, state.lighting, normquat, view, texture_color); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             // Write the TEV stages. |                 const Common::Vec4<u8> primary_color{ | ||||||
|             WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, |                     static_cast<u8>( | ||||||
|                            secondary_fragment_color); |                         round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) | ||||||
|  |                                   .ToFloat32() * | ||||||
|  |                               255)), | ||||||
|  |                     static_cast<u8>( | ||||||
|  |                         round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) | ||||||
|  |                                   .ToFloat32() * | ||||||
|  |                               255)), | ||||||
|  |                     static_cast<u8>( | ||||||
|  |                         round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) | ||||||
|  |                                   .ToFloat32() * | ||||||
|  |                               255)), | ||||||
|  |                     static_cast<u8>( | ||||||
|  |                         round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) | ||||||
|  |                                   .ToFloat32() * | ||||||
|  |                               255)), | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|             const auto& output_merger = regs.framebuffer.output_merger; |                 std::array<Common::Vec2<f24>, 3> uv; | ||||||
|             if (output_merger.fragment_operation_mode == |                 uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); | ||||||
|                 FramebufferRegs::FragmentOperationMode::Shadow) { |                 uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); | ||||||
|                 u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); |                 uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); | ||||||
|                 // Use green color as the shadow intensity |                 uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); | ||||||
|                 u8 stencil = combiner_output.y; |                 uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); | ||||||
|                 fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); |                 uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); | ||||||
|                 // Skip the normal output merger pipeline if it is in shadow mode |  | ||||||
|                 continue; |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             // Does alpha testing happen before or after stencil? |                 // Sample bound texture units. | ||||||
|             if (!DoAlphaTest(combiner_output.a())) { |                 const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); | ||||||
|                 continue; |                 auto texture_color = TextureColor(uv, textures, tc0_w); | ||||||
|  |  | ||||||
|  |                 Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0}; | ||||||
|  |                 Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0}; | ||||||
|  |  | ||||||
|  |                 if (!regs.lighting.disable) { | ||||||
|  |                     const auto normquat = | ||||||
|  |                         Common::Quaternion<f32>{ | ||||||
|  |                             {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x) | ||||||
|  |                                  .ToFloat32(), | ||||||
|  |                              get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y) | ||||||
|  |                                  .ToFloat32(), | ||||||
|  |                              get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z) | ||||||
|  |                                  .ToFloat32()}, | ||||||
|  |                             get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), | ||||||
|  |                         } | ||||||
|  |                             .Normalized(); | ||||||
|  |  | ||||||
|  |                     const Common::Vec3f view{ | ||||||
|  |                         get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), | ||||||
|  |                         get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), | ||||||
|  |                         get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), | ||||||
|  |                     }; | ||||||
|  |                     std::tie(primary_fragment_color, secondary_fragment_color) = | ||||||
|  |                         ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view, | ||||||
|  |                                                texture_color); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // Write the TEV stages. | ||||||
|  |                 WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, | ||||||
|  |                                secondary_fragment_color); | ||||||
|  |  | ||||||
|  |                 const auto& output_merger = regs.framebuffer.output_merger; | ||||||
|  |                 if (output_merger.fragment_operation_mode == | ||||||
|  |                     FramebufferRegs::FragmentOperationMode::Shadow) { | ||||||
|  |                     u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); | ||||||
|  |                     // Use green color as the shadow intensity | ||||||
|  |                     u8 stencil = combiner_output.y; | ||||||
|  |                     fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); | ||||||
|  |                     // Skip the normal output merger pipeline if it is in shadow mode | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // Does alpha testing happen before or after stencil? | ||||||
|  |                 if (!DoAlphaTest(combiner_output.a())) { | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |                 WriteFog(depth); | ||||||
|  |                 if (!DoDepthStencilTest(x, y, depth)) { | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |                 const auto result = PixelColor(x, y); | ||||||
|  |                 if (regs.framebuffer.framebuffer.allow_color_write != 0) { | ||||||
|  |                     fb.DrawPixel(x >> 4, y >> 4, result); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|             WriteFog(combiner_output, depth); |         }; | ||||||
|             if (!DoDepthStencilTest(x, y, depth)) { |         sw_workers.QueueWork(std::move(process_scanline)); | ||||||
|                 continue; |  | ||||||
|             } |  | ||||||
|             const auto result = PixelColor(x, y, combiner_output); |  | ||||||
|             if (regs.framebuffer.framebuffer.allow_color_write != 0) { |  | ||||||
|                 fb.DrawPixel(x >> 4, y >> 4, result); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |     sw_workers.WaitForRequests(); | ||||||
| } | } | ||||||
|  |  | ||||||
| std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( | std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( | ||||||
| @@ -538,11 +571,10 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( | |||||||
|             t = texture.config.height - 1 - |             t = texture.config.height - 1 - | ||||||
|                 GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); |                 GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); | ||||||
|  |  | ||||||
|             const u8* texture_data = memory.GetPhysicalPointer(texture_address); |  | ||||||
|             const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format); |             const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format); | ||||||
|  |  | ||||||
|             // TODO: Apply the min and mag filters to the texture |             // TODO: Apply the min and mag filters to the texture | ||||||
|             texture_color[i] = LookupTexture(texture_data, s, t, info); |             texture_color[i] = LookupTexture(texture_data[i], s, t, info); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || |         if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || | ||||||
| @@ -572,8 +604,7 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( | |||||||
|     return texture_color; |     return texture_color; | ||||||
| } | } | ||||||
|  |  | ||||||
| Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y, | Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y) const { | ||||||
|                                                 Common::Vec4<u8>& combiner_output) const { |  | ||||||
|     const auto dest = fb.GetPixel(x >> 4, y >> 4); |     const auto dest = fb.GetPixel(x >> 4, y >> 4); | ||||||
|     Common::Vec4<u8> blend_output = combiner_output; |     Common::Vec4<u8> blend_output = combiner_output; | ||||||
|  |  | ||||||
| @@ -664,10 +695,20 @@ Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y, | |||||||
| } | } | ||||||
|  |  | ||||||
| void RasterizerSoftware::WriteTevConfig( | void RasterizerSoftware::WriteTevConfig( | ||||||
|     std::span<const Common::Vec4<u8>, 4> texture_color, |     std::span<Common::Vec4<u8>, 4> texture_color, | ||||||
|     std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, |     std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, | ||||||
|     Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, |     Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, | ||||||
|     Common::Vec4<u8> secondary_fragment_color) { |     Common::Vec4<u8> secondary_fragment_color) { | ||||||
|  |  | ||||||
|  | #if CITRA_ARCH(x86_64) | ||||||
|  |     if (use_jit) { | ||||||
|  |         const u32 tev_combiner_buffer_color = regs.texturing.tev_combiner_buffer_color.raw; | ||||||
|  |         combiner_output = tev_config->Run(texture_color, primary_color, primary_fragment_color, | ||||||
|  |                                           secondary_fragment_color, tev_combiner_buffer_color); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * Texture environment - consists of 6 stages of color and alpha combining. |      * Texture environment - consists of 6 stages of color and alpha combining. | ||||||
|      * Color combiners take three input color values from some source (e.g. interpolated |      * Color combiners take three input color values from some source (e.g. interpolated | ||||||
| @@ -731,6 +772,7 @@ void RasterizerSoftware::WriteTevConfig( | |||||||
|             GetColorModifier(tev_stage.color_modifier2, get_source(tev_stage.color_source2)), |             GetColorModifier(tev_stage.color_modifier2, get_source(tev_stage.color_source2)), | ||||||
|             GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)), |             GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result); |         const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result); | ||||||
|  |  | ||||||
|         u8 alpha_output; |         u8 alpha_output; | ||||||
| @@ -768,7 +810,7 @@ void RasterizerSoftware::WriteTevConfig( | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const { | void RasterizerSoftware::WriteFog(float depth) const { | ||||||
|     /** |     /** | ||||||
|      * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to |      * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to | ||||||
|      * store the depth etc. Using float for now until we know more about Pica datatypes. |      * store the depth etc. Using float for now until we know more about Pica datatypes. | ||||||
|   | |||||||
| @@ -4,13 +4,20 @@ | |||||||
|  |  | ||||||
| #pragma once | #pragma once | ||||||
|  |  | ||||||
|  | #include <memory> | ||||||
| #include <span> | #include <span> | ||||||
|  | #include <unordered_map> | ||||||
|  |  | ||||||
|  | #include "common/arch.h" | ||||||
|  | #include "common/thread_worker.h" | ||||||
| #include "video_core/rasterizer_interface.h" | #include "video_core/rasterizer_interface.h" | ||||||
| #include "video_core/regs_texturing.h" |  | ||||||
| #include "video_core/renderer_software/sw_clipper.h" | #include "video_core/renderer_software/sw_clipper.h" | ||||||
| #include "video_core/renderer_software/sw_framebuffer.h" | #include "video_core/renderer_software/sw_framebuffer.h" | ||||||
|  |  | ||||||
|  | #if CITRA_ARCH(x86_64) | ||||||
|  | #include "video_core/renderer_software/sw_tev_jit.h" | ||||||
|  | #endif | ||||||
|  |  | ||||||
| namespace Pica::Shader { | namespace Pica::Shader { | ||||||
| struct OutputVertex; | struct OutputVertex; | ||||||
| } | } | ||||||
| @@ -52,16 +59,16 @@ private: | |||||||
|         std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const; |         std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const; | ||||||
|  |  | ||||||
|     /// Returns the final pixel color with blending or logic ops applied. |     /// Returns the final pixel color with blending or logic ops applied. | ||||||
|     Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const; |     Common::Vec4<u8> PixelColor(u16 x, u16 y) const; | ||||||
|  |  | ||||||
|     /// Emulates the TEV configuration and returns the combiner output. |     /// Emulates the TEV configuration and returns the combiner output. | ||||||
|     void WriteTevConfig(std::span<const Common::Vec4<u8>, 4> texture_color, |     void WriteTevConfig(std::span<Common::Vec4<u8>, 4> texture_color, | ||||||
|                         std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, |                         std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, | ||||||
|                         Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, |                         Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, | ||||||
|                         Common::Vec4<u8> secondary_fragment_color); |                         Common::Vec4<u8> secondary_fragment_color); | ||||||
|  |  | ||||||
|     /// Blends fog to the combiner output if enabled. |     /// Blends fog to the combiner output if enabled. | ||||||
|     void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const; |     void WriteFog(float depth) const; | ||||||
|  |  | ||||||
|     /// Performs the alpha test. Returns false if the test failed. |     /// Performs the alpha test. Returns false if the test failed. | ||||||
|     bool DoAlphaTest(u8 alpha) const; |     bool DoAlphaTest(u8 alpha) const; | ||||||
| @@ -73,10 +80,13 @@ private: | |||||||
|     Memory::MemorySystem& memory; |     Memory::MemorySystem& memory; | ||||||
|     Pica::State& state; |     Pica::State& state; | ||||||
|     const Pica::Regs& regs; |     const Pica::Regs& regs; | ||||||
|  |     bool use_jit{true}; | ||||||
|  |     size_t num_sw_threads; | ||||||
|  |     Common::ThreadWorker sw_workers; | ||||||
|     Framebuffer fb; |     Framebuffer fb; | ||||||
|     // Kirby Blowout Blast relies on the combiner output of a previous draw |     TevCache tev_cache; | ||||||
|     // in order to render the sky correctly. |     TevConfig* tev_config{}; | ||||||
|     Common::Vec4<u8> combiner_output{}; |     std::array<const u8*, 3> texture_data{}; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } // namespace SwRenderer | } // namespace SwRenderer | ||||||
|   | |||||||
							
								
								
									
										473
									
								
								src/video_core/renderer_software/sw_tev_jit.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										473
									
								
								src/video_core/renderer_software/sw_tev_jit.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,473 @@ | |||||||
|  | // Copyright 2023 Citra Emulator Project | ||||||
|  | // Licensed under GPLv2 or any later version | ||||||
|  | // Refer to the license.txt file included. | ||||||
|  |  | ||||||
|  | #include <bit> | ||||||
|  | #include <emmintrin.h> | ||||||
|  | #include "common/x64/xbyak_abi.h" | ||||||
|  | #include "video_core/regs.h" | ||||||
|  | #include "video_core/renderer_software/sw_tev_jit.h" | ||||||
|  |  | ||||||
|  | namespace SwRenderer { | ||||||
|  |  | ||||||
|  | namespace { | ||||||
|  |  | ||||||
|  | using namespace Common::X64; | ||||||
|  | using namespace Xbyak::util; | ||||||
|  | using Pica::TexturingRegs; | ||||||
|  | using Xbyak::Reg32; | ||||||
|  | using Xbyak::Reg64; | ||||||
|  | using Xbyak::Xmm; | ||||||
|  | using TevStageConfig = Pica::TexturingRegs::TevStageConfig; | ||||||
|  |  | ||||||
|  | // Register assignment shared by the code emitters in this file. | ||||||
|  | // Scalar registers: the three alpha operands of a stage and the combined alpha. | ||||||
|  | constexpr Reg32 A0 = r11d; | ||||||
|  | constexpr Reg32 A1 = r12d; | ||||||
|  | constexpr Reg32 A2 = r13d; | ||||||
|  | constexpr Reg32 ALPHA_OUTPUT = r14d; | ||||||
|  | // Vector registers: combiner state that is carried from one stage to the next. | ||||||
|  | constexpr Xmm COMBINER_OUTPUT = xmm0; | ||||||
|  | constexpr Xmm COMBINER_BUFFER = xmm1; | ||||||
|  | constexpr Xmm NEXT_COMBINER_BUFFER = xmm2; | ||||||
|  | // Vector registers: the three color operands of a stage and the combined color. | ||||||
|  | constexpr Xmm VEC0 = xmm3; | ||||||
|  | constexpr Xmm VEC1 = xmm4; | ||||||
|  | constexpr Xmm VEC2 = xmm5; | ||||||
|  | constexpr Xmm COLOR_OUTPUT = xmm6; | ||||||
|  | // Vector constants loaded by the emitted prologue: every lane 0, 128 and 255 respectively. | ||||||
|  | constexpr Xmm ZERO = xmm13; | ||||||
|  | constexpr Xmm MID_COLOR = xmm14; | ||||||
|  | constexpr Xmm MAX_COLOR = xmm15; | ||||||
|  |  | ||||||
|  | /// Returns true when the stage replaces color and alpha with the unmodified, unscaled previous | ||||||
|  | /// combiner output, i.e. when it leaves the combiner output as it is. | ||||||
|  | bool IsPassThroughTevStage(const TevStageConfig& stage) { | ||||||
|  |     using Operation = TevStageConfig::Operation; | ||||||
|  |     using Source = TevStageConfig::Source; | ||||||
|  |  | ||||||
|  |     const bool color_untouched = | ||||||
|  |         stage.color_op == Operation::Replace && stage.color_source1 == Source::Previous && | ||||||
|  |         stage.color_modifier1 == TevStageConfig::ColorModifier::SourceColor && | ||||||
|  |         stage.GetColorMultiplier() == 1; | ||||||
|  |     const bool alpha_untouched = | ||||||
|  |         stage.alpha_op == Operation::Replace && stage.alpha_source1 == Source::Previous && | ||||||
|  |         stage.alpha_modifier1 == TevStageConfig::AlphaModifier::SourceAlpha && | ||||||
|  |         stage.GetAlphaMultiplier() == 1; | ||||||
|  |  | ||||||
|  |     return color_untouched && alpha_untouched; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | } // Anonymous namespace | ||||||
|  |  | ||||||
|  | /// Builds a key by copying the raw source, modifier, operation, constant color and scale | ||||||
|  | /// words of every TEV stage from the given texturing registers. | ||||||
|  | TevConfigKey::TevConfigKey(const Pica::TexturingRegs& regs) { | ||||||
|  |     const auto& source_stages = regs.GetTevStages(); | ||||||
|  |     size_t slot = 0; | ||||||
|  |     for (const auto& source : source_stages) { | ||||||
|  |         auto& target = stages[slot++]; | ||||||
|  |         target.sources_raw = source.sources_raw; | ||||||
|  |         target.modifiers_raw = source.modifiers_raw; | ||||||
|  |         target.ops_raw = source.ops_raw; | ||||||
|  |         target.const_color = source.const_color; | ||||||
|  |         target.scales_raw = source.scales_raw; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Keeps a reference to the Pica registers and immediately emits the routine for `key`. | ||||||
|  | TevConfig::TevConfig(const Pica::Regs& regs_, const TevConfigKey& key) | ||||||
|  |     : regs{regs_} { | ||||||
|  |     WriteTevConfig(key); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Nothing to release beyond what the members and base classes clean up themselves. | ||||||
|  | TevConfig::~TevConfig() = default; | ||||||
|  |  | ||||||
|  | /// Invokes the compiled TEV routine for one fragment and returns the combiner output. | ||||||
|  | /// Every color is handed over as a packed 32-bit word; the texture colors are passed by | ||||||
|  | /// pointer to the first of the four entries. | ||||||
|  | Common::Vec4<u8> TevConfig::Run(std::span<Common::Vec4<u8>, 4> texture_color_, | ||||||
|  |                                 Common::Vec4<u8> primary_color_, | ||||||
|  |                                 Common::Vec4<u8> primary_fragment_color_, | ||||||
|  |                                 Common::Vec4<u8> secondary_fragment_color_, | ||||||
|  |                                 u64 tev_combiner_buffer_color) { | ||||||
|  |     // The last argument carries two values: the secondary fragment color in the low 32 bits | ||||||
|  |     // and the combiner buffer color in the high 32 bits. | ||||||
|  |     const u64 packed_secondary_and_buffer = | ||||||
|  |         (tev_combiner_buffer_color << 32) | std::bit_cast<u32>(secondary_fragment_color_); | ||||||
|  |  | ||||||
|  |     u32* const packed_textures = reinterpret_cast<u32*>(texture_color_.data()); | ||||||
|  |     const u32 packed_primary = std::bit_cast<u32>(primary_color_); | ||||||
|  |     const u32 packed_primary_fragment = std::bit_cast<u32>(primary_fragment_color_); | ||||||
|  |  | ||||||
|  |     const u32 packed_output = program(packed_textures, packed_primary, packed_primary_fragment, | ||||||
|  |                                       packed_secondary_and_buffer); | ||||||
|  |     return std::bit_cast<Common::Vec4<u8>>(packed_output); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Emits the machine code that evaluates all TEV stages described by `key` and stores its | ||||||
|  | /// entry point in `program`. | ||||||
|  | /// | ||||||
|  | /// The emitted routine receives the texture color array pointer, the primary color, the | ||||||
|  | /// primary fragment color and a 64-bit word holding the secondary fragment color (low half) | ||||||
|  | /// and the initial combiner buffer color (high half). It returns the combiner output packed | ||||||
|  | /// into 32 bits. While running, every color lives in an XMM register as four 32-bit lanes. | ||||||
|  | /// | ||||||
|  | /// NOTE(review): `regs.texturing.tev_combiner_buffer_input` is read while emitting, so its | ||||||
|  | /// value is baked into the generated code; presumably callers must not reuse the routine | ||||||
|  | /// after that register changes -- confirm against the cache lookup. | ||||||
|  | void TevConfig::WriteTevConfig(const TevConfigKey& key) { | ||||||
|  |     program = (CompiledTevFun*)getCurr(); | ||||||
|  |  | ||||||
|  |     constexpr Xbyak::Reg TEXTURE_COLOR = ABI_PARAM1; | ||||||
|  |     constexpr Xbyak::Reg PRIMARY_COLOR = ABI_PARAM2; | ||||||
|  |     constexpr Xbyak::Reg PRIMARY_FRAGMENT_COLOR = ABI_PARAM3; | ||||||
|  |     constexpr Xbyak::Reg SECONDARY_FRAGMENT_COLOR = ABI_PARAM4; | ||||||
|  |  | ||||||
|  |     // Save callee state | ||||||
|  |     ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16); | ||||||
|  |  | ||||||
|  |     // Clear the combiner registers and zero constant | ||||||
|  |     pxor(COMBINER_OUTPUT, COMBINER_OUTPUT); | ||||||
|  |     pxor(COMBINER_BUFFER, COMBINER_BUFFER); | ||||||
|  |     pxor(ZERO, ZERO); | ||||||
|  |  | ||||||
|  |     // Used to set an xmm register to the max color | ||||||
|  |     static const __m128i max = _mm_set1_epi32(255); | ||||||
|  |     mov(rax, reinterpret_cast<size_t>(&max)); | ||||||
|  |     movdqu(MAX_COLOR, xword[rax]); | ||||||
|  |  | ||||||
|  |     // Used to set an xmm register to the mid color | ||||||
|  |     static const __m128i mid = _mm_set1_epi32(128); | ||||||
|  |     mov(rax, reinterpret_cast<size_t>(&mid)); | ||||||
|  |     movdqu(MID_COLOR, xword[rax]); | ||||||
|  |  | ||||||
|  |     // Load next_combiner_buffer from the high half of the fourth argument and widen its | ||||||
|  |     // four bytes to four 32-bit lanes. | ||||||
|  |     mov(rax, ABI_PARAM4); | ||||||
|  |     shr(rax, 32); | ||||||
|  |     vmovd(NEXT_COMBINER_BUFFER, eax); | ||||||
|  |     pmovzxbd(NEXT_COMBINER_BUFFER, NEXT_COMBINER_BUFFER); | ||||||
|  |  | ||||||
|  |     for (u32 tev_stage_index = 0; tev_stage_index < key.stages.size(); ++tev_stage_index) { | ||||||
|  |         const auto& tev_stage = key.stages[tev_stage_index]; | ||||||
|  |         if (!IsPassThroughTevStage(tev_stage)) { | ||||||
|  |             using Source = TexturingRegs::TevStageConfig::Source; | ||||||
|  |  | ||||||
|  |             // Emits code that loads the requested source into `dest` as four 32-bit lanes. | ||||||
|  |             const auto get_source = [&](const Xbyak::Xmm& dest, Source source) { | ||||||
|  |                 switch (source) { | ||||||
|  |                 case Source::PrimaryColor: | ||||||
|  |                     vmovd(dest, PRIMARY_COLOR.cvt32()); | ||||||
|  |                     pmovzxbd(dest, dest); | ||||||
|  |                     break; | ||||||
|  |                 case Source::PrimaryFragmentColor: | ||||||
|  |                     vmovd(dest, PRIMARY_FRAGMENT_COLOR.cvt32()); | ||||||
|  |                     pmovzxbd(dest, dest); | ||||||
|  |                     break; | ||||||
|  |                 case Source::SecondaryFragmentColor: | ||||||
|  |                     vmovd(dest, SECONDARY_FRAGMENT_COLOR.cvt32()); | ||||||
|  |                     pmovzxbd(dest, dest); | ||||||
|  |                     break; | ||||||
|  |                 case Source::Texture0: | ||||||
|  |                 case Source::Texture1: | ||||||
|  |                 case Source::Texture2: | ||||||
|  |                 case Source::Texture3: { | ||||||
|  |                     const u32 index = static_cast<u32>(source) - static_cast<u32>(Source::Texture0); | ||||||
|  |                     vmovd(dest, dword[TEXTURE_COLOR + index * sizeof(u32)]); | ||||||
|  |                     pmovzxbd(dest, dest); | ||||||
|  |                     break; | ||||||
|  |                 } | ||||||
|  |                 case Source::PreviousBuffer: | ||||||
|  |                     vmovdqa(dest, COMBINER_BUFFER); | ||||||
|  |                     break; | ||||||
|  |                 case Source::Constant: | ||||||
|  |                     mov(eax, tev_stage.const_color); | ||||||
|  |                     vmovd(dest, eax); | ||||||
|  |                     pmovzxbd(dest, dest); | ||||||
|  |                     break; | ||||||
|  |                 case Source::Previous: | ||||||
|  |                     vmovdqa(dest, COMBINER_OUTPUT); | ||||||
|  |                     break; | ||||||
|  |                 default: | ||||||
|  |                     LOG_ERROR(HW_GPU, "Unknown color combiner source {}", source); | ||||||
|  |                     UNIMPLEMENTED(); | ||||||
|  |                     vmovdqa(dest, ZERO); | ||||||
|  |                 } | ||||||
|  |                 return dest; | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             // Load the color modifiers to VEC0/1/2. | ||||||
|  |             GetColorModifier(get_source(VEC0, tev_stage.color_source1), tev_stage.color_modifier1); | ||||||
|  |             GetColorModifier(get_source(VEC1, tev_stage.color_source2), tev_stage.color_modifier2); | ||||||
|  |             GetColorModifier(get_source(VEC2, tev_stage.color_source3), tev_stage.color_modifier3); | ||||||
|  |  | ||||||
|  |             // Combine the texture colors to COLOR_OUTPUT. | ||||||
|  |             ColorCombine(COLOR_OUTPUT, tev_stage.color_op); | ||||||
|  |  | ||||||
|  |             if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) { | ||||||
|  |                 // Result of Dot3_RGBA operation is also placed to the alpha component | ||||||
|  |                 vmovd(ALPHA_OUTPUT.cvt32(), COLOR_OUTPUT); | ||||||
|  |             } else { | ||||||
|  |                 // Load the alpha modifiers to A0/1/2, using VEC0/1/2 as staging registers. | ||||||
|  |                 GetAlphaModifier(get_source(VEC0, tev_stage.alpha_source1), A0, | ||||||
|  |                                  tev_stage.alpha_modifier1); | ||||||
|  |                 GetAlphaModifier(get_source(VEC1, tev_stage.alpha_source2), A1, | ||||||
|  |                                  tev_stage.alpha_modifier2); | ||||||
|  |                 GetAlphaModifier(get_source(VEC2, tev_stage.alpha_source3), A2, | ||||||
|  |                                  tev_stage.alpha_modifier3); | ||||||
|  |  | ||||||
|  |                 // Combine the alpha values to ALPHA_OUTPUT. | ||||||
|  |                 AlphaCombine(ALPHA_OUTPUT, tev_stage.alpha_op); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // Load the color multiplier to an SSE vector. | ||||||
|  |             mov(eax, tev_stage.GetColorMultiplier()); | ||||||
|  |             movd(VEC0, eax); | ||||||
|  |             pshufd(VEC0, VEC0, 0); | ||||||
|  |  | ||||||
|  |             // Multiply color output with the multiplier and take the minimum. | ||||||
|  |             pmulld(COLOR_OUTPUT, VEC0); | ||||||
|  |             pminsd(COLOR_OUTPUT, MAX_COLOR); | ||||||
|  |  | ||||||
|  |             // Load the alpha multiplier, multiply it with the alpha output. | ||||||
|  |             mov(eax, tev_stage.GetAlphaMultiplier()); | ||||||
|  |             imul(ALPHA_OUTPUT, eax); | ||||||
|  |  | ||||||
|  |             // Load result to a vector and take the minimum | ||||||
|  |             movd(VEC0, ALPHA_OUTPUT); | ||||||
|  |             pshufd(VEC0, VEC0, 0); | ||||||
|  |             pminsd(VEC0, MAX_COLOR); | ||||||
|  |  | ||||||
|  |             // Blend vectors to get the combiner output: lanes 0-2 come from the color | ||||||
|  |             // result, lane 3 from the alpha result. | ||||||
|  |             vpblendd(COMBINER_OUTPUT, COLOR_OUTPUT, VEC0, 0b1000); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // Set combiner buffer to the next buffer. This must copy all 128 bits: the buffer | ||||||
|  |         // holds four 32-bit lanes, and a 64-bit move would zero the blue and alpha lanes. | ||||||
|  |         vmovdqa(COMBINER_BUFFER, NEXT_COMBINER_BUFFER); | ||||||
|  |  | ||||||
|  |         if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor( | ||||||
|  |                 tev_stage_index)) { | ||||||
|  |             // Take lanes 0-2 from the combiner output, keep lane 3 of the buffer. | ||||||
|  |             vpblendd(NEXT_COMBINER_BUFFER, COMBINER_OUTPUT, NEXT_COMBINER_BUFFER, 0b1000); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha( | ||||||
|  |                 tev_stage_index)) { | ||||||
|  |             // Take lane 3 from the combiner output, keep lanes 0-2 of the buffer. | ||||||
|  |             vpblendd(NEXT_COMBINER_BUFFER, COMBINER_OUTPUT, NEXT_COMBINER_BUFFER, 0b0111); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Pack combiner output to a u32 to be returned: lane 0 ends up in the lowest byte and | ||||||
|  |     // lane 3 in the highest. | ||||||
|  |     vpextrd(edx, COMBINER_OUTPUT, 3); | ||||||
|  |     vpextrd(eax, COMBINER_OUTPUT, 2); | ||||||
|  |     sal(edx, 8); | ||||||
|  |     or_(eax, edx); | ||||||
|  |     vpextrd(edx, COMBINER_OUTPUT, 1); | ||||||
|  |     sal(eax, 8); | ||||||
|  |     or_(edx, eax); | ||||||
|  |     vmovd(eax, COMBINER_OUTPUT); | ||||||
|  |     sal(edx, 8); | ||||||
|  |     or_(eax, edx); | ||||||
|  |  | ||||||
|  |     ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16); | ||||||
|  |     ret(); | ||||||
|  |     ready(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Emits code that applies a color modifier in place to the source held in `dest`. | ||||||
|  | /// | ||||||
|  | /// @param dest   Register holding the source as four 32-bit lanes; receives the result. | ||||||
|  | /// @param factor Modifier selecting the color itself, one of its components broadcast to the | ||||||
|  | ///               color lanes, or the 255-complement of either. | ||||||
|  | void TevConfig::GetColorModifier(const Xbyak::Xmm& dest, TevStageConfig::ColorModifier factor) { | ||||||
|  |     using ColorModifier = TevStageConfig::ColorModifier; | ||||||
|  |  | ||||||
|  |     // Copies component `comp` into lanes 0-2. The selector bits for lane 3 stay zero, so | ||||||
|  |     // that lane receives component 0. | ||||||
|  |     const auto broadcast = [&](u32 comp) { | ||||||
|  |         const u8 mask = comp | (comp << 2) | (comp << 4); | ||||||
|  |         vpshufd(dest, dest, mask); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     switch (factor) { | ||||||
|  |     case ColorModifier::SourceColor: | ||||||
|  |         // Keep lanes 0-2 and clear lane 3. | ||||||
|  |         vpblendd(dest, dest, ZERO, 0b1000); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::OneMinusSourceColor: | ||||||
|  |         vpsubd(dest, MAX_COLOR, dest); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::SourceAlpha: | ||||||
|  |         broadcast(3); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::OneMinusSourceAlpha: | ||||||
|  |         broadcast(3); | ||||||
|  |         vpsubd(dest, MAX_COLOR, dest); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::SourceRed: | ||||||
|  |         broadcast(0); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::OneMinusSourceRed: | ||||||
|  |         broadcast(0); | ||||||
|  |         vpsubd(dest, MAX_COLOR, dest); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::SourceGreen: | ||||||
|  |         broadcast(1); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::OneMinusSourceGreen: | ||||||
|  |         broadcast(1); | ||||||
|  |         vpsubd(dest, MAX_COLOR, dest); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::SourceBlue: | ||||||
|  |         broadcast(2); | ||||||
|  |         break; | ||||||
|  |     case ColorModifier::OneMinusSourceBlue: | ||||||
|  |         broadcast(2); | ||||||
|  |         vpsubd(dest, MAX_COLOR, dest); | ||||||
|  |         break; | ||||||
|  |     default: | ||||||
|  |         UNREACHABLE(); | ||||||
|  |     } | ||||||
|  |     // Keep only the low byte of every lane. | ||||||
|  |     pand(dest, MAX_COLOR); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Emits the instructions that combine the colour operands of one TEV stage. | ||||||
|  | // | ||||||
|  | // The operands are read from VEC0, VEC1 and VEC2 (all three may be clobbered) and the combined | ||||||
|  | // colour is written to `dest`, which is finally ANDed with MAX_COLOR. All arithmetic is done on | ||||||
|  | // 32-bit lanes, so every shift below is a per-dword shift. | ||||||
|  | // | ||||||
|  | // NOTE(review): assumes ZERO / MID_COLOR / MAX_COLOR hold 0 / 128 / 255 in every lane and that | ||||||
|  | // each operand lane is within [0, 255]; both are established outside this function - confirm. | ||||||
|  | void TevConfig::ColorCombine(const Xbyak::Xmm& dest, TevStageConfig::Operation op) { | ||||||
|  |     using Operation = TevStageConfig::Operation; | ||||||
|  |  | ||||||
|  |     switch (op) { | ||||||
|  |     case Operation::Replace: | ||||||
|  |         vmovdqa(dest, VEC0); | ||||||
|  |         break; | ||||||
|  |     case Operation::Modulate: | ||||||
|  |         // dest = in0 * in1 / 255 | ||||||
|  |         pmulld(VEC0, VEC1); | ||||||
|  |         // Shift each dword on its own; a quadword shift would pull bits of the odd lanes into | ||||||
|  |         // the even ones. | ||||||
|  |         vpsrld(dest, VEC0, 8); // TODO: This is a very crude approximation of division by 255 | ||||||
|  |         break; | ||||||
|  |     case Operation::Add: | ||||||
|  |         // dest = min(in0 + in1, 255) | ||||||
|  |         vpaddd(VEC0, VEC0, VEC1); | ||||||
|  |         vpminsd(dest, MAX_COLOR, VEC0); | ||||||
|  |         break; | ||||||
|  |     case Operation::AddSigned: | ||||||
|  |         // dest = clamp(in0 + in1 - 128, 0, 255) | ||||||
|  |         vpaddd(VEC0, VEC0, VEC1); | ||||||
|  |         vpsubd(VEC0, VEC0, MID_COLOR); | ||||||
|  |         vpminsd(VEC0, VEC0, MAX_COLOR); | ||||||
|  |         vpmaxsd(dest, VEC0, ZERO); | ||||||
|  |         break; | ||||||
|  |     case Operation::Lerp: | ||||||
|  |         // dest = (in0 * in2 + in1 * (255 - in2)) / 255 | ||||||
|  |         pmulld(VEC0, VEC2); | ||||||
|  |         // The weight of in1 is (255 - in2), not (in2 - 255). | ||||||
|  |         vpsubd(VEC2, MAX_COLOR, VEC2); | ||||||
|  |         pmulld(VEC1, VEC2); | ||||||
|  |         vpaddd(dest, VEC0, VEC1); | ||||||
|  |         // Scale the sum that was just written to dest (not VEC0, which only holds in0 * in2). | ||||||
|  |         vpsrld(dest, dest, 8); // TODO: This is a very crude approximation of division by 255 | ||||||
|  |         break; | ||||||
|  |     case Operation::Subtract: | ||||||
|  |         // dest = max(in0 - in1, 0) | ||||||
|  |         psubd(VEC0, VEC1); | ||||||
|  |         vpmaxsd(dest, VEC0, ZERO); | ||||||
|  |         break; | ||||||
|  |     case Operation::MultiplyThenAdd: | ||||||
|  |         // dest = min((in0 * in1 + 255 * in2) / 255, 255) | ||||||
|  |         pmulld(VEC0, VEC1); | ||||||
|  |         pmulld(VEC2, MAX_COLOR); | ||||||
|  |         paddd(VEC0, VEC2); | ||||||
|  |         // Scale first and clamp afterwards; clamping the unscaled sum to 255 would make the | ||||||
|  |         // shifted result always zero. | ||||||
|  |         psrld(VEC0, 8); // TODO: This is a very crude approximation of division by 255 | ||||||
|  |         vpminsd(dest, VEC0, MAX_COLOR); | ||||||
|  |         break; | ||||||
|  |     case Operation::AddThenMultiply: | ||||||
|  |         // dest = min(in0 + in1, 255) * in2 / 255 | ||||||
|  |         paddd(VEC0, VEC1); | ||||||
|  |         pminsd(VEC0, MAX_COLOR); | ||||||
|  |         pmulld(VEC0, VEC2); | ||||||
|  |         vpsrld(dest, VEC0, 8); // TODO: This is a very crude approximation of division by 255 | ||||||
|  |         break; | ||||||
|  |     case Operation::Dot3_RGB: | ||||||
|  |     case Operation::Dot3_RGBA: | ||||||
|  |         // dest = clamp(sum over r,g,b of ((2 * in0 - 255) * (2 * in1 - 255) + 128) / 256, 0, 255), | ||||||
|  |         // replicated to every lane. | ||||||
|  |         pslld(VEC0, 1); | ||||||
|  |         psubd(VEC0, MAX_COLOR); | ||||||
|  |         pslld(VEC1, 1); | ||||||
|  |         psubd(VEC1, MAX_COLOR); | ||||||
|  |         pmulld(VEC0, VEC1); | ||||||
|  |         paddd(VEC0, MID_COLOR); | ||||||
|  |         // The products are signed, so the shift has to be arithmetic; a logical shift turns | ||||||
|  |         // negative terms into huge positive ones. Note this rounds towards negative infinity. | ||||||
|  |         psrad(VEC0, 8); | ||||||
|  |         // Drop lane 3 so that only lanes 0-2 take part in the horizontal sum. | ||||||
|  |         vpblendd(VEC0, VEC0, ZERO, 0b1000); | ||||||
|  |         phaddd(VEC0, VEC0); | ||||||
|  |         phaddd(VEC0, VEC0); | ||||||
|  |         pminsd(VEC0, MAX_COLOR); | ||||||
|  |         pmaxsd(VEC0, ZERO); | ||||||
|  |         pshufd(dest, VEC0, 0); | ||||||
|  |         break; | ||||||
|  |     default: | ||||||
|  |         LOG_ERROR(HW_GPU, "Unknown color combiner operation {}", static_cast<int>(op)); | ||||||
|  |         UNIMPLEMENTED(); | ||||||
|  |     } | ||||||
|  |     pand(dest, MAX_COLOR); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Emits code that extracts one 32-bit lane of `src` into the scalar register `dest`, | ||||||
|  | // optionally replacing it by 255 minus that value, as selected by `factor`. | ||||||
|  | // | ||||||
|  | // Lanes 0..3 are addressed as red, green, blue and alpha. The "OneMinus" variants use eax as a | ||||||
|  | // scratch register. | ||||||
|  | void TevConfig::GetAlphaModifier(const Xbyak::Xmm& src, const Xbyak::Reg32& dest, | ||||||
|  |                                  TevStageConfig::AlphaModifier factor) { | ||||||
|  |     using AlphaModifier = TevStageConfig::AlphaModifier; | ||||||
|  |  | ||||||
|  |     // dest = src[lane] | ||||||
|  |     const auto load = [&](u32 lane) { vpextrd(dest, src, lane); }; | ||||||
|  |  | ||||||
|  |     // dest = 255 - src[lane]; the lane is staged in eax. | ||||||
|  |     const auto load_inverted = [&](u32 lane) { | ||||||
|  |         vpextrd(eax, src, lane); | ||||||
|  |         mov(dest, 255); | ||||||
|  |         sub(dest, eax); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     switch (factor) { | ||||||
|  |     case AlphaModifier::SourceAlpha: | ||||||
|  |         load(3); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::OneMinusSourceAlpha: | ||||||
|  |         load_inverted(3); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::SourceRed: | ||||||
|  |         load(0); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::OneMinusSourceRed: | ||||||
|  |         load_inverted(0); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::SourceGreen: | ||||||
|  |         load(1); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::OneMinusSourceGreen: | ||||||
|  |         load_inverted(1); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::SourceBlue: | ||||||
|  |         load(2); | ||||||
|  |         break; | ||||||
|  |     case AlphaModifier::OneMinusSourceBlue: | ||||||
|  |         load_inverted(2); | ||||||
|  |         break; | ||||||
|  |     default: | ||||||
|  |         UNREACHABLE(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Emits the scalar instructions that combine the alpha operands of one TEV stage. | ||||||
|  | // | ||||||
|  | // The operands are read from A0, A1 and A2 (which may be clobbered), eax is used as scratch and | ||||||
|  | // the combined alpha is left in `dest` for every operation. | ||||||
|  | // | ||||||
|  | // NOTE(review): `dest` is assumed to be a register other than A0/A1/A2/eax, and the operands | ||||||
|  | // are assumed to be within [0, 255] - confirm against the caller. | ||||||
|  | void TevConfig::AlphaCombine(const Xbyak::Reg32& dest, TevStageConfig::Operation op) { | ||||||
|  |     using Operation = TevStageConfig::Operation; | ||||||
|  |  | ||||||
|  |     // dst = src / 255, computed as (src * 0x80808081) >> 39 on the 64-bit registers. | ||||||
|  |     // dst is overwritten before src is read, so the two must not be the same register. | ||||||
|  |     const auto div_255 = [&](const Reg32& dst, const Reg32& src) { | ||||||
|  |         mov(dst, 0x80808081); | ||||||
|  |         imul(dst.cvt64(), src.cvt64()); | ||||||
|  |         shr(dst.cvt64(), 39); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     switch (op) { | ||||||
|  |     case Operation::Replace: | ||||||
|  |         mov(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::Modulate: | ||||||
|  |         // dest = A0 * A1 / 255 | ||||||
|  |         imul(A0, A1); | ||||||
|  |         div_255(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::Add: | ||||||
|  |         // dest = min(A0 + A1, 255) | ||||||
|  |         add(A0, A1); | ||||||
|  |         mov(eax, 255); | ||||||
|  |         cmp(A0, eax); | ||||||
|  |         // Saturate only when the sum exceeds 255 (cmovb would do the opposite). | ||||||
|  |         cmova(A0, eax); | ||||||
|  |         mov(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::AddSigned: | ||||||
|  |         // dest = clamp(A0 + A1 - 128, 0, 255) | ||||||
|  |         xor_(eax, eax); | ||||||
|  |         add(A0, A1); | ||||||
|  |         sub(A0, 128); | ||||||
|  |         test(A0, A0); | ||||||
|  |         cmovg(eax, A0); // eax = max(A0, 0) | ||||||
|  |         cmp(eax, 255); | ||||||
|  |         mov(A0, 255); | ||||||
|  |         cmovb(A0, eax); // A0 = min(eax, 255) | ||||||
|  |         // Hand the clamped value to the caller like every other operation does. | ||||||
|  |         mov(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::Lerp: | ||||||
|  |         // dest = (A0 * A2 + A1 * (255 - A2)) / 255 | ||||||
|  |         imul(A0, A2); | ||||||
|  |         mov(eax, 255); | ||||||
|  |         sub(eax, A2); | ||||||
|  |         imul(A1, eax); | ||||||
|  |         add(A0, A1); | ||||||
|  |         div_255(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::Subtract: | ||||||
|  |         // dest = max(A0 - A1, 0) | ||||||
|  |         sub(A0, A1); | ||||||
|  |         xor_(eax, eax); | ||||||
|  |         test(A0, A0); | ||||||
|  |         cmovl(A0, eax); | ||||||
|  |         mov(dest, A0); | ||||||
|  |         break; | ||||||
|  |     case Operation::MultiplyThenAdd: | ||||||
|  |         // dest = min((A0 * A1 + 255 * A2) / 255, 255) | ||||||
|  |         imul(A0, A1); | ||||||
|  |         // dest = A2 * 256 - A2 = A2 * 255 | ||||||
|  |         mov(dest, A2); | ||||||
|  |         shl(dest, 8); | ||||||
|  |         sub(dest, A2); | ||||||
|  |         add(dest, A0); | ||||||
|  |         div_255(eax, dest); | ||||||
|  |         cmp(eax, 255); | ||||||
|  |         mov(dest, 255); | ||||||
|  |         cmovb(dest, eax); | ||||||
|  |         break; | ||||||
|  |     case Operation::AddThenMultiply: | ||||||
|  |         // dest = min(A0 + A1, 255) * A2 / 255 | ||||||
|  |         add(A0, A1); | ||||||
|  |         cmp(A0, 255); | ||||||
|  |         mov(eax, 255); | ||||||
|  |         cmovg(A0, eax); | ||||||
|  |         imul(A0, A2); | ||||||
|  |         div_255(dest, A0); | ||||||
|  |         break; | ||||||
|  |     default: | ||||||
|  |         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation {}", static_cast<int>(op)); | ||||||
|  |         UNIMPLEMENTED(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | } // namespace SwRenderer | ||||||
							
								
								
									
										64
									
								
								src/video_core/renderer_software/sw_tev_jit.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								src/video_core/renderer_software/sw_tev_jit.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,64 @@ | |||||||
|  | // Copyright 2023 Citra Emulator Project | ||||||
|  | // Licensed under GPLv2 or any later version | ||||||
|  | // Refer to the license.txt file included. | ||||||
|  |  | ||||||
|  | #pragma once | ||||||
|  |  | ||||||
|  | #include <span> | ||||||
|  | #include <xbyak/xbyak.h> | ||||||
|  |  | ||||||
|  | #include "common/hash.h" | ||||||
|  | #include "common/vector_math.h" | ||||||
|  | #include "video_core/regs_texturing.h" | ||||||
|  |  | ||||||
|  | namespace Pica { | ||||||
|  | struct State; | ||||||
|  | struct Regs; | ||||||
|  | } // namespace Pica | ||||||
|  |  | ||||||
|  | namespace SwRenderer { | ||||||
|  |  | ||||||
|  | // Key describing a TEV configuration: the configuration of six TEV stages. | ||||||
|  | struct TevConfigKey { | ||||||
|  |     // Builds the key from the texturing registers (defined out of line). | ||||||
|  |     explicit TevConfigKey(const Pica::TexturingRegs& regs); | ||||||
|  |  | ||||||
|  |     // Returns a 64-bit hash computed over the raw bytes of this object. | ||||||
|  |     // NOTE(review): hashing raw bytes presumably relies on the struct having no uninitialised | ||||||
|  |     // padding - confirm. | ||||||
|  |     u64 Hash() const noexcept { | ||||||
|  |         return Common::ComputeHash64(this, sizeof(TevConfigKey)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // One entry per TEV stage. | ||||||
|  |     std::array<Pica::TexturingRegs::TevStageConfig, 6> stages; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // Xbyak code generator for a TEV configuration. The private helpers emit the instructions for | ||||||
|  | // the colour/alpha modifiers and combiners of a stage. | ||||||
|  | class TevConfig : public Xbyak::CodeGenerator { | ||||||
|  | public: | ||||||
|  |     explicit TevConfig(const Pica::Regs& regs, const TevConfigKey& key); | ||||||
|  |     ~TevConfig(); | ||||||
|  |  | ||||||
|  |     // Takes four texture colours, the primary colour, the primary and secondary fragment | ||||||
|  |     // colours and the combiner buffer colour, and returns an RGBA8 colour. | ||||||
|  |     // NOTE(review): presumably forwards to the compiled `program`; the definition is not | ||||||
|  |     // visible here - confirm. | ||||||
|  |     Common::Vec4<u8> Run(std::span<Common::Vec4<u8>, 4> texture_color_, | ||||||
|  |                          Common::Vec4<u8> primary_color_, Common::Vec4<u8> primary_fragment_color_, | ||||||
|  |                          Common::Vec4<u8> secondary_fragment_color_, u64 tev_combiner_buffer_color); | ||||||
|  |  | ||||||
|  | private: | ||||||
|  |     // Defined out of line; NOTE(review): presumably emits the whole program for `key` - confirm. | ||||||
|  |     void WriteTevConfig(const TevConfigKey& key); | ||||||
|  |  | ||||||
|  |     // Emits code applying the colour modifier `factor` to the vector register `dest` in place. | ||||||
|  |     void GetColorModifier(const Xbyak::Xmm& dest, | ||||||
|  |                           Pica::TexturingRegs::TevStageConfig::ColorModifier factor); | ||||||
|  |  | ||||||
|  |     // Emits code extracting the component of `src` selected by `factor` (or 255 minus it) into | ||||||
|  |     // the scalar register `dest`. | ||||||
|  |     void GetAlphaModifier(const Xbyak::Xmm& src, const Xbyak::Reg32& dest, | ||||||
|  |                           Pica::TexturingRegs::TevStageConfig::AlphaModifier factor); | ||||||
|  |  | ||||||
|  |     // Emits code combining the colour operands with `op`; the result goes to `dest`. | ||||||
|  |     void ColorCombine(const Xbyak::Xmm& dest, Pica::TexturingRegs::TevStageConfig::Operation op); | ||||||
|  |  | ||||||
|  |     // Emits code combining the alpha operands with `op`; the result goes to `dest`. | ||||||
|  |     void AlphaCombine(const Xbyak::Reg32& dest, Pica::TexturingRegs::TevStageConfig::Operation op); | ||||||
|  |  | ||||||
|  | private: | ||||||
|  |     // Stored by reference: the referenced registers must outlive this object. | ||||||
|  |     const Pica::Regs& regs; | ||||||
|  |  | ||||||
|  |     // Signature of the generated function: it returns the result packed into a u32. The last | ||||||
|  |     // argument carries two values in a single u64. | ||||||
|  |     using CompiledTevFun = u32(u32* texture_color, u32 primary_color, u32 primary_fragment_color, | ||||||
|  |                                u64 secondary_fragment_color_and_tev_combiner_buffer_color); | ||||||
|  |  | ||||||
|  |     // Entry point of the generated code; null until it has been assigned. | ||||||
|  |     CompiledTevFun* program = nullptr; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // Owning map from a 64-bit key to a TevConfig; the key is used directly as the hash value. | ||||||
|  | // NOTE(review): presumably the key is TevConfigKey::Hash() - confirm against the user of the cache. | ||||||
|  | using TevCache = std::unordered_map<u64, std::unique_ptr<TevConfig>, Common::IdentityHash<u64>>; | ||||||
|  |  | ||||||
|  | } // namespace SwRenderer | ||||||
		Reference in New Issue
	
	Block a user