Compare commits
	
		
			3 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					cfdb10a7ba | ||
| 
						 | 
					8012b28b92 | ||
| 
						 | 
					531d280461 | 
@@ -95,6 +95,8 @@ add_library(video_core STATIC
 | 
			
		||||
    renderer_software/sw_proctex.h
 | 
			
		||||
    renderer_software/sw_rasterizer.cpp
 | 
			
		||||
    renderer_software/sw_rasterizer.h
 | 
			
		||||
    renderer_software/sw_tev_jit.cpp
 | 
			
		||||
    renderer_software/sw_tev_jit.h
 | 
			
		||||
    renderer_software/sw_texturing.cpp
 | 
			
		||||
    renderer_software/sw_texturing.h
 | 
			
		||||
    renderer_vulkan/pica_to_vk.h
 | 
			
		||||
 
 | 
			
		||||
@@ -8,6 +8,7 @@
 | 
			
		||||
#include "core/hw/hw.h"
 | 
			
		||||
#include "core/hw/lcd.h"
 | 
			
		||||
#include "video_core/renderer_software/renderer_software.h"
 | 
			
		||||
#include "video_core/renderer_software/sw_rasterizer.h"
 | 
			
		||||
 | 
			
		||||
namespace SwRenderer {
 | 
			
		||||
 | 
			
		||||
@@ -17,6 +18,10 @@ RendererSoftware::RendererSoftware(Core::System& system, Frontend::EmuWindow& wi
 | 
			
		||||
 | 
			
		||||
RendererSoftware::~RendererSoftware() = default;
 | 
			
		||||
 | 
			
		||||
VideoCore::RasterizerInterface* RendererSoftware::Rasterizer() const {
 | 
			
		||||
    return rasterizer.get();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void RendererSoftware::SwapBuffers() {
 | 
			
		||||
    PrepareRenderTarget();
 | 
			
		||||
    EndFrame();
 | 
			
		||||
 
 | 
			
		||||
@@ -5,7 +5,6 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#include "video_core/renderer_base.h"
 | 
			
		||||
#include "video_core/renderer_software/sw_rasterizer.h"
 | 
			
		||||
 | 
			
		||||
namespace Core {
 | 
			
		||||
class System;
 | 
			
		||||
@@ -19,19 +18,18 @@ struct ScreenInfo {
 | 
			
		||||
    std::vector<u8> pixels;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class RasterizerSoftware;
 | 
			
		||||
 | 
			
		||||
class RendererSoftware : public VideoCore::RendererBase {
 | 
			
		||||
public:
 | 
			
		||||
    explicit RendererSoftware(Core::System& system, Frontend::EmuWindow& window);
 | 
			
		||||
    ~RendererSoftware() override;
 | 
			
		||||
 | 
			
		||||
    [[nodiscard]] VideoCore::RasterizerInterface* Rasterizer() const override {
 | 
			
		||||
        return rasterizer.get();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    [[nodiscard]] const ScreenInfo& Screen(VideoCore::ScreenId id) const noexcept {
 | 
			
		||||
        return screen_infos[static_cast<u32>(id)];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    VideoCore::RasterizerInterface* Rasterizer() const override;
 | 
			
		||||
    void SwapBuffers() override;
 | 
			
		||||
    void TryPresent(int timeout_ms, bool is_secondary) override {}
 | 
			
		||||
    void Sync() override {}
 | 
			
		||||
 
 | 
			
		||||
@@ -41,10 +41,22 @@ Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferR
 | 
			
		||||
 | 
			
		||||
Framebuffer::~Framebuffer() = default;
 | 
			
		||||
 | 
			
		||||
void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetColorBufferPhysicalAddress();
 | 
			
		||||
void Framebuffer::Bind() {
 | 
			
		||||
    PAddr addr = regs.framebuffer.GetColorBufferPhysicalAddress();
 | 
			
		||||
    if (color_addr != addr) [[unlikely]] {
 | 
			
		||||
        color_addr = addr;
 | 
			
		||||
        color_buffer = memory.GetPhysicalPointer(color_addr);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
 | 
			
		||||
    if (depth_addr != addr) [[unlikely]] {
 | 
			
		||||
        depth_addr = addr;
 | 
			
		||||
        depth_buffer = memory.GetPhysicalPointer(depth_addr);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Framebuffer::DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
 | 
			
		||||
    // NOTE: The framebuffer height register contains the actual FB height minus one.
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
@@ -54,8 +66,7 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const {
 | 
			
		||||
        GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value()));
 | 
			
		||||
    const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
 | 
			
		||||
                           coarse_y * framebuffer.width * bytes_per_pixel;
 | 
			
		||||
    u8* depth_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    u8* dst_pixel = depth_buffer + dst_offset;
 | 
			
		||||
    u8* dst_pixel = color_buffer + dst_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.color_format) {
 | 
			
		||||
    case FramebufferRegs::ColorFormat::RGBA8:
 | 
			
		||||
@@ -80,10 +91,8 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const {
 | 
			
		||||
const Common::Vec4<u8> Framebuffer::GetPixel(u32 x, u32 y) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetColorBufferPhysicalAddress();
 | 
			
		||||
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
 | 
			
		||||
    const u32 coarse_y = y & ~7;
 | 
			
		||||
@@ -91,7 +100,6 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const {
 | 
			
		||||
        GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value()));
 | 
			
		||||
    const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
 | 
			
		||||
                           coarse_y * framebuffer.width * bytes_per_pixel;
 | 
			
		||||
    const u8* color_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    const u8* src_pixel = color_buffer + src_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.color_format) {
 | 
			
		||||
@@ -114,10 +122,8 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const {
 | 
			
		||||
    return {0, 0, 0, 0};
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
u32 Framebuffer::GetDepth(int x, int y) const {
 | 
			
		||||
u32 Framebuffer::GetDepth(u32 x, u32 y) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
 | 
			
		||||
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
 | 
			
		||||
    const u32 coarse_y = y & ~7;
 | 
			
		||||
@@ -125,7 +131,6 @@ u32 Framebuffer::GetDepth(int x, int y) const {
 | 
			
		||||
    const u32 stride = framebuffer.width * bytes_per_pixel;
 | 
			
		||||
 | 
			
		||||
    const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
 | 
			
		||||
    const u8* depth_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    const u8* src_pixel = depth_buffer + src_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.depth_format) {
 | 
			
		||||
@@ -143,10 +148,8 @@ u32 Framebuffer::GetDepth(int x, int y) const {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
u8 Framebuffer::GetStencil(int x, int y) const {
 | 
			
		||||
u8 Framebuffer::GetStencil(u32 x, u32 y) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
 | 
			
		||||
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
 | 
			
		||||
    const u32 coarse_y = y & ~7;
 | 
			
		||||
@@ -154,7 +157,6 @@ u8 Framebuffer::GetStencil(int x, int y) const {
 | 
			
		||||
    const u32 stride = framebuffer.width * bytes_per_pixel;
 | 
			
		||||
 | 
			
		||||
    const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
 | 
			
		||||
    const u8* depth_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    const u8* src_pixel = depth_buffer + src_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.depth_format) {
 | 
			
		||||
@@ -169,10 +171,8 @@ u8 Framebuffer::GetStencil(int x, int y) const {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Framebuffer::SetDepth(int x, int y, u32 value) const {
 | 
			
		||||
void Framebuffer::SetDepth(u32 x, u32 y, u32 value) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
 | 
			
		||||
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
 | 
			
		||||
    const u32 coarse_y = y & ~7;
 | 
			
		||||
@@ -180,7 +180,6 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const {
 | 
			
		||||
    const u32 stride = framebuffer.width * bytes_per_pixel;
 | 
			
		||||
 | 
			
		||||
    const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
 | 
			
		||||
    u8* depth_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    u8* dst_pixel = depth_buffer + dst_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.depth_format) {
 | 
			
		||||
@@ -201,10 +200,8 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Framebuffer::SetStencil(int x, int y, u8 value) const {
 | 
			
		||||
void Framebuffer::SetStencil(u32 x, u32 y, u8 value) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
 | 
			
		||||
 | 
			
		||||
    y = framebuffer.height - y;
 | 
			
		||||
 | 
			
		||||
    const u32 coarse_y = y & ~7;
 | 
			
		||||
@@ -212,7 +209,6 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const {
 | 
			
		||||
    const u32 stride = framebuffer.width * bytes_per_pixel;
 | 
			
		||||
 | 
			
		||||
    const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
 | 
			
		||||
    u8* depth_buffer = memory.GetPhysicalPointer(addr);
 | 
			
		||||
    u8* dst_pixel = depth_buffer + dst_offset;
 | 
			
		||||
 | 
			
		||||
    switch (framebuffer.depth_format) {
 | 
			
		||||
@@ -231,7 +227,7 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const {
 | 
			
		||||
void Framebuffer::DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const {
 | 
			
		||||
    const auto& framebuffer = regs.framebuffer;
 | 
			
		||||
    const auto& shadow = regs.shadow;
 | 
			
		||||
    const PAddr addr = framebuffer.GetColorBufferPhysicalAddress();
 | 
			
		||||
 
 | 
			
		||||
@@ -23,30 +23,37 @@ public:
 | 
			
		||||
    explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer);
 | 
			
		||||
    ~Framebuffer();
 | 
			
		||||
 | 
			
		||||
    /// Updates the framebuffer addresses from the PICA registers.
 | 
			
		||||
    void Bind();
 | 
			
		||||
 | 
			
		||||
    /// Draws a pixel at the specified coordinates.
 | 
			
		||||
    void DrawPixel(int x, int y, const Common::Vec4<u8>& color) const;
 | 
			
		||||
    void DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const;
 | 
			
		||||
 | 
			
		||||
    /// Returns the current color at the specified coordinates.
 | 
			
		||||
    [[nodiscard]] const Common::Vec4<u8> GetPixel(int x, int y) const;
 | 
			
		||||
    [[nodiscard]] const Common::Vec4<u8> GetPixel(u32 x, u32 y) const;
 | 
			
		||||
 | 
			
		||||
    /// Returns the depth value at the specified coordinates.
 | 
			
		||||
    [[nodiscard]] u32 GetDepth(int x, int y) const;
 | 
			
		||||
    [[nodiscard]] u32 GetDepth(u32 x, u32 y) const;
 | 
			
		||||
 | 
			
		||||
    /// Returns the stencil value at the specified coordinates.
 | 
			
		||||
    [[nodiscard]] u8 GetStencil(int x, int y) const;
 | 
			
		||||
    [[nodiscard]] u8 GetStencil(u32 x, u32 y) const;
 | 
			
		||||
 | 
			
		||||
    /// Stores the provided depth value at the specified coordinates.
 | 
			
		||||
    void SetDepth(int x, int y, u32 value) const;
 | 
			
		||||
    void SetDepth(u32 x, u32 y, u32 value) const;
 | 
			
		||||
 | 
			
		||||
    /// Stores the provided stencil value at the specified coordinates.
 | 
			
		||||
    void SetStencil(int x, int y, u8 value) const;
 | 
			
		||||
    void SetStencil(u32 x, u32 y, u8 value) const;
 | 
			
		||||
 | 
			
		||||
    /// Draws a pixel to the shadow buffer.
 | 
			
		||||
    void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const;
 | 
			
		||||
    void DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const;
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    Memory::MemorySystem& memory;
 | 
			
		||||
    const Pica::FramebufferRegs& regs;
 | 
			
		||||
    PAddr color_addr;
 | 
			
		||||
    u8* color_buffer{};
 | 
			
		||||
    PAddr depth_addr;
 | 
			
		||||
    u8* depth_buffer{};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref);
 | 
			
		||||
 
 | 
			
		||||
@@ -95,8 +95,14 @@ private:
 | 
			
		||||
 | 
			
		||||
} // Anonymous namespace
 | 
			
		||||
 | 
			
		||||
// Kirby Blowout Blast relies on the combiner output of a previous draw
 | 
			
		||||
// in order to render the sky correctly.
 | 
			
		||||
static thread_local Common::Vec4<u8> combiner_output{};
 | 
			
		||||
 | 
			
		||||
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
 | 
			
		||||
    : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {}
 | 
			
		||||
    : memory{memory_}, state{Pica::g_state}, regs{state.regs},
 | 
			
		||||
      num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
 | 
			
		||||
      sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}
 | 
			
		||||
 | 
			
		||||
void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0,
 | 
			
		||||
                                     const Pica::Shader::OutputVertex& v1,
 | 
			
		||||
@@ -289,167 +295,194 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
 | 
			
		||||
 | 
			
		||||
    const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
 | 
			
		||||
 | 
			
		||||
    auto textures = regs.texturing.GetTextures();
 | 
			
		||||
    const auto textures = regs.texturing.GetTextures();
 | 
			
		||||
    const auto tev_stages = regs.texturing.GetTevStages();
 | 
			
		||||
    for (u32 i = 0; i < texture_data.size(); i++) {
 | 
			
		||||
        const PAddr addr = textures[i].config.GetPhysicalAddress();
 | 
			
		||||
        if (addr) {
 | 
			
		||||
            texture_data[i] = memory.GetPhysicalPointer(addr);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fb.Bind();
 | 
			
		||||
 | 
			
		||||
    if (use_jit) {
 | 
			
		||||
        const TevConfigKey key{regs.texturing};
 | 
			
		||||
        auto [it, new_fun] = tev_cache.try_emplace(key.Hash());
 | 
			
		||||
        if (new_fun) {
 | 
			
		||||
            it->second = std::make_unique<TevConfig>(regs, key);
 | 
			
		||||
        }
 | 
			
		||||
        tev_config = it->second.get();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Enter rasterization loop, starting at the center of the topleft bounding box corner.
 | 
			
		||||
    // TODO: Not sure if looping through x first might be faster
 | 
			
		||||
    for (u16 y = min_y + 8; y < max_y; y += 0x10) {
 | 
			
		||||
        for (u16 x = min_x + 8; x < max_x; x += 0x10) {
 | 
			
		||||
            // Do not process the pixel if it's inside the scissor box and the scissor mode is set
 | 
			
		||||
            // to Exclude.
 | 
			
		||||
            if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
 | 
			
		||||
                if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
 | 
			
		||||
        const auto process_scanline = [&, y] {
 | 
			
		||||
            for (u16 x = min_x + 8; x < max_x; x += 0x10) {
 | 
			
		||||
                // Do not process the pixel if it's inside the scissor box and the scissor mode is
 | 
			
		||||
                // set to Exclude.
 | 
			
		||||
                if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
 | 
			
		||||
                    if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
 | 
			
		||||
                        continue;
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                // Calculate the barycentric coordinates w0, w1 and w2
 | 
			
		||||
                const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
 | 
			
		||||
                const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
 | 
			
		||||
                const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
 | 
			
		||||
                const s32 wsum = w0 + w1 + w2;
 | 
			
		||||
 | 
			
		||||
                // If current pixel is not covered by the current primitive
 | 
			
		||||
                if (w0 < 0 || w1 < 0 || w2 < 0) {
 | 
			
		||||
                    continue;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // Calculate the barycentric coordinates w0, w1 and w2
 | 
			
		||||
            const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
 | 
			
		||||
            const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
 | 
			
		||||
            const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
 | 
			
		||||
            const s32 wsum = w0 + w1 + w2;
 | 
			
		||||
                const auto baricentric_coordinates = Common::MakeVec(
 | 
			
		||||
                    f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
 | 
			
		||||
                    f24::FromFloat32(static_cast<f32>(w2)));
 | 
			
		||||
                const f24 interpolated_w_inverse =
 | 
			
		||||
                    f24::One() / Common::Dot(w_inverse, baricentric_coordinates);
 | 
			
		||||
 | 
			
		||||
            // If current pixel is not covered by the current primitive
 | 
			
		||||
            if (w0 < 0 || w1 < 0 || w2 < 0) {
 | 
			
		||||
                continue;
 | 
			
		||||
            }
 | 
			
		||||
                // interpolated_z = z / w
 | 
			
		||||
                const float interpolated_z_over_w =
 | 
			
		||||
                    (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
 | 
			
		||||
                     v2.screenpos[2].ToFloat32() * w2) /
 | 
			
		||||
                    wsum;
 | 
			
		||||
 | 
			
		||||
            const auto baricentric_coordinates = Common::MakeVec(
 | 
			
		||||
                f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
 | 
			
		||||
                f24::FromFloat32(static_cast<f32>(w2)));
 | 
			
		||||
            const f24 interpolated_w_inverse =
 | 
			
		||||
                f24::One() / Common::Dot(w_inverse, baricentric_coordinates);
 | 
			
		||||
                // Not fully accurate. About 3 bits in precision are missing.
 | 
			
		||||
                // Z-Buffer (z / w * scale + offset)
 | 
			
		||||
                const float depth_scale =
 | 
			
		||||
                    f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
 | 
			
		||||
                const float depth_offset =
 | 
			
		||||
                    f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
 | 
			
		||||
                float depth = interpolated_z_over_w * depth_scale + depth_offset;
 | 
			
		||||
 | 
			
		||||
            // interpolated_z = z / w
 | 
			
		||||
            const float interpolated_z_over_w =
 | 
			
		||||
                (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
 | 
			
		||||
                 v2.screenpos[2].ToFloat32() * w2) /
 | 
			
		||||
                wsum;
 | 
			
		||||
                // Potentially switch to W-Buffer
 | 
			
		||||
                if (regs.rasterizer.depthmap_enable ==
 | 
			
		||||
                    Pica::RasterizerRegs::DepthBuffering::WBuffering) {
 | 
			
		||||
                    // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
 | 
			
		||||
                    depth *= interpolated_w_inverse.ToFloat32() * wsum;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
            // Not fully accurate. About 3 bits in precision are missing.
 | 
			
		||||
            // Z-Buffer (z / w * scale + offset)
 | 
			
		||||
            const float depth_scale =
 | 
			
		||||
                f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
 | 
			
		||||
            const float depth_offset =
 | 
			
		||||
                f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
 | 
			
		||||
            float depth = interpolated_z_over_w * depth_scale + depth_offset;
 | 
			
		||||
                // Clamp the result
 | 
			
		||||
                depth = std::clamp(depth, 0.0f, 1.0f);
 | 
			
		||||
 | 
			
		||||
            // Potentially switch to W-Buffer
 | 
			
		||||
            if (regs.rasterizer.depthmap_enable ==
 | 
			
		||||
                Pica::RasterizerRegs::DepthBuffering::WBuffering) {
 | 
			
		||||
                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
 | 
			
		||||
                depth *= interpolated_w_inverse.ToFloat32() * wsum;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // Clamp the result
 | 
			
		||||
            depth = std::clamp(depth, 0.0f, 1.0f);
 | 
			
		||||
 | 
			
		||||
            /**
 | 
			
		||||
             * Perspective correct attribute interpolation:
 | 
			
		||||
             * Attribute values cannot be calculated by simple linear interpolation since
 | 
			
		||||
             * they are not linear in screen space. For example, when interpolating a
 | 
			
		||||
             * texture coordinate across two vertices, something simple like
 | 
			
		||||
             *     u = (u0*w0 + u1*w1)/(w0+w1)
 | 
			
		||||
             * will not work. However, the attribute value divided by the
 | 
			
		||||
             * clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
 | 
			
		||||
             * in screenspace. Hence, we can linearly interpolate these two independently and
 | 
			
		||||
             * calculate the interpolated attribute by dividing the results.
 | 
			
		||||
             * I.e.
 | 
			
		||||
             *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
 | 
			
		||||
             *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
 | 
			
		||||
             *     u = u_over_w / one_over_w
 | 
			
		||||
             *
 | 
			
		||||
             * The generalization to three vertices is straightforward in barycentric coordinates.
 | 
			
		||||
             **/
 | 
			
		||||
            const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
 | 
			
		||||
                auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
 | 
			
		||||
                f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates);
 | 
			
		||||
                return interpolated_attr_over_w * interpolated_w_inverse;
 | 
			
		||||
            };
 | 
			
		||||
 | 
			
		||||
            const Common::Vec4<u8> primary_color{
 | 
			
		||||
                static_cast<u8>(
 | 
			
		||||
                    round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
 | 
			
		||||
                              .ToFloat32() *
 | 
			
		||||
                          255)),
 | 
			
		||||
                static_cast<u8>(
 | 
			
		||||
                    round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
 | 
			
		||||
                              .ToFloat32() *
 | 
			
		||||
                          255)),
 | 
			
		||||
                static_cast<u8>(
 | 
			
		||||
                    round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
 | 
			
		||||
                              .ToFloat32() *
 | 
			
		||||
                          255)),
 | 
			
		||||
                static_cast<u8>(
 | 
			
		||||
                    round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
 | 
			
		||||
                              .ToFloat32() *
 | 
			
		||||
                          255)),
 | 
			
		||||
            };
 | 
			
		||||
 | 
			
		||||
            std::array<Common::Vec2<f24>, 3> uv;
 | 
			
		||||
            uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
 | 
			
		||||
            uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
 | 
			
		||||
            uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
 | 
			
		||||
            uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
 | 
			
		||||
            uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
 | 
			
		||||
            uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
 | 
			
		||||
 | 
			
		||||
            // Sample bound texture units.
 | 
			
		||||
            const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
 | 
			
		||||
            const auto texture_color = TextureColor(uv, textures, tc0_w);
 | 
			
		||||
 | 
			
		||||
            Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
 | 
			
		||||
            Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 | 
			
		||||
 | 
			
		||||
            if (!regs.lighting.disable) {
 | 
			
		||||
                const auto normquat =
 | 
			
		||||
                    Common::Quaternion<f32>{
 | 
			
		||||
                        {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
 | 
			
		||||
                         get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
 | 
			
		||||
                         get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
 | 
			
		||||
                        get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
 | 
			
		||||
                    }
 | 
			
		||||
                        .Normalized();
 | 
			
		||||
 | 
			
		||||
                const Common::Vec3f view{
 | 
			
		||||
                    get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
 | 
			
		||||
                    get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
 | 
			
		||||
                    get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
 | 
			
		||||
                /**
 | 
			
		||||
                 * Perspective correct attribute interpolation:
 | 
			
		||||
                 * Attribute values cannot be calculated by simple linear interpolation since
 | 
			
		||||
                 * they are not linear in screen space. For example, when interpolating a
 | 
			
		||||
                 * texture coordinate across two vertices, something simple like
 | 
			
		||||
                 *     u = (u0*w0 + u1*w1)/(w0+w1)
 | 
			
		||||
                 * will not work. However, the attribute value divided by the
 | 
			
		||||
                 * clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
 | 
			
		||||
                 * in screenspace. Hence, we can linearly interpolate these two independently and
 | 
			
		||||
                 * calculate the interpolated attribute by dividing the results.
 | 
			
		||||
                 * I.e.
 | 
			
		||||
                 *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
 | 
			
		||||
                 *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
 | 
			
		||||
                 *     u = u_over_w / one_over_w
 | 
			
		||||
                 *
 | 
			
		||||
                 * The generalization to three vertices is straightforward in barycentric
 | 
			
		||||
                 * coordinates.
 | 
			
		||||
                 **/
 | 
			
		||||
                const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
 | 
			
		||||
                    auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
 | 
			
		||||
                    f24 interpolated_attr_over_w =
 | 
			
		||||
                        Common::Dot(attr_over_w, baricentric_coordinates);
 | 
			
		||||
                    return interpolated_attr_over_w * interpolated_w_inverse;
 | 
			
		||||
                };
 | 
			
		||||
                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
 | 
			
		||||
                    regs.lighting, state.lighting, normquat, view, texture_color);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // Write the TEV stages.
 | 
			
		||||
            WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
 | 
			
		||||
                           secondary_fragment_color);
 | 
			
		||||
                const Common::Vec4<u8> primary_color{
 | 
			
		||||
                    static_cast<u8>(
 | 
			
		||||
                        round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
 | 
			
		||||
                                  .ToFloat32() *
 | 
			
		||||
                              255)),
 | 
			
		||||
                    static_cast<u8>(
 | 
			
		||||
                        round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
 | 
			
		||||
                                  .ToFloat32() *
 | 
			
		||||
                              255)),
 | 
			
		||||
                    static_cast<u8>(
 | 
			
		||||
                        round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
 | 
			
		||||
                                  .ToFloat32() *
 | 
			
		||||
                              255)),
 | 
			
		||||
                    static_cast<u8>(
 | 
			
		||||
                        round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
 | 
			
		||||
                                  .ToFloat32() *
 | 
			
		||||
                              255)),
 | 
			
		||||
                };
 | 
			
		||||
 | 
			
		||||
            const auto& output_merger = regs.framebuffer.output_merger;
 | 
			
		||||
            if (output_merger.fragment_operation_mode ==
 | 
			
		||||
                FramebufferRegs::FragmentOperationMode::Shadow) {
 | 
			
		||||
                u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
 | 
			
		||||
                // Use green color as the shadow intensity
 | 
			
		||||
                u8 stencil = combiner_output.y;
 | 
			
		||||
                fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
 | 
			
		||||
                // Skip the normal output merger pipeline if it is in shadow mode
 | 
			
		||||
                continue;
 | 
			
		||||
            }
 | 
			
		||||
                std::array<Common::Vec2<f24>, 3> uv;
 | 
			
		||||
                uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
 | 
			
		||||
                uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
 | 
			
		||||
                uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
 | 
			
		||||
                uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
 | 
			
		||||
                uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
 | 
			
		||||
                uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
 | 
			
		||||
 | 
			
		||||
            // Does alpha testing happen before or after stencil?
 | 
			
		||||
            if (!DoAlphaTest(combiner_output.a())) {
 | 
			
		||||
                continue;
 | 
			
		||||
                // Sample bound texture units.
 | 
			
		||||
                const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
 | 
			
		||||
                auto texture_color = TextureColor(uv, textures, tc0_w);
 | 
			
		||||
 | 
			
		||||
                Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
 | 
			
		||||
                Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 | 
			
		||||
 | 
			
		||||
                if (!regs.lighting.disable) {
 | 
			
		||||
                    const auto normquat =
 | 
			
		||||
                        Common::Quaternion<f32>{
 | 
			
		||||
                            {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
 | 
			
		||||
                                 .ToFloat32(),
 | 
			
		||||
                             get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
 | 
			
		||||
                                 .ToFloat32(),
 | 
			
		||||
                             get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
 | 
			
		||||
                                 .ToFloat32()},
 | 
			
		||||
                            get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
 | 
			
		||||
                        }
 | 
			
		||||
                            .Normalized();
 | 
			
		||||
 | 
			
		||||
                    const Common::Vec3f view{
 | 
			
		||||
                        get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
 | 
			
		||||
                        get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
 | 
			
		||||
                        get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
 | 
			
		||||
                    };
 | 
			
		||||
                    std::tie(primary_fragment_color, secondary_fragment_color) =
 | 
			
		||||
                        ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view,
 | 
			
		||||
                                               texture_color);
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                // Write the TEV stages.
 | 
			
		||||
                WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
 | 
			
		||||
                               secondary_fragment_color);
 | 
			
		||||
 | 
			
		||||
                const auto& output_merger = regs.framebuffer.output_merger;
 | 
			
		||||
                if (output_merger.fragment_operation_mode ==
 | 
			
		||||
                    FramebufferRegs::FragmentOperationMode::Shadow) {
 | 
			
		||||
                    u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
 | 
			
		||||
                    // Use green color as the shadow intensity
 | 
			
		||||
                    u8 stencil = combiner_output.y;
 | 
			
		||||
                    fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
 | 
			
		||||
                    // Skip the normal output merger pipeline if it is in shadow mode
 | 
			
		||||
                    continue;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                // Does alpha testing happen before or after stencil?
 | 
			
		||||
                if (!DoAlphaTest(combiner_output.a())) {
 | 
			
		||||
                    continue;
 | 
			
		||||
                }
 | 
			
		||||
                WriteFog(depth);
 | 
			
		||||
                if (!DoDepthStencilTest(x, y, depth)) {
 | 
			
		||||
                    continue;
 | 
			
		||||
                }
 | 
			
		||||
                const auto result = PixelColor(x, y);
 | 
			
		||||
                if (regs.framebuffer.framebuffer.allow_color_write != 0) {
 | 
			
		||||
                    fb.DrawPixel(x >> 4, y >> 4, result);
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            WriteFog(combiner_output, depth);
 | 
			
		||||
            if (!DoDepthStencilTest(x, y, depth)) {
 | 
			
		||||
                continue;
 | 
			
		||||
            }
 | 
			
		||||
            const auto result = PixelColor(x, y, combiner_output);
 | 
			
		||||
            if (regs.framebuffer.framebuffer.allow_color_write != 0) {
 | 
			
		||||
                fb.DrawPixel(x >> 4, y >> 4, result);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        };
 | 
			
		||||
        sw_workers.QueueWork(std::move(process_scanline));
 | 
			
		||||
    }
 | 
			
		||||
    sw_workers.WaitForRequests();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
 | 
			
		||||
@@ -538,11 +571,10 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
 | 
			
		||||
            t = texture.config.height - 1 -
 | 
			
		||||
                GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 | 
			
		||||
 | 
			
		||||
            const u8* texture_data = memory.GetPhysicalPointer(texture_address);
 | 
			
		||||
            const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format);
 | 
			
		||||
 | 
			
		||||
            // TODO: Apply the min and mag filters to the texture
 | 
			
		||||
            texture_color[i] = LookupTexture(texture_data, s, t, info);
 | 
			
		||||
            texture_color[i] = LookupTexture(texture_data[i], s, t, info);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D ||
 | 
			
		||||
@@ -572,8 +604,7 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
 | 
			
		||||
    return texture_color;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
 | 
			
		||||
                                                Common::Vec4<u8>& combiner_output) const {
 | 
			
		||||
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y) const {
 | 
			
		||||
    const auto dest = fb.GetPixel(x >> 4, y >> 4);
 | 
			
		||||
    Common::Vec4<u8> blend_output = combiner_output;
 | 
			
		||||
 | 
			
		||||
@@ -664,10 +695,20 @@ Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void RasterizerSoftware::WriteTevConfig(
 | 
			
		||||
    std::span<const Common::Vec4<u8>, 4> texture_color,
 | 
			
		||||
    std::span<Common::Vec4<u8>, 4> texture_color,
 | 
			
		||||
    std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages,
 | 
			
		||||
    Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color,
 | 
			
		||||
    Common::Vec4<u8> secondary_fragment_color) {
 | 
			
		||||
 | 
			
		||||
#if CITRA_ARCH(x86_64)
 | 
			
		||||
    if (use_jit) {
 | 
			
		||||
        const u32 tev_combiner_buffer_color = regs.texturing.tev_combiner_buffer_color.raw;
 | 
			
		||||
        combiner_output = tev_config->Run(texture_color, primary_color, primary_fragment_color,
 | 
			
		||||
                                          secondary_fragment_color, tev_combiner_buffer_color);
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * Texture environment - consists of 6 stages of color and alpha combining.
 | 
			
		||||
     * Color combiners take three input color values from some source (e.g. interpolated
 | 
			
		||||
@@ -731,6 +772,7 @@ void RasterizerSoftware::WriteTevConfig(
 | 
			
		||||
            GetColorModifier(tev_stage.color_modifier2, get_source(tev_stage.color_source2)),
 | 
			
		||||
            GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)),
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result);
 | 
			
		||||
 | 
			
		||||
        u8 alpha_output;
 | 
			
		||||
@@ -768,7 +810,7 @@ void RasterizerSoftware::WriteTevConfig(
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const {
 | 
			
		||||
void RasterizerSoftware::WriteFog(float depth) const {
 | 
			
		||||
    /**
 | 
			
		||||
     * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
 | 
			
		||||
     * store the depth etc. Using float for now until we know more about Pica datatypes.
 | 
			
		||||
 
 | 
			
		||||
@@ -4,13 +4,20 @@
 | 
			
		||||
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#include <memory>
 | 
			
		||||
#include <span>
 | 
			
		||||
#include <unordered_map>
 | 
			
		||||
 | 
			
		||||
#include "common/arch.h"
 | 
			
		||||
#include "common/thread_worker.h"
 | 
			
		||||
#include "video_core/rasterizer_interface.h"
 | 
			
		||||
#include "video_core/regs_texturing.h"
 | 
			
		||||
#include "video_core/renderer_software/sw_clipper.h"
 | 
			
		||||
#include "video_core/renderer_software/sw_framebuffer.h"
 | 
			
		||||
 | 
			
		||||
#if CITRA_ARCH(x86_64)
 | 
			
		||||
#include "video_core/renderer_software/sw_tev_jit.h"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Pica::Shader {
 | 
			
		||||
struct OutputVertex;
 | 
			
		||||
}
 | 
			
		||||
@@ -52,16 +59,16 @@ private:
 | 
			
		||||
        std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const;
 | 
			
		||||
 | 
			
		||||
    /// Returns the final pixel color with blending or logic ops applied.
 | 
			
		||||
    Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const;
 | 
			
		||||
    Common::Vec4<u8> PixelColor(u16 x, u16 y) const;
 | 
			
		||||
 | 
			
		||||
    /// Emulates the TEV configuration and returns the combiner output.
 | 
			
		||||
    void WriteTevConfig(std::span<const Common::Vec4<u8>, 4> texture_color,
 | 
			
		||||
    void WriteTevConfig(std::span<Common::Vec4<u8>, 4> texture_color,
 | 
			
		||||
                        std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages,
 | 
			
		||||
                        Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color,
 | 
			
		||||
                        Common::Vec4<u8> secondary_fragment_color);
 | 
			
		||||
 | 
			
		||||
    /// Blends fog to the combiner output if enabled.
 | 
			
		||||
    void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const;
 | 
			
		||||
    void WriteFog(float depth) const;
 | 
			
		||||
 | 
			
		||||
    /// Performs the alpha test. Returns false if the test failed.
 | 
			
		||||
    bool DoAlphaTest(u8 alpha) const;
 | 
			
		||||
@@ -73,10 +80,13 @@ private:
 | 
			
		||||
    Memory::MemorySystem& memory;
 | 
			
		||||
    Pica::State& state;
 | 
			
		||||
    const Pica::Regs& regs;
 | 
			
		||||
    bool use_jit{true};
 | 
			
		||||
    size_t num_sw_threads;
 | 
			
		||||
    Common::ThreadWorker sw_workers;
 | 
			
		||||
    Framebuffer fb;
 | 
			
		||||
    // Kirby Blowout Blast relies on the combiner output of a previous draw
 | 
			
		||||
    // in order to render the sky correctly.
 | 
			
		||||
    Common::Vec4<u8> combiner_output{};
 | 
			
		||||
    TevCache tev_cache;
 | 
			
		||||
    TevConfig* tev_config{};
 | 
			
		||||
    std::array<const u8*, 3> texture_data{};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
} // namespace SwRenderer
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										473
									
								
								src/video_core/renderer_software/sw_tev_jit.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										473
									
								
								src/video_core/renderer_software/sw_tev_jit.cpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,473 @@
 | 
			
		||||
// Copyright 2023 Citra Emulator Project
 | 
			
		||||
// Licensed under GPLv2 or any later version
 | 
			
		||||
// Refer to the license.txt file included.
 | 
			
		||||
 | 
			
		||||
#include <bit>
 | 
			
		||||
#include <emmintrin.h>
 | 
			
		||||
#include "common/x64/xbyak_abi.h"
 | 
			
		||||
#include "video_core/regs.h"
 | 
			
		||||
#include "video_core/renderer_software/sw_tev_jit.h"
 | 
			
		||||
 | 
			
		||||
namespace SwRenderer {
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
 | 
			
		||||
using namespace Common::X64;
 | 
			
		||||
using namespace Xbyak::util;
 | 
			
		||||
using Pica::TexturingRegs;
 | 
			
		||||
using Xbyak::Reg32;
 | 
			
		||||
using Xbyak::Reg64;
 | 
			
		||||
using Xbyak::Xmm;
 | 
			
		||||
using TevStageConfig = Pica::TexturingRegs::TevStageConfig;
 | 
			
		||||
 | 
			
		||||
constexpr Reg32 A0 = r11d;
 | 
			
		||||
constexpr Reg32 A1 = r12d;
 | 
			
		||||
constexpr Reg32 A2 = r13d;
 | 
			
		||||
constexpr Reg32 ALPHA_OUTPUT = r14d;
 | 
			
		||||
constexpr Xmm COMBINER_OUTPUT = xmm0;
 | 
			
		||||
constexpr Xmm COMBINER_BUFFER = xmm1;
 | 
			
		||||
constexpr Xmm NEXT_COMBINER_BUFFER = xmm2;
 | 
			
		||||
constexpr Xmm VEC0 = xmm3;
 | 
			
		||||
constexpr Xmm VEC1 = xmm4;
 | 
			
		||||
constexpr Xmm VEC2 = xmm5;
 | 
			
		||||
constexpr Xmm COLOR_OUTPUT = xmm6;
 | 
			
		||||
constexpr Xmm ZERO = xmm13;
 | 
			
		||||
constexpr Xmm MID_COLOR = xmm14;
 | 
			
		||||
constexpr Xmm MAX_COLOR = xmm15;
 | 
			
		||||
 | 
			
		||||
bool IsPassThroughTevStage(const TevStageConfig& stage) {
 | 
			
		||||
    return (stage.color_op == TevStageConfig::Operation::Replace &&
 | 
			
		||||
            stage.alpha_op == TevStageConfig::Operation::Replace &&
 | 
			
		||||
            stage.color_source1 == TevStageConfig::Source::Previous &&
 | 
			
		||||
            stage.alpha_source1 == TevStageConfig::Source::Previous &&
 | 
			
		||||
            stage.color_modifier1 == TevStageConfig::ColorModifier::SourceColor &&
 | 
			
		||||
            stage.alpha_modifier1 == TevStageConfig::AlphaModifier::SourceAlpha &&
 | 
			
		||||
            stage.GetColorMultiplier() == 1 && stage.GetAlphaMultiplier() == 1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
} // Anonymous namespace
 | 
			
		||||
 | 
			
		||||
/// Builds a cache key by copying the raw source, modifier, operation, constant
/// color and scale fields of every TEV stage reported by the texturing registers.
TevConfigKey::TevConfigKey(const Pica::TexturingRegs& regs) {
    const auto& source_stages = regs.GetTevStages();
    size_t slot = 0;
    for (const auto& source_stage : source_stages) {
        auto& key_stage = stages[slot];
        key_stage.sources_raw = source_stage.sources_raw;
        key_stage.modifiers_raw = source_stage.modifiers_raw;
        key_stage.ops_raw = source_stage.ops_raw;
        key_stage.const_color = source_stage.const_color;
        key_stage.scales_raw = source_stage.scales_raw;
        ++slot;
    }
}
 | 
			
		||||
 | 
			
		||||
/// Stores a reference to the PICA registers and immediately emits the program
/// for the TEV configuration described by `key`.
TevConfig::TevConfig(const Pica::Regs& regs_, const TevConfigKey& key) : regs(regs_) {
    WriteTevConfig(key);
}

/// Nothing to release explicitly; member destructors do the work.
TevConfig::~TevConfig() = default;
 | 
			
		||||
 | 
			
		||||
/// Invokes the compiled TEV program for one fragment.
///
/// Each color argument is reinterpreted as a packed 32-bit word. The secondary
/// fragment color and the combiner buffer color share the fourth argument: the
/// former occupies the low 32 bits, the latter the high 32 bits.
///
/// @return The program's 32-bit result reinterpreted as a four-component color.
Common::Vec4<u8> TevConfig::Run(std::span<Common::Vec4<u8>, 4> texture_color_,
                                Common::Vec4<u8> primary_color_,
                                Common::Vec4<u8> primary_fragment_color_,
                                Common::Vec4<u8> secondary_fragment_color_,
                                u64 tev_combiner_buffer_color) {
    // Low half: secondary fragment color. High half: combiner buffer color.
    const u32 secondary_word = std::bit_cast<u32>(secondary_fragment_color_);
    const u64 packed_fourth_arg = secondary_word | (tev_combiner_buffer_color << 32);

    const u32 primary_word = std::bit_cast<u32>(primary_color_);
    const u32 primary_fragment_word = std::bit_cast<u32>(primary_fragment_color_);

    // The program reads the texture colors through a pointer to packed words.
    u32* texel_words = reinterpret_cast<u32*>(texture_color_.data());

    const u32 packed_output =
        program(texel_words, primary_word, primary_fragment_word, packed_fourth_arg);
    return std::bit_cast<Common::Vec4<u8>>(packed_output);
}
 | 
			
		||||
 | 
			
		||||
void TevConfig::WriteTevConfig(const TevConfigKey& key) {
 | 
			
		||||
    program = (CompiledTevFun*)getCurr();
 | 
			
		||||
 | 
			
		||||
    constexpr Xbyak::Reg TEXTURE_COLOR = ABI_PARAM1;
 | 
			
		||||
    constexpr Xbyak::Reg PRIMARY_COLOR = ABI_PARAM2;
 | 
			
		||||
    constexpr Xbyak::Reg PRIMARY_FRAGMENT_COLOR = ABI_PARAM3;
 | 
			
		||||
    constexpr Xbyak::Reg SECONDARY_FRAGMENT_COLOR = ABI_PARAM4;
 | 
			
		||||
 | 
			
		||||
    // Save calle state
 | 
			
		||||
    ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
 | 
			
		||||
 | 
			
		||||
    // Clear the combiner registers and zero constant
 | 
			
		||||
    pxor(COMBINER_OUTPUT, COMBINER_OUTPUT);
 | 
			
		||||
    pxor(COMBINER_BUFFER, COMBINER_BUFFER);
 | 
			
		||||
    pxor(ZERO, ZERO);
 | 
			
		||||
 | 
			
		||||
    // Used to set an xmm register to the max color
 | 
			
		||||
    static const __m128i max = _mm_set1_epi32(255);
 | 
			
		||||
    mov(rax, reinterpret_cast<size_t>(&max));
 | 
			
		||||
    movdqu(MAX_COLOR, xword[rax]);
 | 
			
		||||
 | 
			
		||||
    // Used to set an xmm register to the mid color
 | 
			
		||||
    static const __m128i mid = _mm_set1_epi32(128);
 | 
			
		||||
    mov(rax, reinterpret_cast<size_t>(&mid));
 | 
			
		||||
    movdqu(MID_COLOR, xword[rax]);
 | 
			
		||||
 | 
			
		||||
    // Load next_combiner_buffer
 | 
			
		||||
    mov(rax, ABI_PARAM4);
 | 
			
		||||
    shr(rax, 32);
 | 
			
		||||
    vmovd(NEXT_COMBINER_BUFFER, eax);
 | 
			
		||||
    pmovzxbd(NEXT_COMBINER_BUFFER, NEXT_COMBINER_BUFFER);
 | 
			
		||||
 | 
			
		||||
    for (u32 tev_stage_index = 0; tev_stage_index < key.stages.size(); ++tev_stage_index) {
 | 
			
		||||
        const auto& tev_stage = key.stages[tev_stage_index];
 | 
			
		||||
        if (!IsPassThroughTevStage(tev_stage)) {
 | 
			
		||||
            using Source = TexturingRegs::TevStageConfig::Source;
 | 
			
		||||
 | 
			
		||||
            const auto get_source = [&](const Xbyak::Xmm& dest, Source source) {
 | 
			
		||||
                switch (source) {
 | 
			
		||||
                case Source::PrimaryColor:
 | 
			
		||||
                    vmovd(dest, PRIMARY_COLOR.cvt32());
 | 
			
		||||
                    pmovzxbd(dest, dest);
 | 
			
		||||
                    break;
 | 
			
		||||
                case Source::PrimaryFragmentColor:
 | 
			
		||||
                    vmovd(dest, PRIMARY_FRAGMENT_COLOR.cvt32());
 | 
			
		||||
                    pmovzxbd(dest, dest);
 | 
			
		||||
                    break;
 | 
			
		||||
                case Source::SecondaryFragmentColor:
 | 
			
		||||
                    vmovd(dest, SECONDARY_FRAGMENT_COLOR.cvt32());
 | 
			
		||||
                    pmovzxbd(dest, dest);
 | 
			
		||||
                    break;
 | 
			
		||||
                case Source::Texture0:
 | 
			
		||||
                case Source::Texture1:
 | 
			
		||||
                case Source::Texture2:
 | 
			
		||||
                case Source::Texture3: {
 | 
			
		||||
                    const u32 index = static_cast<u32>(source) - static_cast<u32>(Source::Texture0);
 | 
			
		||||
                    vmovd(dest, dword[TEXTURE_COLOR + index * sizeof(u32)]);
 | 
			
		||||
                    pmovzxbd(dest, dest);
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
                case Source::PreviousBuffer:
 | 
			
		||||
                    vmovdqa(dest, COMBINER_BUFFER);
 | 
			
		||||
                    break;
 | 
			
		||||
                case Source::Constant:
 | 
			
		||||
                    mov(eax, tev_stage.const_color);
 | 
			
		||||
                    vmovd(dest, eax);
 | 
			
		||||
                    pmovzxbd(dest, dest);
 | 
			
		||||
                    break;
 | 
			
		||||
                case Source::Previous:
 | 
			
		||||
                    vmovdqa(dest, COMBINER_OUTPUT);
 | 
			
		||||
                    break;
 | 
			
		||||
                default:
 | 
			
		||||
                    LOG_ERROR(HW_GPU, "Unknown color combiner source {}", source);
 | 
			
		||||
                    UNIMPLEMENTED();
 | 
			
		||||
                    vmovdqa(dest, ZERO);
 | 
			
		||||
                }
 | 
			
		||||
                return dest;
 | 
			
		||||
            };
 | 
			
		||||
 | 
			
		||||
            // Load the color modifiers to VEC0/1/2.
 | 
			
		||||
            GetColorModifier(get_source(VEC0, tev_stage.color_source1), tev_stage.color_modifier1);
 | 
			
		||||
            GetColorModifier(get_source(VEC1, tev_stage.color_source2), tev_stage.color_modifier2);
 | 
			
		||||
            GetColorModifier(get_source(VEC2, tev_stage.color_source3), tev_stage.color_modifier3);
 | 
			
		||||
 | 
			
		||||
            // Combine the texture colors to COLOR_OUTPUT.
 | 
			
		||||
            ColorCombine(COLOR_OUTPUT, tev_stage.color_op);
 | 
			
		||||
 | 
			
		||||
            if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) {
 | 
			
		||||
                // Result of Dot3_RGBA operation is also placed to the alpha component
 | 
			
		||||
                vmovd(ALPHA_OUTPUT.cvt32(), COLOR_OUTPUT);
 | 
			
		||||
            } else {
 | 
			
		||||
                // Load the alpha modifers to VEC0/1/2.
 | 
			
		||||
                GetAlphaModifier(get_source(VEC0, tev_stage.alpha_source1), A0,
 | 
			
		||||
                                 tev_stage.alpha_modifier1);
 | 
			
		||||
                GetAlphaModifier(get_source(VEC1, tev_stage.alpha_source2), A1,
 | 
			
		||||
                                 tev_stage.alpha_modifier2);
 | 
			
		||||
                GetAlphaModifier(get_source(VEC2, tev_stage.alpha_source3), A2,
 | 
			
		||||
                                 tev_stage.alpha_modifier3);
 | 
			
		||||
 | 
			
		||||
                // Combine the alpha values to ALPHA_OUTPUT.
 | 
			
		||||
                AlphaCombine(ALPHA_OUTPUT, tev_stage.alpha_op);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // Load the color multipler to an SSE vector.
 | 
			
		||||
            mov(eax, tev_stage.GetColorMultiplier());
 | 
			
		||||
            movd(VEC0, eax);
 | 
			
		||||
            pshufd(VEC0, VEC0, 0);
 | 
			
		||||
 | 
			
		||||
            // Multiply color output with the multiplier and take the minimum.
 | 
			
		||||
            pmulld(COLOR_OUTPUT, VEC0);
 | 
			
		||||
            pminsd(COLOR_OUTPUT, MAX_COLOR);
 | 
			
		||||
 | 
			
		||||
            // Load the alpha multiplier, multiply it with the alpha output.
 | 
			
		||||
            mov(eax, tev_stage.GetAlphaMultiplier());
 | 
			
		||||
            imul(ALPHA_OUTPUT, eax);
 | 
			
		||||
 | 
			
		||||
            // Load result to a vector and take the minimum
 | 
			
		||||
            movd(VEC0, ALPHA_OUTPUT);
 | 
			
		||||
            pshufd(VEC0, VEC0, 0);
 | 
			
		||||
            pminsd(VEC0, MAX_COLOR);
 | 
			
		||||
 | 
			
		||||
            // Blend vectors to get the combiner output
 | 
			
		||||
            vpblendd(COMBINER_OUTPUT, COLOR_OUTPUT, VEC0, 0b1000);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Set combiner buffer to the next buffer
 | 
			
		||||
        movq(COMBINER_BUFFER, NEXT_COMBINER_BUFFER);
 | 
			
		||||
 | 
			
		||||
        if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(
 | 
			
		||||
                tev_stage_index)) {
 | 
			
		||||
            vpblendd(NEXT_COMBINER_BUFFER, COMBINER_OUTPUT, NEXT_COMBINER_BUFFER, 0b1000);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(
 | 
			
		||||
                tev_stage_index)) {
 | 
			
		||||
            vpblendd(NEXT_COMBINER_BUFFER, COMBINER_OUTPUT, NEXT_COMBINER_BUFFER, 0b0111);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Pack combiner output to a u32 to be returned.
 | 
			
		||||
    vpextrd(edx, COMBINER_OUTPUT, 3);
 | 
			
		||||
    vpextrd(eax, COMBINER_OUTPUT, 2);
 | 
			
		||||
    sal(edx, 8);
 | 
			
		||||
    or_(eax, edx);
 | 
			
		||||
    vpextrd(edx, COMBINER_OUTPUT, 1);
 | 
			
		||||
    sal(eax, 8);
 | 
			
		||||
    or_(edx, eax);
 | 
			
		||||
    vmovd(eax, COMBINER_OUTPUT);
 | 
			
		||||
    sal(edx, 8);
 | 
			
		||||
    or_(eax, edx);
 | 
			
		||||
 | 
			
		||||
    ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
 | 
			
		||||
    ret();
 | 
			
		||||
    ready();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Emits code that applies a color modifier in place to the source held in
/// `dest` (one component per 32-bit lane), then masks every lane with MAX_COLOR.
void TevConfig::GetColorModifier(const Xbyak::Xmm& dest, TevStageConfig::ColorModifier factor) {
    using ColorModifier = TevStageConfig::ColorModifier;

    // Replicates one lane into lanes 0-2. The selector for lane 3 is left at
    // zero, so lane 3 receives the original lane 0.
    const auto splat = [&](u32 lane) {
        const u8 shuffle = static_cast<u8>(lane | (lane << 2) | (lane << 4));
        vpshufd(dest, dest, shuffle);
    };
    // Replaces every lane x with (MAX_COLOR - x).
    const auto one_minus = [&] { vpsubd(dest, MAX_COLOR, dest); };

    switch (factor) {
    case ColorModifier::SourceColor:
        // Keep lanes 0-2 and clear lane 3.
        vpblendd(dest, dest, ZERO, 0b1000);
        break;
    case ColorModifier::OneMinusSourceColor:
        one_minus();
        break;
    case ColorModifier::SourceAlpha:
        splat(3);
        break;
    case ColorModifier::OneMinusSourceAlpha:
        splat(3);
        one_minus();
        break;
    case ColorModifier::SourceRed:
        splat(0);
        break;
    case ColorModifier::OneMinusSourceRed:
        splat(0);
        one_minus();
        break;
    case ColorModifier::SourceGreen:
        splat(1);
        break;
    case ColorModifier::OneMinusSourceGreen:
        splat(1);
        one_minus();
        break;
    case ColorModifier::SourceBlue:
        splat(2);
        break;
    case ColorModifier::OneMinusSourceBlue:
        splat(2);
        one_minus();
        break;
    default:
        UNREACHABLE();
    }

    pand(dest, MAX_COLOR);
}
 | 
			
		||||
 | 
			
		||||
/// Emits code that combines the modified color inputs in VEC0, VEC1 and VEC2
/// according to `op` and leaves the result in `dest`, masked with MAX_COLOR.
///
/// The inputs are clobbered. Division by 255 is approximated by a right shift
/// of 8 throughout; shifts are done per 32-bit lane so that bits never move
/// between color components.
void TevConfig::ColorCombine(const Xbyak::Xmm& dest, TevStageConfig::Operation op) {
    using Operation = TevStageConfig::Operation;

    switch (op) {
    case Operation::Replace:
        vmovdqa(dest, VEC0);
        break;
    case Operation::Modulate:
        pmulld(VEC0, VEC1);
        vpsrld(dest, VEC0, 8); // TODO: This is a very crude approximation of division by 255
        break;
    case Operation::Add:
        vpaddd(VEC0, VEC0, VEC1);
        vpminsd(dest, MAX_COLOR, VEC0);
        break;
    case Operation::AddSigned:
        vpaddd(VEC0, VEC0, VEC1);
        vpsubd(VEC0, VEC0, MID_COLOR);
        vpminsd(VEC0, VEC0, MAX_COLOR);
        vpmaxsd(dest, VEC0, ZERO);
        break;
    case Operation::Lerp:
        // in0 * in2 + in1 * (255 - in2)
        pmulld(VEC0, VEC2);
        vpsubd(VEC2, MAX_COLOR, VEC2); // 255 - in2 (not in2 - 255)
        pmulld(VEC1, VEC2);
        vpaddd(VEC0, VEC0, VEC1);
        vpsrld(dest, VEC0, 8); // TODO: This is a very crude approximation of division by 255
        break;
    case Operation::Subtract:
        psubd(VEC0, VEC1);
        vpmaxsd(dest, VEC0, ZERO);
        break;
    case Operation::MultiplyThenAdd:
        // (in0 * in1 + 255 * in2) / 255, clamped to 255. The clamp has to come
        // after the division; clamping first would always yield zero.
        pmulld(VEC0, VEC1);
        pmulld(VEC2, MAX_COLOR);
        paddd(VEC0, VEC2);
        psrld(VEC0, 8); // TODO: This is a very crude approximation of division by 255
        vpminsd(dest, VEC0, MAX_COLOR);
        break;
    case Operation::AddThenMultiply:
        paddd(VEC0, VEC1);
        pminsd(VEC0, MAX_COLOR);
        pmulld(VEC0, VEC2);
        vpsrld(dest, VEC0, 8); // TODO: This is a very crude approximation of division by 255
        break;
    case Operation::Dot3_RGB:
    case Operation::Dot3_RGBA:
        // Per component: ((2 * in0 - 255) * (2 * in1 - 255) + 128) >> 8
        pslld(VEC0, 1);
        psubd(VEC0, MAX_COLOR);
        pslld(VEC1, 1);
        psubd(VEC1, MAX_COLOR);
        pmulld(VEC0, VEC1);
        paddd(VEC0, MID_COLOR);
        // The products are signed, so the shift must be arithmetic; a logical
        // shift would turn negative terms into large positive values.
        psrad(VEC0, 8);
        // Drop lane 3 so only lanes 0-2 contribute to the sum.
        vpblendd(VEC0, VEC0, ZERO, 0b1000);
        // Two horizontal adds leave the sum of all lanes in every lane.
        phaddd(VEC0, VEC0);
        phaddd(VEC0, VEC0);
        pminsd(VEC0, MAX_COLOR);
        pmaxsd(VEC0, ZERO);
        pshufd(dest, VEC0, 0);
        break;
    default:
        LOG_ERROR(HW_GPU, "Unknown color combiner operation {}", (int)op);
        UNIMPLEMENTED();
    }

    pand(dest, MAX_COLOR);
}
 | 
			
		||||
 | 
			
		||||
/**
 * Emits code that extracts one 32-bit lane of `src` into `dest`, optionally replacing the
 * extracted value v with (255 - v).
 *
 * @param src    Vector register holding the source color, one component per 32-bit lane
 *               (lane 0 = red, 1 = green, 2 = blue, 3 = alpha, as selected by the cases below).
 * @param dest   General purpose register that receives the modified alpha operand.
 * @param factor Modifier selecting which lane is read and whether it is inverted.
 *
 * The inverted variants use eax as scratch, so `dest` must not be eax for them.
 */
void TevConfig::GetAlphaModifier(const Xbyak::Xmm& src, const Xbyak::Reg32& dest,
                                 TevStageConfig::AlphaModifier factor) {
    using AlphaModifier = TevStageConfig::AlphaModifier;

    u32 lane = 0;
    bool invert = false;

    switch (factor) {
    case AlphaModifier::SourceAlpha:
        lane = 3;
        break;
    case AlphaModifier::OneMinusSourceAlpha:
        lane = 3;
        invert = true;
        break;
    case AlphaModifier::SourceRed:
        lane = 0;
        break;
    case AlphaModifier::OneMinusSourceRed:
        lane = 0;
        invert = true;
        break;
    case AlphaModifier::SourceGreen:
        lane = 1;
        break;
    case AlphaModifier::OneMinusSourceGreen:
        lane = 1;
        invert = true;
        break;
    case AlphaModifier::SourceBlue:
        lane = 2;
        break;
    case AlphaModifier::OneMinusSourceBlue:
        lane = 2;
        invert = true;
        break;
    default:
        // Nothing is emitted for an unknown modifier.
        UNREACHABLE();
        return;
    }

    if (!invert) {
        // Plain component: extract straight into the destination.
        vpextrd(dest, src, lane);
        return;
    }

    // Inverted component: extract into the scratch register, then dest = 255 - component.
    vpextrd(eax, src, lane);
    mov(dest, 255);
    sub(dest, eax);
}
 | 
			
		||||
 | 
			
		||||
/**
 * Emits code that combines the alpha operands held in A0, A1 and A2 according to `op` and
 * leaves the result in `dest`.
 *
 * The operand registers A0/A1/A2 and eax are clobbered by the emitted code. `dest` must be
 * distinct from A0 and eax, because several cases overwrite `dest` (or eax) while A0 still
 * holds a live intermediate value.
 *
 * @param dest Register that receives the combined alpha value.
 * @param op   Combiner operation to emit. Unknown operations are logged and hit
 *             UNIMPLEMENTED() without emitting any code.
 */
void TevConfig::AlphaCombine(const Xbyak::Reg32& dest, TevStageConfig::Operation op) {
    using Operation = TevStageConfig::Operation;

    // dst = src / 255, computed as (src * ceil(2^39 / 255)) >> 39.
    // dst is overwritten before src is read, so the two must be different registers.
    // NOTE(review): relies on the upper 32 bits of src's 64-bit register being zero, which
    // holds when src was last written by a 32-bit instruction - confirm for all callers.
    const auto div_255 = [&](const Reg32& dst, const Reg32& src) {
        mov(dst, 0x80808081);
        imul(dst.cvt64(), src.cvt64());
        shr(dst.cvt64(), 39);
    };

    switch (op) {
    case Operation::Replace:
        mov(dest, A0);
        break;
    case Operation::Modulate:
        // dest = A0 * A1 / 255
        imul(A0, A1);
        div_255(dest, A0);
        break;
    case Operation::Add:
        // dest = min(A0 + A1, 255)
        add(A0, A1);
        cmp(A0, 255);
        mov(eax, 255); // mov leaves the flags from cmp intact
        // Clamp only when the sum exceeds 255. The previous cmovb did the opposite and
        // forced every non-overflowing sum up to 255.
        cmovg(A0, eax);
        // Publish the result; it was previously left in A0 only.
        mov(dest, A0);
        break;
    case Operation::AddSigned:
        // dest = clamp(A0 + A1 - 128, 0, 255)
        xor_(eax, eax);
        add(A0, A1);
        sub(A0, 128);
        test(A0, A0);
        cmovg(eax, A0); // eax = max(A0, 0)
        cmp(eax, 255);
        mov(A0, 255);
        cmovb(A0, eax); // A0 = min(eax, 255)
        // Publish the result; it was previously left in A0 only.
        mov(dest, A0);
        break;
    case Operation::Lerp:
        // dest = (A0 * A2 + A1 * (255 - A2)) / 255
        imul(A0, A2);
        mov(eax, 255);
        sub(eax, A2);
        imul(A1, eax);
        add(A0, A1);
        div_255(dest, A0);
        break;
    case Operation::Subtract:
        // dest = max(A0 - A1, 0)
        sub(A0, A1);
        xor_(eax, eax);
        test(A0, A0);
        cmovl(A0, eax);
        mov(dest, A0);
        break;
    case Operation::MultiplyThenAdd:
        // dest = min((A0 * A1 + 255 * A2) / 255, 255)
        imul(A0, A1);
        mov(dest, A2);
        shl(dest, 8);
        sub(dest, A2); // dest = A2 * 256 - A2 = A2 * 255
        add(dest, A0);
        div_255(eax, dest);
        cmp(eax, 255);
        mov(dest, 255);
        cmovb(dest, eax);
        break;
    case Operation::AddThenMultiply:
        // dest = min(A0 + A1, 255) * A2 / 255
        add(A0, A1);
        cmp(A0, 255);
        mov(eax, 255);
        cmovg(A0, eax);
        imul(A0, A2);
        div_255(dest, A0);
        break;
    default:
        LOG_ERROR(HW_GPU, "Unknown alpha combiner operation {}", static_cast<int>(op));
        UNIMPLEMENTED();
    }
}
 | 
			
		||||
 | 
			
		||||
} // namespace SwRenderer
 | 
			
		||||
							
								
								
									
										64
									
								
								src/video_core/renderer_software/sw_tev_jit.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								src/video_core/renderer_software/sw_tev_jit.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,64 @@
 | 
			
		||||
// Copyright 2023 Citra Emulator Project
 | 
			
		||||
// Licensed under GPLv2 or any later version
 | 
			
		||||
// Refer to the license.txt file included.
 | 
			
		||||
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#include <array>
#include <memory>
#include <span>
#include <unordered_map>
#include <xbyak/xbyak.h>

#include "common/hash.h"
#include "common/vector_math.h"
#include "video_core/regs_texturing.h"
 | 
			
		||||
 | 
			
		||||
namespace Pica {
 | 
			
		||||
struct State;
 | 
			
		||||
struct Regs;
 | 
			
		||||
} // namespace Pica
 | 
			
		||||
 | 
			
		||||
namespace SwRenderer {
 | 
			
		||||
 | 
			
		||||
struct TevConfigKey {
 | 
			
		||||
    explicit TevConfigKey(const Pica::TexturingRegs& regs);
 | 
			
		||||
 | 
			
		||||
    u64 Hash() const noexcept {
 | 
			
		||||
        return Common::ComputeHash64(this, sizeof(TevConfigKey));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    std::array<Pica::TexturingRegs::TevStageConfig, 6> stages;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class TevConfig : public Xbyak::CodeGenerator {
 | 
			
		||||
public:
 | 
			
		||||
    explicit TevConfig(const Pica::Regs& regs, const TevConfigKey& key);
 | 
			
		||||
    ~TevConfig();
 | 
			
		||||
 | 
			
		||||
    Common::Vec4<u8> Run(std::span<Common::Vec4<u8>, 4> texture_color_,
 | 
			
		||||
                         Common::Vec4<u8> primary_color_, Common::Vec4<u8> primary_fragment_color_,
 | 
			
		||||
                         Common::Vec4<u8> secondary_fragment_color_, u64 tev_combiner_buffer_color);
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    void WriteTevConfig(const TevConfigKey& key);
 | 
			
		||||
 | 
			
		||||
    void GetColorModifier(const Xbyak::Xmm& dest,
 | 
			
		||||
                          Pica::TexturingRegs::TevStageConfig::ColorModifier factor);
 | 
			
		||||
 | 
			
		||||
    void GetAlphaModifier(const Xbyak::Xmm& src, const Xbyak::Reg32& dest,
 | 
			
		||||
                          Pica::TexturingRegs::TevStageConfig::AlphaModifier factor);
 | 
			
		||||
 | 
			
		||||
    void ColorCombine(const Xbyak::Xmm& dest, Pica::TexturingRegs::TevStageConfig::Operation op);
 | 
			
		||||
 | 
			
		||||
    void AlphaCombine(const Xbyak::Reg32& dest, Pica::TexturingRegs::TevStageConfig::Operation op);
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
    const Pica::Regs& regs;
 | 
			
		||||
 | 
			
		||||
    using CompiledTevFun = u32(u32* texture_color, u32 primary_color, u32 primary_fragment_color,
 | 
			
		||||
                               u64 secondary_fragment_color_and_tev_combiner_buffer_color);
 | 
			
		||||
 | 
			
		||||
    CompiledTevFun* program = nullptr;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Maps a 64-bit key to an owned TevConfig. The hasher is Common::IdentityHash<u64>.
// NOTE(review): presumably the key is already a hash (e.g. TevConfigKey::Hash()) - confirm.
using TevCache = std::unordered_map<u64, std::unique_ptr<TevConfig>, Common::IdentityHash<u64>>;
 | 
			
		||||
 | 
			
		||||
} // namespace SwRenderer
 | 
			
		||||
		Reference in New Issue
	
	Block a user