citra/src/video_core/renderer_software/sw_rasterizer.cpp

// Copyright 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include <boost/container/static_vector.hpp>
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "common/quaternion.h"
#include "common/vector_math.h"
#include "core/memory.h"
#include "video_core/pica_state.h"
#include "video_core/pica_types.h"
#include "video_core/renderer_software/sw_framebuffer.h"
#include "video_core/renderer_software/sw_lighting.h"
#include "video_core/renderer_software/sw_proctex.h"
#include "video_core/renderer_software/sw_rasterizer.h"
#include "video_core/renderer_software/sw_texturing.h"
#include "video_core/shader/shader.h"
#include "video_core/texture/texture_decode.h"

namespace SwRenderer {

using Pica::f24;
using Pica::FramebufferRegs;
using Pica::RasterizerRegs;
using Pica::TexturingRegs;
using Pica::Texture::LookupTexture;
using Pica::Texture::TextureInfo;

// Certain games render 2D elements very close to clip plane 0. When computed with f32
// precision, this results in very tiny negative/positive z values, which causes some
// vertices to be clipped erroneously. To work around this problem, we use a very small
// epsilon value as the tolerance for clip plane comparisons.
constexpr f32 EPSILON_Z = 0.00000001f;

struct Vertex : Pica::Shader::OutputVertex {
    Vertex(const OutputVertex& v) : OutputVertex(v) {}

    /// Position after the perspective divide. Neither the constructor nor Lerp touches it.
    Common::Vec3<f24> screenpos;

    /**
     * In-place linear interpolation of all shader output attributes.
     * factor is the weight kept by this vertex, vtx is weighted by (1 - factor):
     * 1=this, 0=vtx
     * Note: This function cannot be called after perspective divide.
     **/
    void Lerp(f24 factor, const Vertex& vtx) {
        const f24 other_weight = f24::One() - factor;
        const auto blend = [&](auto& mine, const auto& theirs) {
            mine = mine * factor + theirs * other_weight;
        };

        blend(pos, vtx.pos);
        blend(quat, vtx.quat);
        blend(color, vtx.color);
        blend(tc0, vtx.tc0);
        blend(tc1, vtx.tc1);
        blend(tc0_w, vtx.tc0_w);
        blend(view, vtx.view);
        blend(tc2, vtx.tc2);
    }

    /**
     * Linear interpolation returning a new vertex.
     * factor is the weight of v0, v1 is weighted by (1 - factor):
     * 1=v0, 0=v1
     * Note: This function cannot be called after perspective divide.
     **/
    static Vertex Lerp(f24 factor, const Vertex& v0, const Vertex& v1) {
        Vertex blended(v0);
        blended.Lerp(factor, v1);
        return blended;
    }
};

namespace {

// Profiling token ("GPU" / "Rasterization") used by the MICROPROFILE_SCOPE in ProcessTriangle.
MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));

struct ClippingEdge {
public:
    constexpr ClippingEdge(Common::Vec4<f24> coeffs,
                           Common::Vec4<f24> bias = Common::Vec4<f24>(f24::Zero(), f24::Zero(),
                                                                      f24::Zero(), f24::Zero()))
        : pos(f24::Zero()), coeffs(coeffs), bias(bias) {}

    bool IsInside(const Vertex& vertex) const {
        return Common::Dot(vertex.pos + bias, coeffs) >= f24::FromFloat32(-EPSILON_Z);
    }

    bool IsOutSide(const Vertex& vertex) const {
        return !IsInside(vertex);
    }

    Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const {
        const f24 dp = Common::Dot(v0.pos + bias, coeffs);
        const f24 dp_prev = Common::Dot(v1.pos + bias, coeffs);
        const f24 factor = dp_prev / (dp_prev - dp);
        return Vertex::Lerp(factor, v0, v1);
    }

private:
    [[maybe_unused]] f24 pos;
    Common::Vec4<f24> coeffs;
    Common::Vec4<f24> bias;
};

} // Anonymous namespace

/**
 * Binds the rasterizer to the given memory system and to the global PICA state, and creates the
 * worker pool that ProcessTriangle queues scanlines on.
 * The worker count is the reported hardware concurrency, but never fewer than 2
 * (std::thread::hardware_concurrency may return 0 when the value is not computable).
 * @param memory_ Memory system used for texture and framebuffer accesses
 **/
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
    : memory{memory_}, state{Pica::g_state}, regs{state.regs},
      num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
      sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}

/**
 * Clips a triangle against the fixed clipping edges (and the user clip plane when enabled),
 * converts the surviving polygon to screen coordinates and rasterizes it as a triangle fan
 * around its first vertex.
 * @param v0 First shader output vertex of the triangle
 * @param v1 Second shader output vertex of the triangle
 * @param v2 Third shader output vertex of the triangle
 **/
void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0,
                                     const Pica::Shader::OutputVertex& v1,
                                     const Pica::Shader::OutputVertex& v2) {
    /**
     * Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at
     * the new edge (or less in degenerate cases). As such, we can say that each clipping pass
     * introduces at most 1 new vertex to the polygon. We start with a triangle and run one pass
     * per fixed clipping edge plus at most one pass for the user-defined clip plane, so the
     * clipped polygon has at most 3 + NUM_FIXED_EDGES + 1 vertices. The capacity must cover
     * every pass, otherwise push_back would exceed the static_vector capacity.
     **/
    static constexpr std::size_t NUM_FIXED_EDGES = 7;
    static constexpr std::size_t MAX_VERTICES = 3 + NUM_FIXED_EDGES + 1;

    boost::container::static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
    boost::container::static_vector<Vertex, MAX_VERTICES> buffer_b;

    FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
    FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);

    // The two buffers are swapped before every clipping pass; buffer_a holds the initial output.
    auto* output_list = &buffer_a;
    auto* input_list = &buffer_b;

    // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
    // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
    //       epsilon possible within f24 accuracy.
    // NOTE(review): ClippingEdge::IsInside evaluates dot(pos + bias, coeffs) >= -EPSILON_Z, so
    //       with this bias the last edge accepts w >= -(EPSILON + EPSILON_Z) - confirm that this
    //       is the intended plane.
    static constexpr f24 EPSILON = f24::FromFloat32(0.00001f);
    static constexpr f24 f0 = f24::Zero();
    static constexpr f24 f1 = f24::One();
    static constexpr std::array<ClippingEdge, NUM_FIXED_EDGES> clipping_edges = {{
        {Common::MakeVec(-f1, f0, f0, f1)},                                        // x = +w
        {Common::MakeVec(f1, f0, f0, f1)},                                         // x = -w
        {Common::MakeVec(f0, -f1, f0, f1)},                                        // y = +w
        {Common::MakeVec(f0, f1, f0, f1)},                                         // y = -w
        {Common::MakeVec(f0, f0, -f1, f0)},                                        // z =  0
        {Common::MakeVec(f0, f0, f1, f1)},                                         // z = -w
        {Common::MakeVec(f0, f0, f0, f1), Common::Vec4<f24>(f0, f0, f0, EPSILON)}, // w = EPSILON
    }};

    // Simple implementation of the Sutherland-Hodgman clipping algorithm.
    // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
    const auto clip = [&](const ClippingEdge& edge) {
        std::swap(input_list, output_list);
        output_list->clear();

        // Every edge of the polygon is visited as (reference_vertex -> vertex), starting with
        // the closing edge from the last vertex back to the first.
        const Vertex* reference_vertex = &input_list->back();
        for (const auto& vertex : *input_list) {
            // NOTE: This algorithm changes vertex order in some cases!
            if (edge.IsInside(vertex)) {
                if (edge.IsOutSide(*reference_vertex)) {
                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
                }
                output_list->push_back(vertex);
            } else if (edge.IsInside(*reference_vertex)) {
                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
            }
            reference_vertex = &vertex;
        }
    };

    for (const ClippingEdge& edge : clipping_edges) {
        clip(edge);
        // Fewer than three vertices left: the triangle was clipped away entirely.
        if (output_list->size() < 3) {
            return;
        }
    }

    if (state.regs.rasterizer.clip_enable) {
        const ClippingEdge custom_edge{state.regs.rasterizer.GetClipCoef()};
        clip(custom_edge);
        if (output_list->size() < 3) {
            return;
        }
    }

    // The first two fan vertices are converted up front; each loop iteration converts the one
    // vertex that is new to it, so every vertex goes through the perspective divide once.
    MakeScreenCoords((*output_list)[0]);
    MakeScreenCoords((*output_list)[1]);

    for (std::size_t i = 0; i < output_list->size() - 2; i++) {
        Vertex& vtx0 = (*output_list)[0];
        Vertex& vtx1 = (*output_list)[i + 1];
        Vertex& vtx2 = (*output_list)[i + 2];

        MakeScreenCoords(vtx2);

        LOG_TRACE(
            Render_Software,
            "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), "
            "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and "
            "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})",
            i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(),
            vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(),
            vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
            vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(),
            vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(),
            vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(),
            vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(),
            vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(),
            vtx2.screenpos.z.ToFloat32());

        ProcessTriangle(vtx0, vtx1, vtx2);
    }
}

/**
 * Performs the perspective divide and viewport transform on a clipped vertex, in place.
 * Afterwards pos.w holds 1/w, every interpolated attribute is pre-multiplied by 1/w and
 * screenpos holds the viewport-space x/y together with z/w.
 * @param vtx Vertex to convert; must not have been converted before
 **/
void RasterizerSoftware::MakeScreenCoords(Vertex& vtx) {
    // Viewport parameters as programmed in the rasterizer registers.
    const f24 half_width = f24::FromRaw(regs.rasterizer.viewport_size_x);
    const f24 half_height = f24::FromRaw(regs.rasterizer.viewport_size_y);
    const f24 corner_x = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.x));
    const f24 corner_y = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.y));

    const f24 reciprocal_w = f24::One() / vtx.pos.w;

    // Map normalized device coordinates [-1, 1] onto the viewport rectangle.
    vtx.screenpos.x = (vtx.pos.x * reciprocal_w + f24::One()) * half_width + corner_x;
    vtx.screenpos.y = (vtx.pos.y * reciprocal_w + f24::One()) * half_height + corner_y;
    vtx.screenpos.z = vtx.pos.z * reciprocal_w;

    // Store 1/w and attribute/w so that the rasterizer can interpolate them linearly.
    vtx.pos.w = reciprocal_w;
    vtx.quat *= reciprocal_w;
    vtx.color *= reciprocal_w;
    vtx.tc0 *= reciprocal_w;
    vtx.tc1 *= reciprocal_w;
    vtx.tc0_w *= reciprocal_w;
    vtx.view *= reciprocal_w;
    vtx.tc2 *= reciprocal_w;
}

void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2,
                                         bool reversed) {
    MICROPROFILE_SCOPE(GPU_Rasterization);

    // Vertex positions in rasterizer coordinates
    static auto screen_to_rasterizer_coords = [](const Common::Vec3<f24>& vec) {
        return Common::Vec3{Fix12P4::FromFloat24(vec.x), Fix12P4::FromFloat24(vec.y),
                            Fix12P4::FromFloat24(vec.z)};
    };

    const std::array<Common::Vec3<Fix12P4>, 3> vtxpos = {
        screen_to_rasterizer_coords(v0.screenpos),
        screen_to_rasterizer_coords(v1.screenpos),
        screen_to_rasterizer_coords(v2.screenpos),
    };

    if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) {
        // Make sure we always end up with a triangle wound counter-clockwise
        if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
            ProcessTriangle(v0, v2, v1, true);
            return;
        }
    } else {
        if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) {
            // Reverse vertex order and use the CCW code path.
            ProcessTriangle(v0, v2, v1, true);
            return;
        }
        // Cull away triangles which are wound clockwise.
        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
            return;
        }
    }

    u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
    u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
    u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
    u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});

    // Convert the scissor box coordinates to 12.4 fixed point
    const u16 scissor_x1 = static_cast<u16>(regs.rasterizer.scissor_test.x1 << 4);
    const u16 scissor_y1 = static_cast<u16>(regs.rasterizer.scissor_test.y1 << 4);
    // x2,y2 have +1 added to cover the entire sub-pixel area
    const u16 scissor_x2 = static_cast<u16>((regs.rasterizer.scissor_test.x2 + 1) << 4);
    const u16 scissor_y2 = static_cast<u16>((regs.rasterizer.scissor_test.y2 + 1) << 4);

    if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) {
        // Calculate the new bounds
        min_x = std::max(min_x, scissor_x1);
        min_y = std::max(min_y, scissor_y1);
        max_x = std::min(max_x, scissor_x2);
        max_y = std::min(max_y, scissor_y2);
    }

    min_x &= Fix12P4::IntMask();
    min_y &= Fix12P4::IntMask();
    max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask());
    max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask());

    const int bias0 =
        IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
    const int bias1 =
        IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
    const int bias2 =
        IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;

    const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);

    const auto textures = regs.texturing.GetTextures();
    const auto tev_stages = regs.texturing.GetTevStages();

    fb.Bind();

    // Enter rasterization loop, starting at the center of the topleft bounding box corner.
    // TODO: Not sure if looping through x first might be faster
    for (u16 y = min_y + 8; y < max_y; y += 0x10) {
        const auto process_scanline = [&, y] {
            for (u16 x = min_x + 8; x < max_x; x += 0x10) {
                // Do not process the pixel if it's inside the scissor box and the scissor mode is
                // set to Exclude.
                if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
                    if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
                        continue;
                    }
                }

                // Calculate the barycentric coordinates w0, w1 and w2
                const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
                const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
                const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
                const s32 wsum = w0 + w1 + w2;

                // If current pixel is not covered by the current primitive
                if (w0 < 0 || w1 < 0 || w2 < 0) {
                    continue;
                }

                const auto baricentric_coordinates = Common::MakeVec(
                    f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
                    f24::FromFloat32(static_cast<f32>(w2)));
                const f24 interpolated_w_inverse =
                    f24::One() / Common::Dot(w_inverse, baricentric_coordinates);

                // interpolated_z = z / w
                const float interpolated_z_over_w =
                    (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
                     v2.screenpos[2].ToFloat32() * w2) /
                    wsum;

                // Not fully accurate. About 3 bits in precision are missing.
                // Z-Buffer (z / w * scale + offset)
                const float depth_scale =
                    f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
                const float depth_offset =
                    f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
                float depth = interpolated_z_over_w * depth_scale + depth_offset;

                // Potentially switch to W-Buffer
                if (regs.rasterizer.depthmap_enable ==
                    Pica::RasterizerRegs::DepthBuffering::WBuffering) {
                    // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
                    depth *= interpolated_w_inverse.ToFloat32() * wsum;
                }

                // Clamp the result
                depth = std::clamp(depth, 0.0f, 1.0f);

                /**
                 * Perspective correct attribute interpolation:
                 * Attribute values cannot be calculated by simple linear interpolation since
                 * they are not linear in screen space. For example, when interpolating a
                 * texture coordinate across two vertices, something simple like
                 *     u = (u0*w0 + u1*w1)/(w0+w1)
                 * will not work. However, the attribute value divided by the
                 * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear
                 * in screenspace. Hence, we can linearly interpolate these two independently and
                 * calculate the interpolated attribute by dividing the results.
                 * I.e.
                 *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
                 *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
                 *     u = u_over_w / one_over_w
                 *
                 * The generalization to three vertices is straightforward in baricentric
                 *coordinates.
                 **/
                const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
                    auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
                    f24 interpolated_attr_over_w =
                        Common::Dot(attr_over_w, baricentric_coordinates);
                    return interpolated_attr_over_w * interpolated_w_inverse;
                };

                const Common::Vec4<u8> primary_color{
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
                                  .ToFloat32() *
                              255)),
                };

                std::array<Common::Vec2<f24>, 3> uv;
                uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
                uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
                uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
                uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
                uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
                uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());

                // Sample bound texture units.
                const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
                const auto texture_color = TextureColor(uv, textures, tc0_w);

                Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
                Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};

                if (!regs.lighting.disable) {
                    const auto normquat =
                        Common::Quaternion<f32>{
                            {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
                                 .ToFloat32(),
                             get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
                                 .ToFloat32(),
                             get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
                                 .ToFloat32()},
                            get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
                        }
                            .Normalized();

                    const Common::Vec3f view{
                        get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
                        get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
                        get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
                    };
                    std::tie(primary_fragment_color, secondary_fragment_color) =
                        ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view,
                                               texture_color);
                }

                // Write the TEV stages.
                auto combiner_output =
                    WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
                                   secondary_fragment_color);

                const auto& output_merger = regs.framebuffer.output_merger;
                if (output_merger.fragment_operation_mode ==
                    FramebufferRegs::FragmentOperationMode::Shadow) {
                    const u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
                    // Use green color as the shadow intensity
                    const u8 stencil = combiner_output.y;
                    fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
                    // Skip the normal output merger pipeline if it is in shadow mode
                    continue;
                }

                // Does alpha testing happen before or after stencil?
                if (!DoAlphaTest(combiner_output.a())) {
                    continue;
                }
                WriteFog(depth, combiner_output);
                if (!DoDepthStencilTest(x, y, depth)) {
                    continue;
                }
                const auto result = PixelColor(x, y, combiner_output);
                if (regs.framebuffer.framebuffer.allow_color_write != 0) {
                    fb.DrawPixel(x >> 4, y >> 4, result);
                }
            }
        };
        sw_workers.QueueWork(std::move(process_scanline));
    }
    sw_workers.WaitForRequests();
}

/**
 * Samples the three regular texture units and the procedural texture for one fragment.
 * @param uv Interpolated texture coordinates of the three coordinate sets
 * @param textures Configuration of the three regular texture units
 * @param tc0_w Interpolated w component of texture coordinate 0; used by unit 0 for the
 *              cube, projection and shadow texture types
 * @return Colors of units 0-2 followed by the procedural texture color; units that are not
 *         sampled keep a value-initialized color
 **/
std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
    std::span<const Common::Vec2<f24>, 3> uv,
    std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const {
    using TextureConfig = TexturingRegs::TextureConfig;

    // Tells whether a texel coordinate falls into the border region of the given wrap mode.
    const auto hits_border = [](const auto& wrap_mode, s32 coord, s32 extent) {
        if (wrap_mode == TextureConfig::ClampToBorder) {
            return coord < 0 || coord >= extent;
        }
        if (wrap_mode == TextureConfig::ClampToBorder2) {
            return coord >= extent;
        }
        return false;
    };

    std::array<Common::Vec4<u8>, 4> sampled{};
    for (u32 unit = 0; unit < 3; ++unit) {
        const auto& texture = textures[unit];
        if (!texture.enabled) [[unlikely]] {
            continue;
        }
        if (texture.config.address == 0) [[unlikely]] {
            sampled[unit] = {0, 0, 0, 255};
            continue;
        }

        // Unit 2 can be configured to read coordinate set 1 instead of its own.
        const bool reuse_coord1 = unit == 2 && regs.texturing.main_config.texture2_use_coord1;
        const s32 coord_index = reuse_coord1 ? 1 : unit;
        f24 u = uv[coord_index].u();
        f24 v = uv[coord_index].v();

        // Only unit 0 respects the texturing type (according to 3DBrew)
        PAddr texture_address = texture.config.GetPhysicalAddress();
        f24 shadow_z;
        if (unit == 0) {
            switch (texture.config.type) {
            case TextureConfig::Texture2D:
                break;
            case TextureConfig::ShadowCube:
            case TextureConfig::TextureCube: {
                std::tie(u, v, shadow_z, texture_address) =
                    ConvertCubeCoord(u, v, tc0_w, regs.texturing);
                break;
            }
            case TextureConfig::Projection2D: {
                u /= tc0_w;
                v /= tc0_w;
                break;
            }
            case TextureConfig::Shadow2D: {
                if (!regs.texturing.shadow.orthographic) {
                    u /= tc0_w;
                    v /= tc0_w;
                }
                shadow_z = f24::FromFloat32(std::abs(tc0_w.ToFloat32()));
                break;
            }
            case TextureConfig::Disabled:
                continue; // skip this unit and continue to the next unit
            default:
                LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type);
                UNIMPLEMENTED();
                break;
            }
        }

        // Scale the normalized coordinates to integer texel coordinates.
        const f24 width = f24::FromFloat32(static_cast<f32>(texture.config.width));
        const f24 height = f24::FromFloat32(static_cast<f32>(texture.config.height));
        s32 s = static_cast<s32>((u * width).ToFloat32());
        s32 t = static_cast<s32>((v * height).ToFloat32());

        const bool border_s =
            hits_border(texture.config.wrap_s, s, static_cast<s32>(texture.config.width));
        const bool border_t =
            hits_border(texture.config.wrap_t, t, static_cast<s32>(texture.config.height));

        if (border_s || border_t) {
            const auto border_color = texture.config.border_color;
            sampled[unit] = Common::MakeVec(border_color.r.Value(), border_color.g.Value(),
                                            border_color.b.Value(), border_color.a.Value())
                                .Cast<u8>();
        } else {
            // Textures are laid out from bottom to top, hence we invert the t coordinate.
            // NOTE: This may not be the right place for the inversion.
            // TODO: Check if this applies to ETC textures, too.
            s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
            t = texture.config.height - 1 -
                GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);

            const u8* texture_data = memory.GetPhysicalPointer(texture_address);
            const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format);

            // TODO: Apply the min and mag filters to the texture
            sampled[unit] = LookupTexture(texture_data, s, t, info);
        }

        const bool is_shadow_lookup = unit == 0 &&
                                      (texture.config.type == TextureConfig::Shadow2D ||
                                       texture.config.type == TextureConfig::ShadowCube);
        if (is_shadow_lookup) {
            // Compare the biased 24-bit fragment depth with the depth stored in the texel
            // (w, z, y components); the x component is the density returned when it passes.
            s32 z_int = static_cast<s32>(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF);
            z_int -= regs.texturing.shadow.bias << 1;
            const auto& texel = sampled[unit];
            const s32 z_ref = (texel.w << 16) | (texel.z << 8) | texel.y;
            const u8 density = (z_ref >= z_int) ? texel.x : 0;
            sampled[unit] = {density, density, density, density};
        }
    }

    // Sample procedural texture
    if (regs.texturing.main_config.texture3_enable) {
        const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates];
        sampled[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(),
                             regs.texturing, state.proctex);
    }

    return sampled;
}

/**
 * Combines a fragment color with the pixel already stored in the framebuffer.
 * Alpha blending is used when it is enabled, the logic op otherwise; afterwards the
 * per-channel write enables decide which channels keep the destination value.
 * @param x Pixel x position in 12.4 fixed point
 * @param y Pixel y position in 12.4 fixed point
 * @param combiner_output Fragment color to merge
 * @return Color to store in the framebuffer
 **/
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
                                                Common::Vec4<u8> combiner_output) const {
    const auto& output_merger = regs.framebuffer.output_merger;
    const auto dest = fb.GetPixel(x >> 4, y >> 4);

    Common::Vec4<u8> blend_output = combiner_output;
    if (output_merger.alphablend_enable) {
        using BlendFactor = FramebufferRegs::BlendFactor;

        const auto params = output_merger.alpha_blending;
        const Common::Vec4<u8> blend_const =
            Common::MakeVec(output_merger.blend_const.r.Value(),
                            output_merger.blend_const.g.Value(),
                            output_merger.blend_const.b.Value(),
                            output_merger.blend_const.a.Value())
                .Cast<u8>();

        // Resolves a blend factor to its 8-bit weight for a single channel.
        const auto lookup_factor = [&](u32 channel, BlendFactor factor) -> u8 {
            DEBUG_ASSERT(channel < 4);

            switch (factor) {
            case BlendFactor::Zero:
                return 0;
            case BlendFactor::One:
                return 255;
            case BlendFactor::SourceColor:
                return combiner_output[channel];
            case BlendFactor::OneMinusSourceColor:
                return 255 - combiner_output[channel];
            case BlendFactor::DestColor:
                return dest[channel];
            case BlendFactor::OneMinusDestColor:
                return 255 - dest[channel];
            case BlendFactor::SourceAlpha:
                return combiner_output.a();
            case BlendFactor::OneMinusSourceAlpha:
                return 255 - combiner_output.a();
            case BlendFactor::DestAlpha:
                return dest.a();
            case BlendFactor::OneMinusDestAlpha:
                return 255 - dest.a();
            case BlendFactor::ConstantColor:
                return blend_const[channel];
            case BlendFactor::OneMinusConstantColor:
                return 255 - blend_const[channel];
            case BlendFactor::ConstantAlpha:
                return blend_const.a();
            case BlendFactor::OneMinusConstantAlpha:
                return 255 - blend_const.a();
            case BlendFactor::SourceAlphaSaturate:
                // Returns 1.0 for the alpha channel
                if (channel == 3) {
                    return 255;
                }
                return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a()));
            default:
                LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor);
                UNIMPLEMENTED();
                break;
            }
            // Fallback for unknown factors
            return combiner_output[channel];
        };

        // The RGB channels share one factor setting, alpha has its own.
        const auto make_factors = [&](BlendFactor rgb, BlendFactor alpha) {
            return Common::MakeVec(lookup_factor(0, rgb), lookup_factor(1, rgb),
                                   lookup_factor(2, rgb), lookup_factor(3, alpha));
        };
        const auto srcfactor = make_factors(params.factor_source_rgb, params.factor_source_a);
        const auto dstfactor = make_factors(params.factor_dest_rgb, params.factor_dest_a);

        // Evaluate the RGB equation first, then overwrite alpha with the alpha equation.
        blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor,
                                             params.blend_equation_rgb);
        blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor,
                                                 params.blend_equation_a)
                               .a();
    } else {
        const auto apply_logic_op = [&](u8 src, u8 dst) {
            return LogicOp(src, dst, output_merger.logic_op);
        };
        blend_output = Common::MakeVec(apply_logic_op(combiner_output.r(), dest.r()),
                                       apply_logic_op(combiner_output.g(), dest.g()),
                                       apply_logic_op(combiner_output.b(), dest.b()),
                                       apply_logic_op(combiner_output.a(), dest.a()));
    }

    // Channels with a disabled write mask keep the destination value.
    return {
        output_merger.red_enable ? blend_output.r() : dest.r(),
        output_merger.green_enable ? blend_output.g() : dest.g(),
        output_merger.blue_enable ? blend_output.b() : dest.b(),
        output_merger.alpha_enable ? blend_output.a() : dest.a(),
    };
}

/**
 * Runs the texture environment: a chain of combiner stages executed in order.
 *
 * Each stage picks three color operands and three alpha operands from a configurable
 * source, applies a per-operand modifier, merges them with the stage operation and
 * scales the result by the stage multipliers, saturating at 255.
 *
 * @param texture_color Colors returned for the Texture0..Texture3 sources.
 * @param tev_stages Stage configurations, processed from first to last.
 * @param primary_color Color returned for the PrimaryColor source; it is also the value
 *                      the Previous source yields in the first stage.
 * @param primary_fragment_color Color returned for the PrimaryFragmentColor source.
 * @param secondary_fragment_color Color returned for the SecondaryFragmentColor source.
 * @return The output of the last stage.
 **/
Common::Vec4<u8> RasterizerSoftware::WriteTevConfig(
    std::span<const Common::Vec4<u8>, 4> texture_color,
    std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages,
    Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color,
    Common::Vec4<u8> secondary_fragment_color) {
    using TevStageConfig = TexturingRegs::TevStageConfig;
    using Source = TevStageConfig::Source;

    const auto& initial_buffer_color = regs.texturing.tev_combiner_buffer_color;

    // Result of the most recently executed stage (Source::Previous).
    Common::Vec4<u8> stage_output = primary_color;
    // Value currently readable through Source::PreviousBuffer.
    Common::Vec4<u8> visible_buffer = {0, 0, 0, 0};
    // Value that replaces visible_buffer once the running stage has finished.
    Common::Vec4<u8> pending_buffer =
        Common::MakeVec(initial_buffer_color.r.Value(), initial_buffer_color.g.Value(),
                        initial_buffer_color.b.Value(), initial_buffer_color.a.Value())
            .Cast<u8>();

    for (u32 stage_idx = 0; stage_idx < tev_stages.size(); ++stage_idx) {
        const auto& stage = tev_stages[stage_idx];

        // Resolves a combiner source selector to the RGBA value it currently refers to.
        const auto fetch_source = [&](Source source) -> Common::Vec4<u8> {
            switch (source) {
            case Source::Previous:
                return stage_output;
            case Source::PreviousBuffer:
                return visible_buffer;
            case Source::Constant:
                return Common::MakeVec(stage.const_r.Value(), stage.const_g.Value(),
                                       stage.const_b.Value(), stage.const_a.Value())
                    .Cast<u8>();
            case Source::PrimaryColor:
                return primary_color;
            case Source::PrimaryFragmentColor:
                return primary_fragment_color;
            case Source::SecondaryFragmentColor:
                return secondary_fragment_color;
            case Source::Texture0:
                return texture_color[0];
            case Source::Texture1:
                return texture_color[1];
            case Source::Texture2:
                return texture_color[2];
            case Source::Texture3:
                return texture_color[3];
            default:
                LOG_ERROR(HW_GPU, "Unknown color combiner source {}", static_cast<int>(source));
                UNIMPLEMENTED();
                return {0, 0, 0, 0};
            }
        };

        // Color combiner. The result is kept in a temporary instead of being written to
        // stage_output right away, because it is unclear whether the alpha combiner may
        // consume the previous stage's color; stage_output therefore stays untouched
        // until alpha combining is done.
        const std::array<Common::Vec3<u8>, 3> color_args = {
            GetColorModifier(stage.color_modifier1, fetch_source(stage.color_source1)),
            GetColorModifier(stage.color_modifier2, fetch_source(stage.color_source2)),
            GetColorModifier(stage.color_modifier3, fetch_source(stage.color_source3)),
        };
        const Common::Vec3<u8> color_output = ColorCombine(stage.color_op, color_args);

        const u8 alpha_output = [&]() -> u8 {
            if (stage.color_op == TevStageConfig::Operation::Dot3_RGBA) {
                // Dot3_RGBA also places its result in the alpha component, so the alpha
                // combiner is skipped entirely.
                return color_output.x;
            }
            const std::array<u8, 3> alpha_args = {{
                GetAlphaModifier(stage.alpha_modifier1, fetch_source(stage.alpha_source1)),
                GetAlphaModifier(stage.alpha_modifier2, fetch_source(stage.alpha_source2)),
                GetAlphaModifier(stage.alpha_modifier3, fetch_source(stage.alpha_source3)),
            }};
            return AlphaCombine(stage.alpha_op, alpha_args);
        }();

        // Scale by the stage multipliers and saturate to the 8-bit range.
        const auto color_scale = stage.GetColorMultiplier();
        const auto alpha_scale = stage.GetAlphaMultiplier();
        stage_output[0] = std::min(255U, color_output.r() * color_scale);
        stage_output[1] = std::min(255U, color_output.g() * color_scale);
        stage_output[2] = std::min(255U, color_output.b() * color_scale);
        stage_output[3] = std::min(255U, alpha_output * alpha_scale);

        // Expose the pending buffer first, then let this stage's output overwrite the
        // pending buffer for the channels the stage is configured to update.
        visible_buffer = pending_buffer;

        if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(
                stage_idx)) {
            pending_buffer.r() = stage_output.r();
            pending_buffer.g() = stage_output.g();
            pending_buffer.b() = stage_output.b();
        }

        if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(
                stage_idx)) {
            pending_buffer.a() = stage_output.a();
        }
    }

    return stage_output;
}

void RasterizerSoftware::WriteFog(float depth, Common::Vec4<u8>& combiner_output) const {
    /**
     * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
     * store the depth etc. Using float for now until we know more about Pica datatypes.
     **/
    if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) {
        const Common::Vec3<u8> fog_color =
            Common::MakeVec(regs.texturing.fog_color.r.Value(), regs.texturing.fog_color.g.Value(),
                            regs.texturing.fog_color.b.Value())
                .Cast<u8>();

        float fog_index;
        if (regs.texturing.fog_flip) {
            fog_index = (1.0f - depth) * 128.0f;
        } else {
            fog_index = depth * 128.0f;
        }

        // Generate clamped fog factor from LUT for given fog index
        const f32 fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f);
        const f32 fog_f = fog_index - fog_i;
        const auto& fog_lut_entry = state.fog.lut[static_cast<u32>(fog_i)];
        f32 fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f;
        fog_factor = std::clamp(fog_factor, 0.0f, 1.0f);
        for (u32 i = 0; i < 3; i++) {
            combiner_output[i] = static_cast<u8>(fog_factor * combiner_output[i] +
                                                 (1.0f - fog_factor) * fog_color[i]);
        }
    }
}

/**
 * Evaluates the alpha test configured in the output merger registers.
 *
 * @param alpha Alpha value to compare against the configured reference value.
 * @return true when the test is disabled or the comparison succeeds; false when the
 *         comparison fails or the compare function is not recognized.
 **/
bool RasterizerSoftware::DoAlphaTest(u8 alpha) const {
    using CompareFunc = FramebufferRegs::CompareFunc;
    const auto& alpha_test = regs.framebuffer.output_merger.alpha_test;

    // A disabled test lets every fragment through.
    if (!alpha_test.enable) {
        return true;
    }

    const u32 reference = alpha_test.ref;
    switch (alpha_test.func) {
    case CompareFunc::Always:
        return true;
    case CompareFunc::Never:
        return false;
    case CompareFunc::Equal:
        return alpha == reference;
    case CompareFunc::NotEqual:
        return alpha != reference;
    case CompareFunc::LessThan:
        return alpha < reference;
    case CompareFunc::LessThanOrEqual:
        return alpha <= reference;
    case CompareFunc::GreaterThan:
        return alpha > reference;
    case CompareFunc::GreaterThanOrEqual:
        return alpha >= reference;
    default:
        // Unrecognized compare functions are reported and treated as a failed test.
        LOG_CRITICAL(Render_Software, "Unknown alpha test condition {}",
                     alpha_test.func.Value());
        return false;
    }
}

/**
 * Runs the stencil test followed by the depth test for one pixel and performs the
 * resulting stencil and depth buffer updates.
 *
 * The stencil stage is only active when the stencil test is enabled and the depth
 * format is D24S8. Buffer writes additionally require allow_depth_stencil_write to be
 * non-zero.
 *
 * @param x Horizontal coordinate; shifted right by 4 before the framebuffer is accessed.
 * @param y Vertical coordinate; shifted right by 4 before the framebuffer is accessed.
 * @param depth Depth value, scaled to the integer range of the depth format.
 * @return true when the pixel passes both tests, false otherwise.
 **/
bool RasterizerSoftware::DoDepthStencilTest(u16 x, u16 y, float depth) const {
    using CompareFunc = FramebufferRegs::CompareFunc;

    const auto& fb_config = regs.framebuffer.framebuffer;
    const auto& merger = regs.framebuffer.output_merger;
    const auto stencil_test = regs.framebuffer.output_merger.stencil_test;

    // NOTE(review): the incoming coordinates presumably carry 4 fractional bits -
    // confirm against the caller.
    const int px = x >> 4;
    const int py = y >> 4;

    // Evaluates "lhs <func> rhs"; a value matching none of the enumerators fails.
    const auto compare = [](CompareFunc func, u32 lhs, u32 rhs) -> bool {
        switch (func) {
        case CompareFunc::Never:
            return false;
        case CompareFunc::Always:
            return true;
        case CompareFunc::Equal:
            return lhs == rhs;
        case CompareFunc::NotEqual:
            return lhs != rhs;
        case CompareFunc::LessThan:
            return lhs < rhs;
        case CompareFunc::LessThanOrEqual:
            return lhs <= rhs;
        case CompareFunc::GreaterThan:
            return lhs > rhs;
        case CompareFunc::GreaterThanOrEqual:
            return lhs >= rhs;
        }
        return false;
    };

    const bool stencil_active =
        stencil_test.enable && fb_config.depth_format == FramebufferRegs::DepthFormat::D24S8;

    // The stencil buffer is only read while the stencil stage is active; stencil actions
    // are never applied otherwise, so the fallback value of 0 is never consumed.
    const u8 old_stencil = stencil_active ? fb.GetStencil(px, py) : 0;

    // Applies a stencil action and stores the result, honoring the stencil write mask.
    const auto apply_stencil_action = [&](Pica::FramebufferRegs::StencilAction action) {
        const u8 updated =
            PerformStencilAction(action, old_stencil, stencil_test.reference_value);
        if (fb_config.allow_depth_stencil_write != 0) {
            // Bits outside the write mask keep their previous value.
            const u8 merged =
                (updated & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask);
            fb.SetStencil(px, py, merged);
        }
    };

    if (stencil_active) {
        // Both operands are masked before comparing; the reference is the left operand.
        const u8 masked_dest = old_stencil & stencil_test.input_mask;
        const u8 masked_ref = stencil_test.reference_value & stencil_test.input_mask;
        if (!compare(stencil_test.func, masked_ref, masked_dest)) {
            apply_stencil_action(stencil_test.action_stencil_fail);
            return false;
        }
    }

    // Convert the float depth to an integer covering the range of the depth format.
    const u32 num_bits = FramebufferRegs::DepthBitsPerPixel(fb_config.depth_format);
    const u32 z = static_cast<u32>(depth * ((1 << num_bits) - 1));

    if (merger.depth_test_enable) {
        const u32 stored_z = fb.GetDepth(px, py);
        if (!compare(merger.depth_test_func, z, stored_z)) {
            if (stencil_active) {
                apply_stencil_action(stencil_test.action_depth_fail);
            }
            return false;
        }
    }

    const bool depth_write_allowed =
        fb_config.allow_depth_stencil_write != 0 && merger.depth_write_enable;
    if (depth_write_allowed) {
        fb.SetDepth(px, py, z);
    }

    // The stencil depth_pass action is executed even if depth testing is disabled
    if (stencil_active) {
        apply_stencil_action(stencil_test.action_depth_pass);
    }

    return true;
}

} // namespace SwRenderer