diff --git a/src/common/vector_math.h b/src/common/vector_math.h index 8c14cf410..5f8d69ada 100644 --- a/src/common/vector_math.h +++ b/src/common/vector_math.h @@ -184,6 +184,8 @@ template } using Vec2f = Vec2; +using Vec2u = Vec2; +using Vec2i = Vec2; template <> inline float Vec2::Length() const { @@ -412,6 +414,8 @@ inline float Vec3::Normalize() { } using Vec3f = Vec3; +using Vec3u = Vec3; +using Vec3i = Vec3; template class Vec4 { @@ -623,6 +627,8 @@ template } using Vec4f = Vec4; +using Vec4u = Vec4; +using Vec4i = Vec4; template constexpr decltype(T{} * T{} + T{} * T{}) Dot(const Vec2& a, const Vec2& b) { diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6aca6d9e5..0c46e426b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -27,8 +27,17 @@ add_library(video_core STATIC common/buffer.h common/framebuffer.h common/pica_types.h + common/pica_uniforms.cpp + common/pica_uniforms.h + common/pipeline_cache.cpp + common/pipeline_cache.h + common/rasterizer.cpp + common/rasterizer.h common/rasterizer_cache.cpp common/rasterizer_cache.h + common/shader_runtime_cache.h + common/shader_disk_cache.cpp + common/shader_disk_cache.h common/shader_gen.cpp common/shader_gen.h common/shader.h diff --git a/src/video_core/common/backend.h b/src/video_core/common/backend.h index fa58689c9..7c61e8cb8 100644 --- a/src/video_core/common/backend.h +++ b/src/video_core/common/backend.h @@ -39,6 +39,9 @@ public: // Creates a backend specific sampler object virtual SamplerHandle CreateSampler(SamplerInfo info) = 0; + // Creates a backend specific shader object + virtual ShaderHandle CreateShader(ShaderStage stage, std::string_view name, std::string source) = 0; + // Start a draw operation virtual void Draw(PipelineHandle pipeline, FramebufferHandle draw_framebuffer, BufferHandle vertex_buffer, diff --git a/src/video_core/common/pica_uniforms.cpp b/src/video_core/common/pica_uniforms.cpp new file mode 100644 index 000000000..ce2390919 --- /dev/null +++ b/src/video_core/common/pica_uniforms.cpp @@ -0,0 +1,25 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include "video_core/common/pica_uniforms.h" + +namespace VideoCore { + +void PicaUniformsData::SetFromRegs(const Pica::ShaderRegs& regs, const Pica::Shader::ShaderSetup& setup) { + std::ranges::transform(setup.uniforms.b, bools.begin(), [](bool value) { + return BoolAligned{value ? true : false}; + }); + + std::ranges::transform(regs.int_uniforms, i.begin(), [](const auto& value) { + return Common::Vec4u{value.x.Value(), value.y.Value(), value.z.Value(), value.w.Value()}; + }); + + std::ranges::transform(setup.uniforms.f, f.begin(), [](const auto& value) { + return Common::Vec4f{value.x.ToFloat32(), value.y.ToFloat32(), + value.z.ToFloat32(), value.w.ToFloat32()}; + }); +} + +} // namespace VideoCore diff --git a/src/video_core/common/pica_uniforms.h b/src/video_core/common/pica_uniforms.h new file mode 100644 index 000000000..2bb9d265f --- /dev/null +++ b/src/video_core/common/pica_uniforms.h @@ -0,0 +1,96 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
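+//
+// Shared layout definitions for the PICA uniform data that the rasterizer uploads to the
+// backend uniform buffers. The structs below must stay in sync with the uniform blocks
+// declared in the generated shaders (see the static_asserts further down).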
+
+#pragma once
+
+#include <array>
+#include "common/vector_math.h"
+#include "video_core/regs_lighting.h"
+#include "video_core/regs_shader.h"
+#include "video_core/shader/shader.h"
+
+namespace VideoCore {
+
+enum class UniformBindings : u32 {
+    Common = 0,
+    VertexShader = 1,
+    GeometryShader = 2
+};
+
+struct LightSrc {
+    alignas(16) Common::Vec3f specular_0;
+    alignas(16) Common::Vec3f specular_1;
+    alignas(16) Common::Vec3f diffuse;
+    alignas(16) Common::Vec3f ambient;
+    alignas(16) Common::Vec3f position;
+    alignas(16) Common::Vec3f spot_direction; // negated
+    float dist_atten_bias;
+    float dist_atten_scale;
+};
+
+/**
+ * Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
+ * NOTE: Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
+ * the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
+ * Not following that rule will cause problems on some AMD drivers.
+ */
+struct UniformData {
+    int framebuffer_scale;
+    int alphatest_ref;
+    float depth_scale;
+    float depth_offset;
+    float shadow_bias_constant;
+    float shadow_bias_linear;
+    int scissor_x1;
+    int scissor_y1;
+    int scissor_x2;
+    int scissor_y2;
+    int fog_lut_offset;
+    int proctex_noise_lut_offset;
+    int proctex_color_map_offset;
+    int proctex_alpha_map_offset;
+    int proctex_lut_offset;
+    int proctex_diff_lut_offset;
+    float proctex_bias;
+    int shadow_texture_bias;
+    alignas(16) Common::Vec4i lighting_lut_offset[Pica::LightingRegs::NumLightingSampler / 4];
+    alignas(16) Common::Vec3f fog_color;
+    alignas(8) Common::Vec2f proctex_noise_f;
+    alignas(8) Common::Vec2f proctex_noise_a;
+    alignas(8) Common::Vec2f proctex_noise_p;
+    alignas(16) Common::Vec3f lighting_global_ambient;
+    LightSrc light_src[8];
+    alignas(16) Common::Vec4f const_color[6]; // A vec4 color for each of the six tev stages
+    alignas(16) Common::Vec4f tev_combiner_buffer_color;
+    alignas(16) Common::Vec4f clip_coef;
+};
+
+static_assert(sizeof(UniformData) == 0x4F0,
+              "The size of the UniformData structure has changed, update the structure in the shader");
+
+/**
+ * Uniform struct for the Uniform Buffer Object that contains PICA vertex/geometry shader uniforms.
+ * NOTE: the same rule from UniformData also applies here.
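+ * The PICA exposes 16 boolean, 4 integer and 96 floating-point uniform registers; with the
+ * 16-byte alignment used below the whole struct occupies 1856 bytes, matching the
+ * static_assert on VSUniformData.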
+ */ +struct PicaUniformsData { + void SetFromRegs(const Pica::ShaderRegs& regs, const Pica::Shader::ShaderSetup& setup); + + struct BoolAligned { + alignas(16) int b; + }; + + std::array bools; + alignas(16) std::array i; + alignas(16) std::array f; +}; + +struct VSUniformData { + PicaUniformsData uniforms; +}; + +static_assert(sizeof(VSUniformData) == 1856, + "The size of the VSUniformData structure has changed, update the structure in the shader"); + + +} // namespace VideoCore diff --git a/src/video_core/common/pipeline.h b/src/video_core/common/pipeline.h index 439251f4d..26beb072c 100644 --- a/src/video_core/common/pipeline.h +++ b/src/video_core/common/pipeline.h @@ -87,19 +87,19 @@ enum class AttribType : u8 { Float = 0, Int = 1, Short = 2, - Byte = 3 + Byte = 3, + Ubyte = 4 }; union VertexAttribute { - u8 value = 0; - BitField<0, 2, AttribType> type; - BitField<2, 3, u8> components; + BitField<0, 3, AttribType> type; + BitField<3, 3, u8> components; }; #pragma pack(1) struct VertexLayout { u8 stride = 0; - std::array attributes; + std::array attributes{}; }; #pragma pack() @@ -123,6 +123,7 @@ struct PipelineInfo { }; #pragma pack() +// An opaque handle to a backend specific program pipeline class PipelineBase : public IntrusivePtrEnabled { public: PipelineBase(PipelineType type, PipelineInfo info) : diff --git a/src/video_core/common/pipeline_cache.cpp b/src/video_core/common/pipeline_cache.cpp new file mode 100644 index 000000000..470270a90 --- /dev/null +++ b/src/video_core/common/pipeline_cache.cpp @@ -0,0 +1,317 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include +#include +#include "core/frontend/scope_acquire_context.h" +#include "video_core/common/pipeline_cache.h" +#include "video_core/common/shader.h" +#include "video_core/common/shader_gen.h" +#include "video_core/video_core.h" + +namespace VideoCore { + +static u64 GetUniqueIdentifier(const Pica::Regs& regs, std::span code) { + u64 hash = Common::ComputeHash64(regs.reg_array.data(), Pica::Regs::NUM_REGS * sizeof(u32)); + + if (code.size() > 0) { + u64 code_uid = Common::ComputeHash64(code.data(), code.size() * sizeof(u32)); + hash = Common::HashCombine(hash, code_uid); + } + + return hash; +} + +static auto BuildVSConfigFromRaw(const ShaderDiskCacheRaw& raw) { + Pica::Shader::ProgramCode program_code{}; + Pica::Shader::SwizzleData swizzle_data{}; + std::copy_n(raw.GetProgramCode().begin(), Pica::Shader::MAX_PROGRAM_CODE_LENGTH, + program_code.begin()); + std::copy_n(raw.GetProgramCode().begin() + Pica::Shader::MAX_PROGRAM_CODE_LENGTH, + Pica::Shader::MAX_SWIZZLE_DATA_LENGTH, swizzle_data.begin()); + Pica::Shader::ShaderSetup setup; + setup.program_code = program_code; + setup.swizzle_data = swizzle_data; + return std::make_tuple(PicaVSConfig{raw.GetRawShaderConfig().vs, setup}, setup); +} + +PipelineCache::PipelineCache(Frontend::EmuWindow& emu_window, std::unique_ptr& backend) + : emu_window(emu_window), backend(backend), pica_vertex_shaders(backend, generator), + fixed_geometry_shaders(backend, generator), fragment_shaders(backend, generator), + disk_cache(backend) { + //generator = std::make_unique program_code{setup.program_code.begin(), setup.program_code.end()}; + program_code.insert(program_code.end(), setup.swizzle_data.begin(), setup.swizzle_data.end()); + + // Hash the bytecode and save the pica program + const u64 unique_identifier = GetUniqueIdentifier(regs, program_code); + const ShaderDiskCacheRaw 
raw{unique_identifier, ProgramType::VertexShader, + regs, std::move(program_code)}; + + disk_cache.SaveRaw(raw); + disk_cache.SaveDecompiled(unique_identifier, shader_str.value(), + VideoCore::g_hw_shader_accurate_mul); + } + + return true; +} + +void PipelineCache::UseTrivialVertexShader() { + current_vertex_shader = trivial_vertex_shader; +} + +void PipelineCache::UseFixedGeometryShader(const Pica::Regs& regs) { + PicaFixedGSConfig gs_config{regs}; + auto [handle, _] = fixed_geometry_shaders.Get(gs_config); + current_geometry_shader = handle; +} + +void PipelineCache::UseTrivialGeometryShader() { + current_geometry_shader = ShaderHandle{}; +} + +void PipelineCache::UseFragmentShader(const Pica::Regs& regs) { + PicaFSConfig config{regs}; + auto [handle, shader_str] = fragment_shaders.Get(config); + current_fragment_shader = handle; + + // Save FS to the disk cache if its a new shader + if (shader_str.has_value()) { + u64 unique_identifier = GetUniqueIdentifier(regs, {}); + ShaderDiskCacheRaw raw{unique_identifier, ProgramType::FragmentShader, regs, {}}; + disk_cache.SaveRaw(raw); + disk_cache.SaveDecompiled(unique_identifier, shader_str.value(), false); + } +} + +void PipelineCache::LoadDiskCache(const std::atomic_bool& stop_loading, const DiskLoadCallback& callback) { + const auto transferable = disk_cache.LoadTransferable(); + if (!transferable.has_value()) { + return; + } + + const auto& raws = transferable.value(); + + // Load uncompressed precompiled file for non-separable shaders. + // Precompiled file for separable shaders is compressed. + auto [decompiled, dumps] = disk_cache.LoadPrecompiled(true); + + if (stop_loading) { + return; + } + + std::set supported_formats = GetSupportedFormats(); + + // Track if precompiled cache was altered during loading to know if we have to serialize the + // virtual precompiled cache file back to the hard drive + bool precompiled_cache_altered = false; + + std::mutex mutex; + std::atomic_bool compilation_failed = false; + if (callback) { + callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); + } + + std::vector load_raws_index; + for (u64 i = 0; i < raws.size(); i++) { + if (stop_loading || compilation_failed) { + return; + } + + const ShaderDiskCacheRaw& raw = raws[i]; + const u64 unique_identifier = raw.GetUniqueIdentifier(); + const u64 calculated_hash = GetUniqueIdentifier(raw.GetRawShaderConfig(), raw.GetProgramCode()); + + // Check for any data corruption + if (unique_identifier != calculated_hash) { + LOG_ERROR(Render_Vulkan, "Invalid hash in entry={:016x} (obtained hash={:016x}) - removing " + "shader cache", + raw.GetUniqueIdentifier(), calculated_hash); + + disk_cache.InvalidateAll(); + return; + } + + const auto dump = dumps.find(unique_identifier); + const auto decomp = decompiled.find(unique_identifier); + + ShaderHandle shader{}; + if (dump != dumps.end() && decomp != decompiled.end()) { + // Only load the vertex shader if its sanitize_mul setting matches + if (raw.GetProgramType() == ProgramType::VertexShader && + decomp->second.sanitize_mul != VideoCore::g_hw_shader_accurate_mul) { + continue; + } + + // If the shader is dumped, attempt to load it + shader = GeneratePrecompiledProgram(dump->second, supported_formats); + if (!shader.IsValid()) { + // If any shader failed, stop trying to compile, delete the cache, and start + // loading from raws + compilation_failed = true; + return; + } + + // We have both the binary shader and the decompiled, so inject it into the + // cache + if (raw.GetProgramType() == 
ProgramType::VertexShader) { + auto [conf, setup] = BuildVSConfigFromRaw(raw); + std::scoped_lock lock(mutex); + pica_vertex_shaders.Inject(conf, decomp->second.result, std::move(shader)); + + } else if (raw.GetProgramType() == ProgramType::FragmentShader) { + const PicaFSConfig conf{raw.GetRawShaderConfig()}; + std::scoped_lock lock(mutex); + fragment_shaders.Inject(conf, std::move(shader)); + + } else { + // Unsupported shader type got stored somehow so nuke the cache + LOG_CRITICAL(Frontend, "failed to load raw ProgramType {}", raw.GetProgramType()); + compilation_failed = true; + return; + } + } else { + // Since precompiled didn't have the dump, we'll load them in the next phase + std::scoped_lock lock(mutex); + load_raws_index.push_back(i); + } + + if (callback) { + callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); + } + } + + + // Invalidate the precompiled cache if a shader dumped shader was rejected + bool load_all_raws = false; + if (compilation_failed) { + disk_cache.InvalidatePrecompiled(); + dumps.clear(); + precompiled_cache_altered = true; + load_all_raws = true; + } + + const std::size_t load_raws_size = load_all_raws ? raws.size() : load_raws_index.size(); + + if (callback) { + callback(VideoCore::LoadCallbackStage::Build, 0, load_raws_size); + } + + compilation_failed = false; + + std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex + const auto LoadRawSepareble = [&](Frontend::GraphicsContext* context, std::size_t begin, + std::size_t end) { + Frontend::ScopeAcquireContext scope(*context); + for (u64 i = begin; i < end; ++i) { + if (stop_loading || compilation_failed) { + return; + } + + const u64 raws_index = load_all_raws ? i : load_raws_index[i]; + const auto& raw = raws[raws_index]; + const u64 unique_identifier = raw.GetUniqueIdentifier(); + + bool sanitize_mul = false; + ShaderHandle shader{nullptr}; + std::optional result; + + // Otherwise decompile and build the shader at boot and save the result to the + // precompiled file + if (raw.GetProgramType() == ProgramType::VertexShader) { + auto [conf, setup] = BuildVSConfigFromRaw(raw); + result = generator->GenerateVertexShader(setup, conf); + + // Compile shader + shader = backend->CreateShader(ShaderStage::Vertex, "Vertex shader", result.value()); + shader->Compile(ShaderOptimization::Debug); + + sanitize_mul = conf.sanitize_mul; + std::scoped_lock lock(mutex); + pica_vertex_shaders.Inject(conf, result.value(), std::move(shader)); + + } else if (raw.GetProgramType() == ProgramType::FragmentShader) { + const PicaFSConfig conf{raw.GetRawShaderConfig()}; + result = generator->GenerateFragmentShader(conf); + + // Compile shader + shader = backend->CreateShader(ShaderStage::Fragment, "Fragment shader", result.value()); + shader->Compile(ShaderOptimization::Debug); + + std::scoped_lock lock(mutex); + fragment_shaders.Inject(conf, std::move(shader)); + + } else { + // Unsupported shader type got stored somehow so nuke the cache + LOG_ERROR(Frontend, "Failed to load raw ProgramType {}", raw.GetProgramType()); + compilation_failed = true; + return; + } + + if (!shader.IsValid()) { + LOG_ERROR(Frontend, "Compilation from raw failed {:x} {:x}", + raw.GetProgramCode()[0], raw.GetProgramCode()[1]); + compilation_failed = true; + return; + } + + std::scoped_lock lock(mutex); + + // If this is a new separable shader, add it the precompiled cache + if (result) { + disk_cache.SaveDecompiled(unique_identifier, *result, sanitize_mul); + disk_cache.SaveDump(unique_identifier, 
shader);
+                precompiled_cache_altered = true;
+            }
+
+            if (callback) {
+                callback(VideoCore::LoadCallbackStage::Build, ++built_shaders, load_raws_size);
+            }
+        }
+    };
+
+    const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())};
+    const std::size_t bucket_size{load_raws_size / num_workers};
+    std::vector<std::unique_ptr<Frontend::GraphicsContext>> contexts(num_workers);
+    std::vector<std::thread> threads(num_workers);
+
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        const bool is_last_worker = i + 1 == num_workers;
+        const std::size_t start{bucket_size * i};
+        const std::size_t end{is_last_worker ? load_raws_size : start + bucket_size};
+
+        // On some platforms the shared context has to be created from the GUI thread
+        contexts[i] = emu_window.CreateSharedContext();
+        threads[i] = std::thread(LoadRawSepareble, contexts[i].get(), start, end);
+    }
+
+    for (auto& thread : threads) {
+        thread.join();
+    }
+
+    if (compilation_failed) {
+        disk_cache.InvalidateAll();
+    }
+
+    if (precompiled_cache_altered) {
+        disk_cache.SaveVirtualPrecompiledFile();
+    }
+}
+
+} // namespace VideoCore
diff --git a/src/video_core/common/pipeline_cache.h b/src/video_core/common/pipeline_cache.h
new file mode 100644
index 000000000..1ef75645d
--- /dev/null
+++ b/src/video_core/common/pipeline_cache.h
@@ -0,0 +1,75 @@
+// Copyright 2022 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include
+#include "video_core/regs.h"
+#include "video_core/common/shader_runtime_cache.h"
+#include "video_core/common/shader_disk_cache.h"
+
+namespace FileUtil {
+class IOFile;
+}
+
+namespace Core {
+class System;
+}
+
+namespace Frontend {
+class EmuWindow;
+}
+
+namespace VideoCore {
+
+enum class LoadCallbackStage : u8 {
+    Prepare = 0,
+    Decompile = 1,
+    Build = 2,
+    Complete = 3,
+};
+
+using DiskLoadCallback = std::function<void(LoadCallbackStage, std::size_t, std::size_t)>;
+
+// A class that manages and caches shaders and pipelines
+class PipelineCache {
+public:
+    PipelineCache(Frontend::EmuWindow& emu_window, std::unique_ptr& backend);
+    ~PipelineCache() = default;
+
+    // Loads backend specific shader binaries from disk
+    void LoadDiskCache(const std::atomic_bool& stop_loading, const DiskLoadCallback& callback);
+
+    bool UsePicaVertexShader(const Pica::Regs& config, Pica::Shader::ShaderSetup& setup);
+    void UseTrivialVertexShader();
+
+    void UseFixedGeometryShader(const Pica::Regs& regs);
+    void UseTrivialGeometryShader();
+
+    // Compiles and caches a fragment shader based on the current pica state
+    void UseFragmentShader(const Pica::Regs& config);
+
+private:
+    Frontend::EmuWindow& emu_window;
+    std::unique_ptr& backend;
+    std::unique_ptr generator;
+
+    // Keeps all the compiled graphics pipelines
+    std::unordered_map cached_pipelines;
+
+    // Current shaders
+    ShaderHandle current_vertex_shader;
+    ShaderHandle current_geometry_shader;
+    ShaderHandle current_fragment_shader;
+
+    // Pica runtime shader caches
+    PicaVertexShaders pica_vertex_shaders;
+    FixedGeometryShaders fixed_geometry_shaders;
+    FragmentShaders fragment_shaders;
+    ShaderHandle trivial_vertex_shader;
+
+    // Serializes shader binaries to disk
+    ShaderDiskCache disk_cache;
+};
+} // namespace VideoCore
diff --git a/src/video_core/common/rasterizer.cpp b/src/video_core/common/rasterizer.cpp
new file mode 100644
index 000000000..2185e53a2
--- /dev/null
+++ b/src/video_core/common/rasterizer.cpp
@@ -0,0 +1,2213 @@
+// Copyright 2022 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
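+//
+// Hardware rasterizer that translates the PICA draw state into the backend abstraction
+// (buffers, textures, samplers and pipelines) defined in video_core/common/backend.h.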
+ +#include +#include +#include +#include "common/alignment.h" +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/math_util.h" +#include "common/microprofile.h" +#include "core/hw/gpu.h" +#include "video_core/pica_state.h" +#include "video_core/regs_framebuffer.h" +#include "video_core/regs_rasterizer.h" +#include "video_core/common/backend.h" +#include "video_core/common/rasterizer.h" +#include "video_core/common/shader_gen.h" +#include "video_core/video_core.h" + +namespace VideoCore { + +using PixelFormat = SurfaceParams::PixelFormat; +using SurfaceType = SurfaceParams::SurfaceType; + +MICROPROFILE_DEFINE(VertexSetup, "Rasterizer", "Vertex Setup", MP_RGB(255, 128, 0)); +MICROPROFILE_DEFINE(VertexShader, "Rasterizer", "Vertex Shader Setup", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(GeometryShader, "Rasterizer", "Geometry Shader Setup", MP_RGB(128, 192, 128)); +MICROPROFILE_DEFINE(Drawing, "Rasterizer", "Drawing", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(Blits, "Rasterizer", "Blits", MP_RGB(100, 100, 255)); +MICROPROFILE_DEFINE(CacheManagement, "Rasterizer", "Cache Management", MP_RGB(100, 255, 100)); + +HardwareVertex::HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) { + position[0] = v.pos.x.ToFloat32(); + position[1] = v.pos.y.ToFloat32(); + position[2] = v.pos.z.ToFloat32(); + position[3] = v.pos.w.ToFloat32(); + color[0] = v.color.x.ToFloat32(); + color[1] = v.color.y.ToFloat32(); + color[2] = v.color.z.ToFloat32(); + color[3] = v.color.w.ToFloat32(); + tex_coord0[0] = v.tc0.x.ToFloat32(); + tex_coord0[1] = v.tc0.y.ToFloat32(); + tex_coord1[0] = v.tc1.x.ToFloat32(); + tex_coord1[1] = v.tc1.y.ToFloat32(); + tex_coord2[0] = v.tc2.x.ToFloat32(); + tex_coord2[1] = v.tc2.y.ToFloat32(); + tex_coord0_w = v.tc0_w.ToFloat32(); + normquat[0] = v.quat.x.ToFloat32(); + normquat[1] = v.quat.y.ToFloat32(); + normquat[2] = v.quat.z.ToFloat32(); + normquat[3] = v.quat.w.ToFloat32(); + view[0] = v.view.x.ToFloat32(); + view[1] = v.view.y.ToFloat32(); + view[2] = v.view.z.ToFloat32(); + + if (flip_quaternion) { + normquat = -normquat; + } +} + +constexpr VertexLayout HardwareVertex::GetVertexLayout() { + std::array attributes; + attributes[ATTRIBUTE_POSITION].components.Assign(4); + attributes[ATTRIBUTE_POSITION].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_COLOR].components.Assign(4); + attributes[ATTRIBUTE_COLOR].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_TEXCOORD0].components.Assign(2); + attributes[ATTRIBUTE_TEXCOORD0].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_TEXCOORD1].components.Assign(2); + attributes[ATTRIBUTE_TEXCOORD1].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_TEXCOORD2].components.Assign(2); + attributes[ATTRIBUTE_TEXCOORD2].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_TEXCOORD0_W].components.Assign(1); + attributes[ATTRIBUTE_TEXCOORD0_W].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_NORMQUAT].components.Assign(4); + attributes[ATTRIBUTE_NORMQUAT].type.Assign(AttribType::Float); + attributes[ATTRIBUTE_VIEW].components.Assign(3); + attributes[ATTRIBUTE_VIEW].type.Assign(AttribType::Float); + + return VertexLayout{sizeof(HardwareVertex), std::move(attributes)}; +} + +// Rasterizer pipeline layout +static constexpr PipelineLayoutInfo RASTERIZER_PIPELINE_INFO = { + .group_count = 3, + .binding_groups = { + // Uniform + LUT set + BindingGroup{ + BindingType::Uniform, + BindingType::Uniform, + BindingType::TexelBuffer, + BindingType::TexelBuffer + }, // Texture unit set 
+ BindingGroup{ + BindingType::Texture, + BindingType::Texture, + BindingType::Texture, + BindingType::Texture + }, // Texture unit sampler set + BindingGroup{ + BindingType::Sampler, + BindingType::Sampler, + BindingType::Sampler, + BindingType::Sampler + } + } +}; + +// Define information about the rasterizer buffers +static constexpr BufferInfo VERTEX_BUFFER_INFO = { + .capacity = 16 * 1024 * 1024, + .usage = BufferUsage::Vertex +}; + +static constexpr BufferInfo INDEX_BUFFER_INFO = { + .capacity = 1 * 1024 * 1024, + .usage = BufferUsage::Index +}; + +static constexpr BufferInfo UNIFORM_BUFFER_INFO = { + .capacity = 2 * 1024 * 1024, + .usage = BufferUsage::Uniform +}; + +static constexpr BufferInfo TEXEL_BUFFER_INFO = { + .capacity = 1 * 1024 * 1024, + .usage = BufferUsage::Texel, + .views = { + ViewFormat::R32G32Float + } +}; + +static constexpr BufferInfo TEXEL_BUFFER_LF_INFO = { + .capacity = 1 * 1024 * 1024, + .usage = BufferUsage::Texel, + .views = { + ViewFormat::R32G32Float, + ViewFormat::R32G32B32A32Float + } +}; + +Rasterizer::Rasterizer(Frontend::EmuWindow& emu_window, std::unique_ptr& backend) : + backend(backend), res_cache(backend) { + + // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0 + //state.clip_distance[0] = true; + + const TextureInfo clear_info = { + .width = 1, + .height = 1, + .levels = 1, + .type = TextureType::Texture2D, + .view_type = TextureViewType::View2D, + .format = TextureFormat::RGBA8 + }; + + // Create a 1x1 clear texture to use in the NULL case + std::array clear_data = {0, 0, 0, 1}; + clear_texture = backend->CreateTexture(clear_info); + clear_texture->Upload(Rect2D{0, 0, 1, 1}, 1, clear_data); + + // Create rasterizer buffers + vertex_buffer = backend->CreateBuffer(VERTEX_BUFFER_INFO); + index_buffer = backend->CreateBuffer(INDEX_BUFFER_INFO); + uniform_buffer = backend->CreateBuffer(UNIFORM_BUFFER_INFO); + texel_buffer_lut = backend->CreateBuffer(TEXEL_BUFFER_INFO); + texel_buffer_lut_lf = backend->CreateBuffer(TEXEL_BUFFER_LF_INFO); + + /*glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); + uniform_size_aligned_vs = + Common::AlignUp(sizeof(VSUniformData), uniform_buffer_alignment); + uniform_size_aligned_fs = + Common::AlignUp(sizeof(UniformData), uniform_buffer_alignment); + */ + + shader_program_manager = std::make_unique( + emu_window, GLAD_GL_ARB_separate_shader_objects, is_amd); + + // Synchronize pica state + SyncEntireState(); +} + +Rasterizer::~Rasterizer() = default; + +void Rasterizer::LoadDiskResources(const std::atomic_bool& stop_loading, + const VideoCore::DiskResourceLoadCallback& callback) { + shader_program_manager->LoadDiskCache(stop_loading, callback); +} + +void Rasterizer::SyncEntireState() { + // Sync fixed function OpenGL state + SyncClipEnabled(); + SyncCullMode(); + SyncBlendEnabled(); + SyncBlendFuncs(); + SyncBlendColor(); + SyncLogicOp(); + SyncStencilTest(); + SyncDepthTest(); + SyncColorWriteMask(); + SyncStencilWriteMask(); + SyncDepthWriteMask(); + + // Sync uniforms + SyncClipCoef(); + SyncDepthScale(); + SyncDepthOffset(); + SyncAlphaTest(); + SyncCombinerColor(); + auto& tev_stages = Pica::g_state.regs.texturing.GetTevStages(); + for (std::size_t index = 0; index < tev_stages.size(); ++index) + SyncTevConstColor(index, tev_stages[index]); + + SyncGlobalAmbient(); + for (unsigned light_index = 0; light_index < 8; light_index++) { + SyncLightSpecular0(light_index); + SyncLightSpecular1(light_index); + SyncLightDiffuse(light_index); + SyncLightAmbient(light_index); + 
SyncLightPosition(light_index); + SyncLightDistanceAttenuationBias(light_index); + SyncLightDistanceAttenuationScale(light_index); + } + + SyncFogColor(); + SyncProcTexNoise(); + SyncProcTexBias(); + SyncShadowBias(); + SyncShadowTextureBias(); +} + +/** + * This is a helper function to resolve an issue when interpolating opposite quaternions. See below + * for a detailed description of this issue (yuriks): + * + * For any rotation, there are two quaternions Q, and -Q, that represent the same rotation. If you + * interpolate two quaternions that are opposite, instead of going from one rotation to another + * using the shortest path, you'll go around the longest path. You can test if two quaternions are + * opposite by checking if Dot(Q1, Q2) < 0. In that case, you can flip either of them, therefore + * making Dot(Q1, -Q2) positive. + * + * This solution corrects this issue per-vertex before passing the quaternions to OpenGL. This is + * correct for most cases but can still rotate around the long way sometimes. An implementation + * which did `lerp(lerp(Q1, Q2), Q3)` (with proper weighting), applying the dot product check + * between each step would work for those cases at the cost of being more complex to implement. + * + * Fortunately however, the 3DS hardware happens to also use this exact same logic to work around + * these issues, making this basic implementation actually more accurate to the hardware. + */ +static bool AreQuaternionsOpposite(Common::Vec4 qa, Common::Vec4 qb) { + Common::Vec4f a{qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32()}; + Common::Vec4f b{qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32()}; + + return (Common::Dot(a, b) < 0.f); +} + +void Rasterizer::AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { + vertex_batch.emplace_back(v0, false); + vertex_batch.emplace_back(v1, AreQuaternionsOpposite(v0.quat, v1.quat)); + vertex_batch.emplace_back(v2, AreQuaternionsOpposite(v0.quat, v2.quat)); +} + +static constexpr std::array vs_attrib_types = { + AttribType::Byte, // VertexAttributeFormat::BYTE + AttribType::Ubyte, // VertexAttributeFormat::UBYTE + AttribType::Short, // VertexAttributeFormat::SHORT + AttribType::Float // VertexAttributeFormat::FLOAT +}; + +struct VertexArrayInfo { + u32 vs_input_index_min; + u32 vs_input_index_max; + u32 vs_input_size; +}; + +Rasterizer::VertexArrayInfo Rasterizer::AnalyzeVertexArray(bool is_indexed) { + const auto& regs = Pica::g_state.regs; + const auto& vertex_attributes = regs.pipeline.vertex_attributes; + + u32 vertex_min; + u32 vertex_max; + if (is_indexed) { + const auto& index_info = regs.pipeline.index_array; + const PAddr address = vertex_attributes.GetPhysicalBaseAddress() + index_info.offset; + const u8* index_address_8 = VideoCore::g_memory->GetPhysicalPointer(address); + const u16* index_address_16 = reinterpret_cast(index_address_8); + const bool index_u16 = index_info.format != 0; + + vertex_min = 0xFFFF; + vertex_max = 0; + const u32 size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1); + res_cache.FlushRegion(address, size, nullptr); + for (u32 index = 0; index < regs.pipeline.num_vertices; ++index) { + const u32 vertex = index_u16 ? 
index_address_16[index] : index_address_8[index]; + vertex_min = std::min(vertex_min, vertex); + vertex_max = std::max(vertex_max, vertex); + } + } else { + vertex_min = regs.pipeline.vertex_offset; + vertex_max = regs.pipeline.vertex_offset + regs.pipeline.num_vertices - 1; + } + + const u32 vertex_num = vertex_max - vertex_min + 1; + u32 vs_input_size = 0; + for (const auto& loader : vertex_attributes.attribute_loaders) { + if (loader.component_count != 0) { + vs_input_size += loader.byte_count * vertex_num; + } + } + + return {vertex_min, vertex_max, vs_input_size}; +} + +void Rasterizer::SetupVertexArray(u8* array_ptr, u32 buffer_offset, u32 vs_input_index_min, u32 vs_input_index_max) { + MICROPROFILE_SCOPE(VertexSetup); + const auto& regs = Pica::g_state.regs; + const auto& vertex_attributes = regs.pipeline.vertex_attributes; + PAddr base_address = vertex_attributes.GetPhysicalBaseAddress(); + + std::array enable_attributes{}; + for (const auto& loader : vertex_attributes.attribute_loaders) { + if (loader.component_count == 0 || loader.byte_count == 0) { + continue; + } + + u32 offset = 0; + for (u32 comp = 0; comp < loader.component_count && comp < 12; ++comp) { + u32 attribute_index = loader.GetComponent(comp); + if (attribute_index < 12) { + if (vertex_attributes.GetNumElements(attribute_index) != 0) { + offset = Common::AlignUp(offset, vertex_attributes.GetElementSizeInBytes(attribute_index)); + + u32 input_reg = regs.vs.GetRegisterForAttribute(attribute_index); + u32 size = vertex_attributes.GetNumElements(attribute_index); + AttribType type = vs_attrib_types[static_cast(vertex_attributes.GetFormat(attribute_index))]; + + std::size_t stride = loader.byte_count; + glVertexAttribPointer(input_reg, size, type, GL_FALSE, stride, + reinterpret_cast(buffer_offset + offset)); + enable_attributes[input_reg] = true; + + offset += vertex_attributes.GetStride(attribute_index); + } + } else { + // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, + // respectively + offset = Common::AlignUp(offset, 4); + offset += (attribute_index - 11) * 4; + } + } + + PAddr data_addr = + base_address + loader.data_offset + (vs_input_index_min * loader.byte_count); + + u32 vertex_num = vs_input_index_max - vs_input_index_min + 1; + u32 data_size = loader.byte_count * vertex_num; + + res_cache.FlushRegion(data_addr, data_size, nullptr); + std::memcpy(array_ptr, VideoCore::g_memory->GetPhysicalPointer(data_addr), data_size); + + array_ptr += data_size; + buffer_offset += data_size; + } + + for (std::size_t i = 0; i < enable_attributes.size(); ++i) { + if (enable_attributes[i] != hw_vao_enabled_attributes[i]) { + if (enable_attributes[i]) { + glEnableVertexAttribArray(static_cast(i)); + } else { + glDisableVertexAttribArray(static_cast(i)); + } + hw_vao_enabled_attributes[i] = enable_attributes[i]; + } + + if (vertex_attributes.IsDefaultAttribute(i)) { + const u32 reg = regs.vs.GetRegisterForAttribute(i); + if (!enable_attributes[reg]) { + const auto& attr = Pica::g_state.input_default_attributes.attr[i]; + glVertexAttrib4f(reg, attr.x.ToFloat32(), attr.y.ToFloat32(), attr.z.ToFloat32(), + attr.w.ToFloat32()); + } + } + } +} + +bool Rasterizer::AccelerateDrawBatch(bool is_indexed) { + const auto& regs = Pica::g_state.regs; + if (regs.pipeline.use_gs != Pica::PipelineRegs::UseGS::No) { + if (regs.pipeline.gs_config.mode != Pica::PipelineRegs::GSMode::Point) { + return false; + } + + if (regs.pipeline.triangle_topology != Pica::TriangleTopology::Shader) { + return false; + } + + 
LOG_ERROR(Render_Vulkan, "Accelerate draw doesn't support geometry shader"); + return false; + } + + // Setup vertex shader + MICROPROFILE_SCOPE(VertexShader); + if (!shader_program_manager->UseProgrammableVertexShader(regs, Pica::g_state.vs)) { + return false; + } + + // Setup geometry shader + MICROPROFILE_SCOPE(GeometryShader); + shader_program_manager->UseFixedGeometryShader(regs); + + return Draw(true, is_indexed); +} + +bool Rasterizer::AccelerateDrawBatchInternal(bool is_indexed) { + const auto& regs = Pica::g_state.regs; + GLenum primitive_mode = GetCurrentPrimitiveMode(); + + auto [vs_input_index_min, vs_input_index_max, vs_input_size] = AnalyzeVertexArray(is_indexed); + + if (vs_input_size > VERTEX_BUFFER_SIZE) { + LOG_WARNING(Render_OpenGL, "Too large vertex input size {}", vs_input_size); + return false; + } + + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + state.Apply(); + + u8* buffer_ptr; + GLintptr buffer_offset; + std::tie(buffer_ptr, buffer_offset, std::ignore) = vertex_buffer.Map(vs_input_size, 4); + SetupVertexArray(buffer_ptr, buffer_offset, vs_input_index_min, vs_input_index_max); + vertex_buffer.Unmap(vs_input_size); + + shader_program_manager->ApplyTo(state); + state.Apply(); + + if (is_indexed) { + bool index_u16 = regs.pipeline.index_array.format != 0; + std::size_t index_buffer_size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1); + + if (index_buffer_size > INDEX_BUFFER_SIZE) { + LOG_WARNING(Render_OpenGL, "Too large index input size {}", index_buffer_size); + return false; + } + + const u8* index_data = VideoCore::g_memory->GetPhysicalPointer( + regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() + + regs.pipeline.index_array.offset); + std::tie(buffer_ptr, buffer_offset, std::ignore) = index_buffer.Map(index_buffer_size, 4); + std::memcpy(buffer_ptr, index_data, index_buffer_size); + index_buffer.Unmap(index_buffer_size); + + glDrawRangeElementsBaseVertex( + primitive_mode, vs_input_index_min, vs_input_index_max, regs.pipeline.num_vertices, + index_u16 ? 
GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, + reinterpret_cast(buffer_offset), -static_cast(vs_input_index_min)); + } else { + glDrawArrays(primitive_mode, 0, regs.pipeline.num_vertices); + } + return true; +} + +void Rasterizer::DrawTriangles() { + if (vertex_batch.empty()) + return; + Draw(false, false); +} + +bool Rasterizer::Draw(bool accelerate, bool is_indexed) { + MICROPROFILE_SCOPE(OpenGL_Drawing); + const auto& regs = Pica::g_state.regs; + + bool shadow_rendering = regs.framebuffer.output_merger.fragment_operation_mode == + Pica::FramebufferRegs::FragmentOperationMode::Shadow; + + const bool has_stencil = + regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8; + + const bool write_color_fb = shadow_rendering || state.color_mask.red_enabled == GL_TRUE || + state.color_mask.green_enabled == GL_TRUE || + state.color_mask.blue_enabled == GL_TRUE || + state.color_mask.alpha_enabled == GL_TRUE; + + const bool write_depth_fb = + (state.depth.test_enabled && state.depth.write_mask == GL_TRUE) || + (has_stencil && state.stencil.test_enabled && state.stencil.write_mask != 0); + + const bool using_color_fb = + regs.framebuffer.framebuffer.GetColorBufferPhysicalAddress() != 0 && write_color_fb; + const bool using_depth_fb = + !shadow_rendering && regs.framebuffer.framebuffer.GetDepthBufferPhysicalAddress() != 0 && + (write_depth_fb || regs.framebuffer.output_merger.depth_test_enable != 0 || + (has_stencil && state.stencil.test_enabled)); + + Common::Rectangle viewport_rect_unscaled{ + // These registers hold half-width and half-height, so must be multiplied by 2 + regs.rasterizer.viewport_corner.x, // left + regs.rasterizer.viewport_corner.y + // top + static_cast(Pica::float24::FromRaw(regs.rasterizer.viewport_size_y).ToFloat32() * + 2), + regs.rasterizer.viewport_corner.x + // right + static_cast(Pica::float24::FromRaw(regs.rasterizer.viewport_size_x).ToFloat32() * + 2), + regs.rasterizer.viewport_corner.y // bottom + }; + + Surface color_surface; + Surface depth_surface; + Common::Rectangle surfaces_rect; + std::tie(color_surface, depth_surface, surfaces_rect) = + res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb, viewport_rect_unscaled); + + const u16 res_scale = color_surface != nullptr + ? color_surface->res_scale + : (depth_surface == nullptr ? 
1u : depth_surface->res_scale); + + Common::Rectangle draw_rect{ + static_cast(std::clamp(static_cast(surfaces_rect.left) + + viewport_rect_unscaled.left * res_scale, + surfaces_rect.left, surfaces_rect.right)), // Left + static_cast(std::clamp(static_cast(surfaces_rect.bottom) + + viewport_rect_unscaled.top * res_scale, + surfaces_rect.bottom, surfaces_rect.top)), // Top + static_cast(std::clamp(static_cast(surfaces_rect.left) + + viewport_rect_unscaled.right * res_scale, + surfaces_rect.left, surfaces_rect.right)), // Right + static_cast(std::clamp(static_cast(surfaces_rect.bottom) + + viewport_rect_unscaled.bottom * res_scale, + surfaces_rect.bottom, surfaces_rect.top))}; // Bottom + + // Bind the framebuffer surfaces + state.draw.draw_framebuffer = framebuffer.handle; + state.Apply(); + + if (shadow_rendering) { + if (!allow_shadow || color_surface == nullptr) { + return true; + } + glFramebufferParameteri(GL_DRAW_FRAMEBUFFER, GL_FRAMEBUFFER_DEFAULT_WIDTH, + color_surface->width * color_surface->res_scale); + glFramebufferParameteri(GL_DRAW_FRAMEBUFFER, GL_FRAMEBUFFER_DEFAULT_HEIGHT, + color_surface->height * color_surface->res_scale); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + state.image_shadow_buffer = color_surface->texture.handle; + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, + color_surface != nullptr ? color_surface->texture.handle : 0, 0); + if (depth_surface != nullptr) { + if (has_stencil) { + // attach both depth and stencil + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, + GL_TEXTURE_2D, depth_surface->texture.handle, 0); + } else { + // attach depth + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, + depth_surface->texture.handle, 0); + // clear stencil attachment + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } + } else { + // clear both depth and stencil attachment + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, + 0, 0); + } + } + + // Sync the viewport + state.viewport.x = + static_cast(surfaces_rect.left) + viewport_rect_unscaled.left * res_scale; + state.viewport.y = + static_cast(surfaces_rect.bottom) + viewport_rect_unscaled.bottom * res_scale; + state.viewport.width = static_cast(viewport_rect_unscaled.GetWidth() * res_scale); + state.viewport.height = static_cast(viewport_rect_unscaled.GetHeight() * res_scale); + + if (uniform_block_data.data.framebuffer_scale != res_scale) { + uniform_block_data.data.framebuffer_scale = res_scale; + uniform_block_data.dirty = true; + } + + // Scissor checks are window-, not viewport-relative, which means that if the cached texture + // sub-rect changes, the scissor bounds also need to be updated. + GLint scissor_x1 = + static_cast(surfaces_rect.left + regs.rasterizer.scissor_test.x1 * res_scale); + GLint scissor_y1 = + static_cast(surfaces_rect.bottom + regs.rasterizer.scissor_test.y1 * res_scale); + // x2, y2 have +1 added to cover the entire pixel area, otherwise you might get cracks when + // scaling or doing multisampling. 
+ GLint scissor_x2 = + static_cast(surfaces_rect.left + (regs.rasterizer.scissor_test.x2 + 1) * res_scale); + GLint scissor_y2 = static_cast(surfaces_rect.bottom + + (regs.rasterizer.scissor_test.y2 + 1) * res_scale); + + if (uniform_block_data.data.scissor_x1 != scissor_x1 || + uniform_block_data.data.scissor_x2 != scissor_x2 || + uniform_block_data.data.scissor_y1 != scissor_y1 || + uniform_block_data.data.scissor_y2 != scissor_y2) { + + uniform_block_data.data.scissor_x1 = scissor_x1; + uniform_block_data.data.scissor_x2 = scissor_x2; + uniform_block_data.data.scissor_y1 = scissor_y1; + uniform_block_data.data.scissor_y2 = scissor_y2; + uniform_block_data.dirty = true; + } + + bool need_duplicate_texture = false; + auto CheckBarrier = [&need_duplicate_texture, &color_surface](GLuint handle) { + if (color_surface && color_surface->texture.handle == handle) { + need_duplicate_texture = true; + } + }; + + // Sync and bind the texture surfaces + const auto pica_textures = regs.texturing.GetTextures(); + for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { + const auto& texture = pica_textures[texture_index]; + + if (texture.enabled) { + if (texture_index == 0) { + using TextureType = Pica::TexturingRegs::TextureConfig::TextureType; + switch (texture.config.type.Value()) { + case TextureType::Shadow2D: { + if (!allow_shadow) + continue; + + Surface surface = res_cache.GetTextureSurface(texture); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_px = surface->texture.handle); + } else { + state.image_shadow_texture_px = 0; + } + continue; + } + case TextureType::ShadowCube: { + if (!allow_shadow) + continue; + Pica::Texture::TextureInfo info = Pica::Texture::TextureInfo::FromPicaRegister( + texture.config, texture.format); + Surface surface; + + using CubeFace = Pica::TexturingRegs::CubeFace; + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveX); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_px = surface->texture.handle); + } else { + state.image_shadow_texture_px = 0; + } + + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeX); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_nx = surface->texture.handle); + } else { + state.image_shadow_texture_nx = 0; + } + + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveY); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_py = surface->texture.handle); + } else { + state.image_shadow_texture_py = 0; + } + + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeY); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_ny = surface->texture.handle); + } else { + state.image_shadow_texture_ny = 0; + } + + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveZ); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_pz = surface->texture.handle); + } else { + state.image_shadow_texture_pz = 0; + } + + info.physical_address = + regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeZ); + surface = res_cache.GetTextureSurface(info); + if (surface != nullptr) { + CheckBarrier(state.image_shadow_texture_nz = 
surface->texture.handle); + } else { + state.image_shadow_texture_nz = 0; + } + + continue; + } + case TextureType::TextureCube: + using CubeFace = Pica::TexturingRegs::CubeFace; + TextureCubeConfig config; + config.px = regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveX); + config.nx = regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeX); + config.py = regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveY); + config.ny = regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeY); + config.pz = regs.texturing.GetCubePhysicalAddress(CubeFace::PositiveZ); + config.nz = regs.texturing.GetCubePhysicalAddress(CubeFace::NegativeZ); + config.width = texture.config.width; + config.format = texture.format; + state.texture_cube_unit.texture_cube = + res_cache.GetTextureCube(config).texture.handle; + + texture_cube_sampler.SyncWithConfig(texture.config); + state.texture_units[texture_index].texture_2d = 0; + continue; // Texture unit 0 setup finished. Continue to next unit + default: + state.texture_cube_unit.texture_cube = 0; + } + } + + texture_samplers[texture_index].SyncWithConfig(texture.config); + Surface surface = res_cache.GetTextureSurface(texture); + if (surface != nullptr) { + CheckBarrier(state.texture_units[texture_index].texture_2d = + surface->texture.handle); + } else { + // Can occur when texture addr is null or its memory is unmapped/invalid + // HACK: In this case, the correct behaviour for the PICA is to use the last + // rendered colour. But because this would be impractical to implement, the + // next best alternative is to use a clear texture, essentially skipping + // the geometry in question. + // For example: a bug in Pokemon X/Y causes NULL-texture squares to be drawn + // on the male character's face, which in the OpenGL default appear black. + state.texture_units[texture_index].texture_2d = default_texture; + } + } else { + state.texture_units[texture_index].texture_2d = 0; + } + } + + OGLTexture temp_tex; + if (need_duplicate_texture && (GLAD_GL_ARB_copy_image || GLES)) { + // The game is trying to use a surface as a texture and framebuffer at the same time + // which causes unpredictable behavior on the host. + // Making a copy to sample from eliminates this issue and seems to be fairly cheap. 
+ temp_tex.Create(); + glBindTexture(GL_TEXTURE_2D, temp_tex.handle); + auto [internal_format, format, type] = GetFormatTuple(color_surface->pixel_format); + OGLTexture::Allocate(GL_TEXTURE_2D, color_surface->max_level + 1, internal_format, format, + type, color_surface->GetScaledWidth(), + color_surface->GetScaledHeight()); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glBindTexture(GL_TEXTURE_2D, state.texture_units[0].texture_2d); + + for (std::size_t level{0}; level <= color_surface->max_level; ++level) { + glCopyImageSubData(color_surface->texture.handle, GL_TEXTURE_2D, level, 0, 0, 0, + temp_tex.handle, GL_TEXTURE_2D, level, 0, 0, 0, + color_surface->GetScaledWidth() >> level, + color_surface->GetScaledHeight() >> level, 1); + } + + for (auto& unit : state.texture_units) { + if (unit.texture_2d == color_surface->texture.handle) { + unit.texture_2d = temp_tex.handle; + } + } + for (auto shadow_unit : {&state.image_shadow_texture_nx, &state.image_shadow_texture_ny, + &state.image_shadow_texture_nz, &state.image_shadow_texture_px, + &state.image_shadow_texture_py, &state.image_shadow_texture_pz}) { + if (*shadow_unit == color_surface->texture.handle) { + *shadow_unit = temp_tex.handle; + } + } + } + + // Sync and bind the shader + if (shader_dirty) { + SetShader(); + shader_dirty = false; + } + + // Sync the LUTs within the texture buffer + SyncAndUploadLUTs(); + SyncAndUploadLUTsLF(); + + // Sync the uniform data + UploadUniforms(accelerate); + + // Viewport can have negative offsets or larger + // dimensions than our framebuffer sub-rect. + // Enable scissor test to prevent drawing + // outside of the framebuffer region + state.scissor.enabled = true; + state.scissor.x = draw_rect.left; + state.scissor.y = draw_rect.bottom; + state.scissor.width = draw_rect.GetWidth(); + state.scissor.height = draw_rect.GetHeight(); + state.Apply(); + + // Draw the vertex batch + bool succeeded = true; + if (accelerate) { + succeeded = AccelerateDrawBatchInternal(is_indexed); + } else { + state.draw.vertex_array = sw_vao.handle; + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + shader_program_manager->UseTrivialVertexShader(); + shader_program_manager->UseTrivialGeometryShader(); + shader_program_manager->ApplyTo(state); + state.Apply(); + + std::size_t max_vertices = 3 * (VERTEX_BUFFER_SIZE / (3 * sizeof(HardwareVertex))); + for (std::size_t base_vertex = 0; base_vertex < vertex_batch.size(); + base_vertex += max_vertices) { + const std::size_t vertices = std::min(max_vertices, vertex_batch.size() - base_vertex); + const std::size_t vertex_size = vertices * sizeof(HardwareVertex); + u8* vbo; + GLintptr offset; + std::tie(vbo, offset, std::ignore) = + vertex_buffer.Map(vertex_size, sizeof(HardwareVertex)); + std::memcpy(vbo, vertex_batch.data() + base_vertex, vertex_size); + vertex_buffer.Unmap(vertex_size); + glDrawArrays(GL_TRIANGLES, static_cast(offset / sizeof(HardwareVertex)), + static_cast(vertices)); + } + } + + vertex_batch.clear(); + + // Reset textures in rasterizer state context because the rasterizer cache might delete them + for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) { + state.texture_units[texture_index].texture_2d = 0; + } + state.texture_cube_unit.texture_cube = 0; + if (allow_shadow) { + state.image_shadow_texture_px = 0; + state.image_shadow_texture_nx = 0; + 
state.image_shadow_texture_py = 0; + state.image_shadow_texture_ny = 0; + state.image_shadow_texture_pz = 0; + state.image_shadow_texture_nz = 0; + state.image_shadow_buffer = 0; + } + state.Apply(); + + if (shadow_rendering) { + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT | GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | + GL_TEXTURE_UPDATE_BARRIER_BIT | GL_FRAMEBUFFER_BARRIER_BIT); + } + + // Mark framebuffer surfaces as dirty + Common::Rectangle draw_rect_unscaled{draw_rect.left / res_scale, draw_rect.top / res_scale, + draw_rect.right / res_scale, + draw_rect.bottom / res_scale}; + + if (color_surface != nullptr && write_color_fb) { + auto interval = color_surface->GetSubRectInterval(draw_rect_unscaled); + res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval), + color_surface); + } + if (depth_surface != nullptr && write_depth_fb) { + auto interval = depth_surface->GetSubRectInterval(draw_rect_unscaled); + res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval), + depth_surface); + } + + return succeeded; +} + +void Rasterizer::NotifyPicaRegisterChanged(u32 id) { + const auto& regs = Pica::g_state.regs; + + switch (id) { + // Culling + case PICA_REG_INDEX(rasterizer.cull_mode): + SyncCullMode(); + break; + + // Clipping plane + case PICA_REG_INDEX(rasterizer.clip_enable): + SyncClipEnabled(); + break; + + case PICA_REG_INDEX(rasterizer.clip_coef[0]): + case PICA_REG_INDEX(rasterizer.clip_coef[1]): + case PICA_REG_INDEX(rasterizer.clip_coef[2]): + case PICA_REG_INDEX(rasterizer.clip_coef[3]): + SyncClipCoef(); + break; + + // Depth modifiers + case PICA_REG_INDEX(rasterizer.viewport_depth_range): + SyncDepthScale(); + break; + case PICA_REG_INDEX(rasterizer.viewport_depth_near_plane): + SyncDepthOffset(); + break; + + // Depth buffering + case PICA_REG_INDEX(rasterizer.depthmap_enable): + shader_dirty = true; + break; + + // Blending + case PICA_REG_INDEX(framebuffer.output_merger.alphablend_enable): + if (GLES) { + // With GLES, we need this in the fragment shader to emulate logic operations + shader_dirty = true; + } + SyncBlendEnabled(); + break; + case PICA_REG_INDEX(framebuffer.output_merger.alpha_blending): + SyncBlendFuncs(); + break; + case PICA_REG_INDEX(framebuffer.output_merger.blend_const): + SyncBlendColor(); + break; + + // Shadow texture + case PICA_REG_INDEX(texturing.shadow): + SyncShadowTextureBias(); + break; + + // Fog state + case PICA_REG_INDEX(texturing.fog_color): + SyncFogColor(); + break; + case PICA_REG_INDEX(texturing.fog_lut_data[0]): + case PICA_REG_INDEX(texturing.fog_lut_data[1]): + case PICA_REG_INDEX(texturing.fog_lut_data[2]): + case PICA_REG_INDEX(texturing.fog_lut_data[3]): + case PICA_REG_INDEX(texturing.fog_lut_data[4]): + case PICA_REG_INDEX(texturing.fog_lut_data[5]): + case PICA_REG_INDEX(texturing.fog_lut_data[6]): + case PICA_REG_INDEX(texturing.fog_lut_data[7]): + uniform_block_data.fog_lut_dirty = true; + break; + + // ProcTex state + case PICA_REG_INDEX(texturing.proctex): + case PICA_REG_INDEX(texturing.proctex_lut): + case PICA_REG_INDEX(texturing.proctex_lut_offset): + SyncProcTexBias(); + shader_dirty = true; + break; + + case PICA_REG_INDEX(texturing.proctex_noise_u): + case PICA_REG_INDEX(texturing.proctex_noise_v): + case PICA_REG_INDEX(texturing.proctex_noise_frequency): + SyncProcTexNoise(); + break; + + case PICA_REG_INDEX(texturing.proctex_lut_data[0]): + case PICA_REG_INDEX(texturing.proctex_lut_data[1]): + case PICA_REG_INDEX(texturing.proctex_lut_data[2]): + case 
PICA_REG_INDEX(texturing.proctex_lut_data[3]): + case PICA_REG_INDEX(texturing.proctex_lut_data[4]): + case PICA_REG_INDEX(texturing.proctex_lut_data[5]): + case PICA_REG_INDEX(texturing.proctex_lut_data[6]): + case PICA_REG_INDEX(texturing.proctex_lut_data[7]): + using Pica::TexturingRegs; + switch (regs.texturing.proctex_lut_config.ref_table.Value()) { + case TexturingRegs::ProcTexLutTable::Noise: + uniform_block_data.proctex_noise_lut_dirty = true; + break; + case TexturingRegs::ProcTexLutTable::ColorMap: + uniform_block_data.proctex_color_map_dirty = true; + break; + case TexturingRegs::ProcTexLutTable::AlphaMap: + uniform_block_data.proctex_alpha_map_dirty = true; + break; + case TexturingRegs::ProcTexLutTable::Color: + uniform_block_data.proctex_lut_dirty = true; + break; + case TexturingRegs::ProcTexLutTable::ColorDiff: + uniform_block_data.proctex_diff_lut_dirty = true; + break; + } + break; + + // Alpha test + case PICA_REG_INDEX(framebuffer.output_merger.alpha_test): + SyncAlphaTest(); + shader_dirty = true; + break; + + // Sync GL stencil test + stencil write mask + // (Pica stencil test function register also contains a stencil write mask) + case PICA_REG_INDEX(framebuffer.output_merger.stencil_test.raw_func): + SyncStencilTest(); + SyncStencilWriteMask(); + break; + case PICA_REG_INDEX(framebuffer.output_merger.stencil_test.raw_op): + case PICA_REG_INDEX(framebuffer.framebuffer.depth_format): + SyncStencilTest(); + break; + + // Sync GL depth test + depth and color write mask + // (Pica depth test function register also contains a depth and color write mask) + case PICA_REG_INDEX(framebuffer.output_merger.depth_test_enable): + SyncDepthTest(); + SyncDepthWriteMask(); + SyncColorWriteMask(); + break; + + // Sync GL depth and stencil write mask + // (This is a dedicated combined depth / stencil write-enable register) + case PICA_REG_INDEX(framebuffer.framebuffer.allow_depth_stencil_write): + SyncDepthWriteMask(); + SyncStencilWriteMask(); + break; + + // Sync GL color write mask + // (This is a dedicated color write-enable register) + case PICA_REG_INDEX(framebuffer.framebuffer.allow_color_write): + SyncColorWriteMask(); + break; + + case PICA_REG_INDEX(framebuffer.shadow): + SyncShadowBias(); + break; + + // Scissor test + case PICA_REG_INDEX(rasterizer.scissor_test.mode): + shader_dirty = true; + break; + + // Logic op + case PICA_REG_INDEX(framebuffer.output_merger.logic_op): + if (GLES) { + // With GLES, we need this in the fragment shader to emulate logic operations + shader_dirty = true; + } + SyncLogicOp(); + break; + + case PICA_REG_INDEX(texturing.main_config): + shader_dirty = true; + break; + + // Texture 0 type + case PICA_REG_INDEX(texturing.texture0.type): + shader_dirty = true; + break; + + // TEV stages + // (This also syncs fog_mode and fog_flip which are part of tev_combiner_buffer_input) + case PICA_REG_INDEX(texturing.tev_stage0.color_source1): + case PICA_REG_INDEX(texturing.tev_stage0.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage0.color_op): + case PICA_REG_INDEX(texturing.tev_stage0.color_scale): + case PICA_REG_INDEX(texturing.tev_stage1.color_source1): + case PICA_REG_INDEX(texturing.tev_stage1.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage1.color_op): + case PICA_REG_INDEX(texturing.tev_stage1.color_scale): + case PICA_REG_INDEX(texturing.tev_stage2.color_source1): + case PICA_REG_INDEX(texturing.tev_stage2.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage2.color_op): + case 
PICA_REG_INDEX(texturing.tev_stage2.color_scale): + case PICA_REG_INDEX(texturing.tev_stage3.color_source1): + case PICA_REG_INDEX(texturing.tev_stage3.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage3.color_op): + case PICA_REG_INDEX(texturing.tev_stage3.color_scale): + case PICA_REG_INDEX(texturing.tev_stage4.color_source1): + case PICA_REG_INDEX(texturing.tev_stage4.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage4.color_op): + case PICA_REG_INDEX(texturing.tev_stage4.color_scale): + case PICA_REG_INDEX(texturing.tev_stage5.color_source1): + case PICA_REG_INDEX(texturing.tev_stage5.color_modifier1): + case PICA_REG_INDEX(texturing.tev_stage5.color_op): + case PICA_REG_INDEX(texturing.tev_stage5.color_scale): + case PICA_REG_INDEX(texturing.tev_combiner_buffer_input): + shader_dirty = true; + break; + case PICA_REG_INDEX(texturing.tev_stage0.const_r): + SyncTevConstColor(0, regs.texturing.tev_stage0); + break; + case PICA_REG_INDEX(texturing.tev_stage1.const_r): + SyncTevConstColor(1, regs.texturing.tev_stage1); + break; + case PICA_REG_INDEX(texturing.tev_stage2.const_r): + SyncTevConstColor(2, regs.texturing.tev_stage2); + break; + case PICA_REG_INDEX(texturing.tev_stage3.const_r): + SyncTevConstColor(3, regs.texturing.tev_stage3); + break; + case PICA_REG_INDEX(texturing.tev_stage4.const_r): + SyncTevConstColor(4, regs.texturing.tev_stage4); + break; + case PICA_REG_INDEX(texturing.tev_stage5.const_r): + SyncTevConstColor(5, regs.texturing.tev_stage5); + break; + + // TEV combiner buffer color + case PICA_REG_INDEX(texturing.tev_combiner_buffer_color): + SyncCombinerColor(); + break; + + // Fragment lighting switches + case PICA_REG_INDEX(lighting.disable): + case PICA_REG_INDEX(lighting.max_light_index): + case PICA_REG_INDEX(lighting.config0): + case PICA_REG_INDEX(lighting.config1): + case PICA_REG_INDEX(lighting.abs_lut_input): + case PICA_REG_INDEX(lighting.lut_input): + case PICA_REG_INDEX(lighting.lut_scale): + case PICA_REG_INDEX(lighting.light_enable): + break; + + // Fragment lighting specular 0 color + case PICA_REG_INDEX(lighting.light[0].specular_0): + SyncLightSpecular0(0); + break; + case PICA_REG_INDEX(lighting.light[1].specular_0): + SyncLightSpecular0(1); + break; + case PICA_REG_INDEX(lighting.light[2].specular_0): + SyncLightSpecular0(2); + break; + case PICA_REG_INDEX(lighting.light[3].specular_0): + SyncLightSpecular0(3); + break; + case PICA_REG_INDEX(lighting.light[4].specular_0): + SyncLightSpecular0(4); + break; + case PICA_REG_INDEX(lighting.light[5].specular_0): + SyncLightSpecular0(5); + break; + case PICA_REG_INDEX(lighting.light[6].specular_0): + SyncLightSpecular0(6); + break; + case PICA_REG_INDEX(lighting.light[7].specular_0): + SyncLightSpecular0(7); + break; + + // Fragment lighting specular 1 color + case PICA_REG_INDEX(lighting.light[0].specular_1): + SyncLightSpecular1(0); + break; + case PICA_REG_INDEX(lighting.light[1].specular_1): + SyncLightSpecular1(1); + break; + case PICA_REG_INDEX(lighting.light[2].specular_1): + SyncLightSpecular1(2); + break; + case PICA_REG_INDEX(lighting.light[3].specular_1): + SyncLightSpecular1(3); + break; + case PICA_REG_INDEX(lighting.light[4].specular_1): + SyncLightSpecular1(4); + break; + case PICA_REG_INDEX(lighting.light[5].specular_1): + SyncLightSpecular1(5); + break; + case PICA_REG_INDEX(lighting.light[6].specular_1): + SyncLightSpecular1(6); + break; + case PICA_REG_INDEX(lighting.light[7].specular_1): + SyncLightSpecular1(7); + break; + + // Fragment lighting diffuse color + case 
PICA_REG_INDEX(lighting.light[0].diffuse): + SyncLightDiffuse(0); + break; + case PICA_REG_INDEX(lighting.light[1].diffuse): + SyncLightDiffuse(1); + break; + case PICA_REG_INDEX(lighting.light[2].diffuse): + SyncLightDiffuse(2); + break; + case PICA_REG_INDEX(lighting.light[3].diffuse): + SyncLightDiffuse(3); + break; + case PICA_REG_INDEX(lighting.light[4].diffuse): + SyncLightDiffuse(4); + break; + case PICA_REG_INDEX(lighting.light[5].diffuse): + SyncLightDiffuse(5); + break; + case PICA_REG_INDEX(lighting.light[6].diffuse): + SyncLightDiffuse(6); + break; + case PICA_REG_INDEX(lighting.light[7].diffuse): + SyncLightDiffuse(7); + break; + + // Fragment lighting ambient color + case PICA_REG_INDEX(lighting.light[0].ambient): + SyncLightAmbient(0); + break; + case PICA_REG_INDEX(lighting.light[1].ambient): + SyncLightAmbient(1); + break; + case PICA_REG_INDEX(lighting.light[2].ambient): + SyncLightAmbient(2); + break; + case PICA_REG_INDEX(lighting.light[3].ambient): + SyncLightAmbient(3); + break; + case PICA_REG_INDEX(lighting.light[4].ambient): + SyncLightAmbient(4); + break; + case PICA_REG_INDEX(lighting.light[5].ambient): + SyncLightAmbient(5); + break; + case PICA_REG_INDEX(lighting.light[6].ambient): + SyncLightAmbient(6); + break; + case PICA_REG_INDEX(lighting.light[7].ambient): + SyncLightAmbient(7); + break; + + // Fragment lighting position + case PICA_REG_INDEX(lighting.light[0].x): + case PICA_REG_INDEX(lighting.light[0].z): + SyncLightPosition(0); + break; + case PICA_REG_INDEX(lighting.light[1].x): + case PICA_REG_INDEX(lighting.light[1].z): + SyncLightPosition(1); + break; + case PICA_REG_INDEX(lighting.light[2].x): + case PICA_REG_INDEX(lighting.light[2].z): + SyncLightPosition(2); + break; + case PICA_REG_INDEX(lighting.light[3].x): + case PICA_REG_INDEX(lighting.light[3].z): + SyncLightPosition(3); + break; + case PICA_REG_INDEX(lighting.light[4].x): + case PICA_REG_INDEX(lighting.light[4].z): + SyncLightPosition(4); + break; + case PICA_REG_INDEX(lighting.light[5].x): + case PICA_REG_INDEX(lighting.light[5].z): + SyncLightPosition(5); + break; + case PICA_REG_INDEX(lighting.light[6].x): + case PICA_REG_INDEX(lighting.light[6].z): + SyncLightPosition(6); + break; + case PICA_REG_INDEX(lighting.light[7].x): + case PICA_REG_INDEX(lighting.light[7].z): + SyncLightPosition(7); + break; + + // Fragment spot lighting direction + case PICA_REG_INDEX(lighting.light[0].spot_x): + case PICA_REG_INDEX(lighting.light[0].spot_z): + SyncLightSpotDirection(0); + break; + case PICA_REG_INDEX(lighting.light[1].spot_x): + case PICA_REG_INDEX(lighting.light[1].spot_z): + SyncLightSpotDirection(1); + break; + case PICA_REG_INDEX(lighting.light[2].spot_x): + case PICA_REG_INDEX(lighting.light[2].spot_z): + SyncLightSpotDirection(2); + break; + case PICA_REG_INDEX(lighting.light[3].spot_x): + case PICA_REG_INDEX(lighting.light[3].spot_z): + SyncLightSpotDirection(3); + break; + case PICA_REG_INDEX(lighting.light[4].spot_x): + case PICA_REG_INDEX(lighting.light[4].spot_z): + SyncLightSpotDirection(4); + break; + case PICA_REG_INDEX(lighting.light[5].spot_x): + case PICA_REG_INDEX(lighting.light[5].spot_z): + SyncLightSpotDirection(5); + break; + case PICA_REG_INDEX(lighting.light[6].spot_x): + case PICA_REG_INDEX(lighting.light[6].spot_z): + SyncLightSpotDirection(6); + break; + case PICA_REG_INDEX(lighting.light[7].spot_x): + case PICA_REG_INDEX(lighting.light[7].spot_z): + SyncLightSpotDirection(7); + break; + + // Fragment lighting light source config + case 
PICA_REG_INDEX(lighting.light[0].config): + case PICA_REG_INDEX(lighting.light[1].config): + case PICA_REG_INDEX(lighting.light[2].config): + case PICA_REG_INDEX(lighting.light[3].config): + case PICA_REG_INDEX(lighting.light[4].config): + case PICA_REG_INDEX(lighting.light[5].config): + case PICA_REG_INDEX(lighting.light[6].config): + case PICA_REG_INDEX(lighting.light[7].config): + shader_dirty = true; + break; + + // Fragment lighting distance attenuation bias + case PICA_REG_INDEX(lighting.light[0].dist_atten_bias): + SyncLightDistanceAttenuationBias(0); + break; + case PICA_REG_INDEX(lighting.light[1].dist_atten_bias): + SyncLightDistanceAttenuationBias(1); + break; + case PICA_REG_INDEX(lighting.light[2].dist_atten_bias): + SyncLightDistanceAttenuationBias(2); + break; + case PICA_REG_INDEX(lighting.light[3].dist_atten_bias): + SyncLightDistanceAttenuationBias(3); + break; + case PICA_REG_INDEX(lighting.light[4].dist_atten_bias): + SyncLightDistanceAttenuationBias(4); + break; + case PICA_REG_INDEX(lighting.light[5].dist_atten_bias): + SyncLightDistanceAttenuationBias(5); + break; + case PICA_REG_INDEX(lighting.light[6].dist_atten_bias): + SyncLightDistanceAttenuationBias(6); + break; + case PICA_REG_INDEX(lighting.light[7].dist_atten_bias): + SyncLightDistanceAttenuationBias(7); + break; + + // Fragment lighting distance attenuation scale + case PICA_REG_INDEX(lighting.light[0].dist_atten_scale): + SyncLightDistanceAttenuationScale(0); + break; + case PICA_REG_INDEX(lighting.light[1].dist_atten_scale): + SyncLightDistanceAttenuationScale(1); + break; + case PICA_REG_INDEX(lighting.light[2].dist_atten_scale): + SyncLightDistanceAttenuationScale(2); + break; + case PICA_REG_INDEX(lighting.light[3].dist_atten_scale): + SyncLightDistanceAttenuationScale(3); + break; + case PICA_REG_INDEX(lighting.light[4].dist_atten_scale): + SyncLightDistanceAttenuationScale(4); + break; + case PICA_REG_INDEX(lighting.light[5].dist_atten_scale): + SyncLightDistanceAttenuationScale(5); + break; + case PICA_REG_INDEX(lighting.light[6].dist_atten_scale): + SyncLightDistanceAttenuationScale(6); + break; + case PICA_REG_INDEX(lighting.light[7].dist_atten_scale): + SyncLightDistanceAttenuationScale(7); + break; + + // Fragment lighting global ambient color (emission + ambient * ambient) + case PICA_REG_INDEX(lighting.global_ambient): + SyncGlobalAmbient(); + break; + + // Fragment lighting lookup tables + case PICA_REG_INDEX(lighting.lut_data[0]): + case PICA_REG_INDEX(lighting.lut_data[1]): + case PICA_REG_INDEX(lighting.lut_data[2]): + case PICA_REG_INDEX(lighting.lut_data[3]): + case PICA_REG_INDEX(lighting.lut_data[4]): + case PICA_REG_INDEX(lighting.lut_data[5]): + case PICA_REG_INDEX(lighting.lut_data[6]): + case PICA_REG_INDEX(lighting.lut_data[7]): { + const auto& lut_config = regs.lighting.lut_config; + uniform_block_data.lighting_lut_dirty[lut_config.type] = true; + uniform_block_data.lighting_lut_dirty_any = true; + break; + } + } +} + +void Rasterizer::FlushAll() { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + res_cache.FlushAll(); +} + +void Rasterizer::FlushRegion(PAddr addr, u32 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + res_cache.FlushRegion(addr, size); +} + +void Rasterizer::InvalidateRegion(PAddr addr, u32 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + res_cache.InvalidateRegion(addr, size, nullptr); +} + +void Rasterizer::FlushAndInvalidateRegion(PAddr addr, u32 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + res_cache.FlushRegion(addr, size); + 
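+ // Write back any surfaces cached for this range before the call below drops them from the cache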
res_cache.InvalidateRegion(addr, size, nullptr); +} + +void Rasterizer::ClearAll(bool flush) { + res_cache.ClearAll(flush); +} + +bool Rasterizer::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) { + MICROPROFILE_SCOPE(OpenGL_Blits); + + SurfaceParams src_params; + src_params.addr = config.GetPhysicalInputAddress(); + src_params.width = config.output_width; + src_params.stride = config.input_width; + src_params.height = config.output_height; + src_params.is_tiled = !config.input_linear; + src_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.input_format); + src_params.UpdateParams(); + + SurfaceParams dst_params; + dst_params.addr = config.GetPhysicalOutputAddress(); + dst_params.width = config.scaling != config.NoScale ? config.output_width.Value() / 2 + : config.output_width.Value(); + dst_params.height = config.scaling == config.ScaleXY ? config.output_height.Value() / 2 + : config.output_height.Value(); + dst_params.is_tiled = config.input_linear != config.dont_swizzle; + dst_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.output_format); + dst_params.UpdateParams(); + + Common::Rectangle src_rect; + Surface src_surface; + std::tie(src_surface, src_rect) = + res_cache.GetSurfaceSubRect(src_params, ScaleMatch::Ignore, true); + if (src_surface == nullptr) + return false; + + dst_params.res_scale = src_surface->res_scale; + + Common::Rectangle dst_rect; + Surface dst_surface; + std::tie(dst_surface, dst_rect) = + res_cache.GetSurfaceSubRect(dst_params, ScaleMatch::Upscale, false); + if (dst_surface == nullptr) + return false; + + if (src_surface->is_tiled != dst_surface->is_tiled) + std::swap(src_rect.top, src_rect.bottom); + + if (config.flip_vertically) + std::swap(src_rect.top, src_rect.bottom); + + if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) + return false; + + res_cache.InvalidateRegion(dst_params.addr, dst_params.size, dst_surface); + return true; +} + +bool Rasterizer::AccelerateTextureCopy(const GPU::Regs::DisplayTransferConfig& config) { + u32 copy_size = Common::AlignDown(config.texture_copy.size, 16); + if (copy_size == 0) { + return false; + } + + u32 input_gap = config.texture_copy.input_gap * 16; + u32 input_width = config.texture_copy.input_width * 16; + if (input_width == 0 && input_gap != 0) { + return false; + } + if (input_gap == 0 || input_width >= copy_size) { + input_width = copy_size; + input_gap = 0; + } + if (copy_size % input_width != 0) { + return false; + } + + u32 output_gap = config.texture_copy.output_gap * 16; + u32 output_width = config.texture_copy.output_width * 16; + if (output_width == 0 && output_gap != 0) { + return false; + } + if (output_gap == 0 || output_width >= copy_size) { + output_width = copy_size; + output_gap = 0; + } + if (copy_size % output_width != 0) { + return false; + } + + SurfaceParams src_params; + src_params.addr = config.GetPhysicalInputAddress(); + src_params.stride = input_width + input_gap; // stride in bytes + src_params.width = input_width; // width in bytes + src_params.height = copy_size / input_width; + src_params.size = ((src_params.height - 1) * src_params.stride) + src_params.width; + src_params.end = src_params.addr + src_params.size; + + Common::Rectangle src_rect; + Surface src_surface; + std::tie(src_surface, src_rect) = res_cache.GetTexCopySurface(src_params); + if (src_surface == nullptr) { + return false; + } + + if (output_gap != 0 && + (output_width != src_surface->BytesInPixels(src_rect.GetWidth() / 
src_surface->res_scale) * + (src_surface->is_tiled ? 8 : 1) || + output_gap % src_surface->BytesInPixels(src_surface->is_tiled ? 64 : 1) != 0)) { + return false; + } + + SurfaceParams dst_params = *src_surface; + dst_params.addr = config.GetPhysicalOutputAddress(); + dst_params.width = src_rect.GetWidth() / src_surface->res_scale; + dst_params.stride = dst_params.width + src_surface->PixelsInBytes( + src_surface->is_tiled ? output_gap / 8 : output_gap); + dst_params.height = src_rect.GetHeight() / src_surface->res_scale; + dst_params.res_scale = src_surface->res_scale; + dst_params.UpdateParams(); + + // Since we are going to invalidate the gap if there is one, we will have to load it first + const bool load_gap = output_gap != 0; + Common::Rectangle dst_rect; + Surface dst_surface; + std::tie(dst_surface, dst_rect) = + res_cache.GetSurfaceSubRect(dst_params, ScaleMatch::Upscale, load_gap); + if (dst_surface == nullptr) { + return false; + } + + if (dst_surface->type == SurfaceType::Texture) { + return false; + } + + if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { + return false; + } + + res_cache.InvalidateRegion(dst_params.addr, dst_params.size, dst_surface); + return true; +} + +bool Rasterizer::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) { + Surface dst_surface = res_cache.GetFillSurface(config); + if (dst_surface == nullptr) + return false; + + res_cache.InvalidateRegion(dst_surface->addr, dst_surface->size, dst_surface); + return true; +} + +bool Rasterizer::AccelerateDisplay(const GPU::Regs::FramebufferConfig& config, + PAddr framebuffer_addr, u32 pixel_stride, + ScreenInfo& screen_info) { + if (framebuffer_addr == 0) { + return false; + } + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + + SurfaceParams src_params; + src_params.addr = framebuffer_addr; + src_params.width = std::min(config.width.Value(), pixel_stride); + src_params.height = config.height; + src_params.stride = pixel_stride; + src_params.is_tiled = false; + src_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.color_format); + src_params.UpdateParams(); + + Common::Rectangle src_rect; + Surface src_surface; + std::tie(src_surface, src_rect) = + res_cache.GetSurfaceSubRect(src_params, ScaleMatch::Ignore, true); + + if (src_surface == nullptr) { + return false; + } + + u32 scaled_width = src_surface->GetScaledWidth(); + u32 scaled_height = src_surface->GetScaledHeight(); + + screen_info.display_texcoords = Common::Rectangle( + (float)src_rect.bottom / (float)scaled_height, (float)src_rect.left / (float)scaled_width, + (float)src_rect.top / (float)scaled_height, (float)src_rect.right / (float)scaled_width); + + screen_info.display_texture = src_surface->texture.handle; + + return true; +} + +void Rasterizer::SamplerInfo::Create() { + sampler.Create(); + mag_filter = min_filter = mip_filter = TextureConfig::Linear; + wrap_s = wrap_t = TextureConfig::Repeat; + border_color = 0; + lod_min = lod_max = 0; + lod_bias = 0; + + // default is 1000 and -1000 + // Other attributes have correct defaults + glSamplerParameterf(sampler.handle, GL_TEXTURE_MAX_LOD, static_cast(lod_max)); + glSamplerParameterf(sampler.handle, GL_TEXTURE_MIN_LOD, static_cast(lod_min)); +} + +void Rasterizer::SamplerInfo::SyncWithConfig( + const Pica::TexturingRegs::TextureConfig& config) { + + GLuint s = sampler.handle; + + if (mag_filter != config.mag_filter) { + mag_filter = config.mag_filter; + glSamplerParameteri(s, GL_TEXTURE_MAG_FILTER, PicaToGL::TextureMagFilterMode(mag_filter)); + } 
+ + // TODO(wwylele): remove new_supress_mipmap_for_cube logic once mipmap for cube is implemented + bool new_supress_mipmap_for_cube = + config.type == Pica::TexturingRegs::TextureConfig::TextureCube; + if (min_filter != config.min_filter || mip_filter != config.mip_filter || + supress_mipmap_for_cube != new_supress_mipmap_for_cube) { + min_filter = config.min_filter; + mip_filter = config.mip_filter; + supress_mipmap_for_cube = new_supress_mipmap_for_cube; + if (new_supress_mipmap_for_cube) { + // HACK: use mag filter converter for min filter because they are the same anyway + glSamplerParameteri(s, GL_TEXTURE_MIN_FILTER, + PicaToGL::TextureMagFilterMode(min_filter)); + } else { + glSamplerParameteri(s, GL_TEXTURE_MIN_FILTER, + PicaToGL::TextureMinFilterMode(min_filter, mip_filter)); + } + } + + if (wrap_s != config.wrap_s) { + wrap_s = config.wrap_s; + glSamplerParameteri(s, GL_TEXTURE_WRAP_S, PicaToGL::WrapMode(wrap_s)); + } + if (wrap_t != config.wrap_t) { + wrap_t = config.wrap_t; + glSamplerParameteri(s, GL_TEXTURE_WRAP_T, PicaToGL::WrapMode(wrap_t)); + } + + if (wrap_s == TextureConfig::ClampToBorder || wrap_t == TextureConfig::ClampToBorder) { + if (border_color != config.border_color.raw) { + border_color = config.border_color.raw; + auto gl_color = PicaToGL::ColorRGBA8(border_color); + glSamplerParameterfv(s, GL_TEXTURE_BORDER_COLOR, gl_color.data()); + } + } + + if (lod_min != config.lod.min_level) { + lod_min = config.lod.min_level; + glSamplerParameterf(s, GL_TEXTURE_MIN_LOD, static_cast(lod_min)); + } + + if (lod_max != config.lod.max_level) { + lod_max = config.lod.max_level; + glSamplerParameterf(s, GL_TEXTURE_MAX_LOD, static_cast(lod_max)); + } + + if (!GLES && lod_bias != config.lod.bias) { + lod_bias = config.lod.bias; + glSamplerParameterf(s, GL_TEXTURE_LOD_BIAS, lod_bias / 256.0f); + } +} + +void Rasterizer::SetShader() { + shader_program_manager->UseFragmentShader(Pica::g_state.regs); +} + +void Rasterizer::SyncClipEnabled() { + state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0; +} + +void Rasterizer::SyncClipCoef() { + const auto raw_clip_coef = Pica::g_state.regs.rasterizer.GetClipCoef(); + const GLvec4 new_clip_coef = {raw_clip_coef.x.ToFloat32(), raw_clip_coef.y.ToFloat32(), + raw_clip_coef.z.ToFloat32(), raw_clip_coef.w.ToFloat32()}; + if (new_clip_coef != uniform_block_data.data.clip_coef) { + uniform_block_data.data.clip_coef = new_clip_coef; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncCullMode() { + const auto& regs = Pica::g_state.regs; + + switch (regs.rasterizer.cull_mode) { + case Pica::RasterizerRegs::CullMode::KeepAll: + state.cull.enabled = false; + break; + + case Pica::RasterizerRegs::CullMode::KeepClockWise: + state.cull.enabled = true; + state.cull.front_face = GL_CW; + break; + + case Pica::RasterizerRegs::CullMode::KeepCounterClockWise: + state.cull.enabled = true; + state.cull.front_face = GL_CCW; + break; + + default: + LOG_CRITICAL(Render_OpenGL, "Unknown cull mode {}", + static_cast(regs.rasterizer.cull_mode.Value())); + UNIMPLEMENTED(); + break; + } +} + +void Rasterizer::SyncDepthScale() { + float depth_scale = + Pica::float24::FromRaw(Pica::g_state.regs.rasterizer.viewport_depth_range).ToFloat32(); + if (depth_scale != uniform_block_data.data.depth_scale) { + uniform_block_data.data.depth_scale = depth_scale; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncDepthOffset() { + float depth_offset = + 
Pica::float24::FromRaw(Pica::g_state.regs.rasterizer.viewport_depth_near_plane).ToFloat32(); + if (depth_offset != uniform_block_data.data.depth_offset) { + uniform_block_data.data.depth_offset = depth_offset; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncBlendEnabled() { + state.blend.enabled = (Pica::g_state.regs.framebuffer.output_merger.alphablend_enable == 1); +} + +void Rasterizer::SyncBlendFuncs() { + const auto& regs = Pica::g_state.regs; + state.blend.rgb_equation = + PicaToGL::BlendEquation(regs.framebuffer.output_merger.alpha_blending.blend_equation_rgb); + state.blend.a_equation = + PicaToGL::BlendEquation(regs.framebuffer.output_merger.alpha_blending.blend_equation_a); + state.blend.src_rgb_func = + PicaToGL::BlendFunc(regs.framebuffer.output_merger.alpha_blending.factor_source_rgb); + state.blend.dst_rgb_func = + PicaToGL::BlendFunc(regs.framebuffer.output_merger.alpha_blending.factor_dest_rgb); + state.blend.src_a_func = + PicaToGL::BlendFunc(regs.framebuffer.output_merger.alpha_blending.factor_source_a); + state.blend.dst_a_func = + PicaToGL::BlendFunc(regs.framebuffer.output_merger.alpha_blending.factor_dest_a); +} + +void Rasterizer::SyncBlendColor() { + auto blend_color = + PicaToGL::ColorRGBA8(Pica::g_state.regs.framebuffer.output_merger.blend_const.raw); + state.blend.color.red = blend_color[0]; + state.blend.color.green = blend_color[1]; + state.blend.color.blue = blend_color[2]; + state.blend.color.alpha = blend_color[3]; +} + +void Rasterizer::SyncFogColor() { + const auto& regs = Pica::g_state.regs; + uniform_block_data.data.fog_color = { + regs.texturing.fog_color.r.Value() / 255.0f, + regs.texturing.fog_color.g.Value() / 255.0f, + regs.texturing.fog_color.b.Value() / 255.0f, + }; + uniform_block_data.dirty = true; +} + +void Rasterizer::SyncProcTexNoise() { + const auto& regs = Pica::g_state.regs.texturing; + uniform_block_data.data.proctex_noise_f = { + Pica::float16::FromRaw(regs.proctex_noise_frequency.u).ToFloat32(), + Pica::float16::FromRaw(regs.proctex_noise_frequency.v).ToFloat32(), + }; + uniform_block_data.data.proctex_noise_a = { + regs.proctex_noise_u.amplitude / 4095.0f, + regs.proctex_noise_v.amplitude / 4095.0f, + }; + uniform_block_data.data.proctex_noise_p = { + Pica::float16::FromRaw(regs.proctex_noise_u.phase).ToFloat32(), + Pica::float16::FromRaw(regs.proctex_noise_v.phase).ToFloat32(), + }; + + uniform_block_data.dirty = true; +} + +void Rasterizer::SyncProcTexBias() { + const auto& regs = Pica::g_state.regs.texturing; + uniform_block_data.data.proctex_bias = + Pica::float16::FromRaw(regs.proctex.bias_low | (regs.proctex_lut.bias_high << 8)) + .ToFloat32(); + + uniform_block_data.dirty = true; +} + +void Rasterizer::SyncAlphaTest() { + const auto& regs = Pica::g_state.regs; + if (regs.framebuffer.output_merger.alpha_test.ref != uniform_block_data.data.alphatest_ref) { + uniform_block_data.data.alphatest_ref = regs.framebuffer.output_merger.alpha_test.ref; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLogicOp() { + const auto& regs = Pica::g_state.regs; + state.logic_op = PicaToGL::LogicOp(regs.framebuffer.output_merger.logic_op); + + if (GLES) { + if (!regs.framebuffer.output_merger.alphablend_enable) { + if (regs.framebuffer.output_merger.logic_op == Pica::FramebufferRegs::LogicOp::NoOp) { + // Color output is disabled by logic operation. We use color write mask to skip + // color but allow depth write. 
+ state.color_mask = {}; + } + } + } +} + +void Rasterizer::SyncColorWriteMask() { + const auto& regs = Pica::g_state.regs; + if (GLES) { + if (!regs.framebuffer.output_merger.alphablend_enable) { + if (regs.framebuffer.output_merger.logic_op == Pica::FramebufferRegs::LogicOp::NoOp) { + // Color output is disabled by logic operation. We use color write mask to skip + // color but allow depth write. Return early to avoid overwriting this. + return; + } + } + } + + auto IsColorWriteEnabled = [&](u32 value) { + return (regs.framebuffer.framebuffer.allow_color_write != 0 && value != 0) ? GL_TRUE + : GL_FALSE; + }; + + state.color_mask.red_enabled = IsColorWriteEnabled(regs.framebuffer.output_merger.red_enable); + state.color_mask.green_enabled = + IsColorWriteEnabled(regs.framebuffer.output_merger.green_enable); + state.color_mask.blue_enabled = IsColorWriteEnabled(regs.framebuffer.output_merger.blue_enable); + state.color_mask.alpha_enabled = + IsColorWriteEnabled(regs.framebuffer.output_merger.alpha_enable); +} + +void Rasterizer::SyncStencilWriteMask() { + const auto& regs = Pica::g_state.regs; + state.stencil.write_mask = + (regs.framebuffer.framebuffer.allow_depth_stencil_write != 0) + ? static_cast(regs.framebuffer.output_merger.stencil_test.write_mask) + : 0; +} + +void Rasterizer::SyncDepthWriteMask() { + const auto& regs = Pica::g_state.regs; + state.depth.write_mask = (regs.framebuffer.framebuffer.allow_depth_stencil_write != 0 && + regs.framebuffer.output_merger.depth_write_enable) + ? GL_TRUE + : GL_FALSE; +} + +void Rasterizer::SyncStencilTest() { + const auto& regs = Pica::g_state.regs; + state.stencil.test_enabled = + regs.framebuffer.output_merger.stencil_test.enable && + regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8; + state.stencil.test_func = + PicaToGL::CompareFunc(regs.framebuffer.output_merger.stencil_test.func); + state.stencil.test_ref = regs.framebuffer.output_merger.stencil_test.reference_value; + state.stencil.test_mask = regs.framebuffer.output_merger.stencil_test.input_mask; + state.stencil.action_stencil_fail = + PicaToGL::StencilOp(regs.framebuffer.output_merger.stencil_test.action_stencil_fail); + state.stencil.action_depth_fail = + PicaToGL::StencilOp(regs.framebuffer.output_merger.stencil_test.action_depth_fail); + state.stencil.action_depth_pass = + PicaToGL::StencilOp(regs.framebuffer.output_merger.stencil_test.action_depth_pass); +} + +void Rasterizer::SyncDepthTest() { + const auto& regs = Pica::g_state.regs; + state.depth.test_enabled = regs.framebuffer.output_merger.depth_test_enable == 1 || + regs.framebuffer.output_merger.depth_write_enable == 1; + state.depth.test_func = + regs.framebuffer.output_merger.depth_test_enable == 1 + ? 
PicaToGL::CompareFunc(regs.framebuffer.output_merger.depth_test_func) + : GL_ALWAYS; +} + +void Rasterizer::SyncCombinerColor() { + auto combiner_color = + PicaToGL::ColorRGBA8(Pica::g_state.regs.texturing.tev_combiner_buffer_color.raw); + if (combiner_color != uniform_block_data.data.tev_combiner_buffer_color) { + uniform_block_data.data.tev_combiner_buffer_color = combiner_color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncTevConstColor(std::size_t stage_index, + const Pica::TexturingRegs::TevStageConfig& tev_stage) { + const auto const_color = PicaToGL::ColorRGBA8(tev_stage.const_color); + + if (const_color == uniform_block_data.data.const_color[stage_index]) { + return; + } + + uniform_block_data.data.const_color[stage_index] = const_color; + uniform_block_data.dirty = true; +} + +void Rasterizer::SyncGlobalAmbient() { + auto color = PicaToGL::LightColor(Pica::g_state.regs.lighting.global_ambient); + if (color != uniform_block_data.data.lighting_global_ambient) { + uniform_block_data.data.lighting_global_ambient = color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightSpecular0(int light_index) { + auto color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].specular_0); + if (color != uniform_block_data.data.light_src[light_index].specular_0) { + uniform_block_data.data.light_src[light_index].specular_0 = color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightSpecular1(int light_index) { + auto color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].specular_1); + if (color != uniform_block_data.data.light_src[light_index].specular_1) { + uniform_block_data.data.light_src[light_index].specular_1 = color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightDiffuse(int light_index) { + auto color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].diffuse); + if (color != uniform_block_data.data.light_src[light_index].diffuse) { + uniform_block_data.data.light_src[light_index].diffuse = color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightAmbient(int light_index) { + auto color = PicaToGL::LightColor(Pica::g_state.regs.lighting.light[light_index].ambient); + if (color != uniform_block_data.data.light_src[light_index].ambient) { + uniform_block_data.data.light_src[light_index].ambient = color; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightPosition(int light_index) { + GLvec3 position = { + Pica::float16::FromRaw(Pica::g_state.regs.lighting.light[light_index].x).ToFloat32(), + Pica::float16::FromRaw(Pica::g_state.regs.lighting.light[light_index].y).ToFloat32(), + Pica::float16::FromRaw(Pica::g_state.regs.lighting.light[light_index].z).ToFloat32()}; + + if (position != uniform_block_data.data.light_src[light_index].position) { + uniform_block_data.data.light_src[light_index].position = position; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightSpotDirection(int light_index) { + const auto& light = Pica::g_state.regs.lighting.light[light_index]; + GLvec3 spot_direction = {light.spot_x / 2047.0f, light.spot_y / 2047.0f, + light.spot_z / 2047.0f}; + + if (spot_direction != uniform_block_data.data.light_src[light_index].spot_direction) { + uniform_block_data.data.light_src[light_index].spot_direction = spot_direction; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightDistanceAttenuationBias(int light_index) { + GLfloat dist_atten_bias = + 
Pica::float20::FromRaw(Pica::g_state.regs.lighting.light[light_index].dist_atten_bias) + .ToFloat32(); + + if (dist_atten_bias != uniform_block_data.data.light_src[light_index].dist_atten_bias) { + uniform_block_data.data.light_src[light_index].dist_atten_bias = dist_atten_bias; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncLightDistanceAttenuationScale(int light_index) { + GLfloat dist_atten_scale = + Pica::float20::FromRaw(Pica::g_state.regs.lighting.light[light_index].dist_atten_scale) + .ToFloat32(); + + if (dist_atten_scale != uniform_block_data.data.light_src[light_index].dist_atten_scale) { + uniform_block_data.data.light_src[light_index].dist_atten_scale = dist_atten_scale; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncShadowBias() { + const auto& shadow = Pica::g_state.regs.framebuffer.shadow; + GLfloat constant = Pica::float16::FromRaw(shadow.constant).ToFloat32(); + GLfloat linear = Pica::float16::FromRaw(shadow.linear).ToFloat32(); + + if (constant != uniform_block_data.data.shadow_bias_constant || + linear != uniform_block_data.data.shadow_bias_linear) { + uniform_block_data.data.shadow_bias_constant = constant; + uniform_block_data.data.shadow_bias_linear = linear; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncShadowTextureBias() { + GLint bias = Pica::g_state.regs.texturing.shadow.bias << 1; + if (bias != uniform_block_data.data.shadow_texture_bias) { + uniform_block_data.data.shadow_texture_bias = bias; + uniform_block_data.dirty = true; + } +} + +void Rasterizer::SyncAndUploadLUTsLF() { + constexpr std::size_t max_size = + sizeof(GLvec2) * 256 * Pica::LightingRegs::NumLightingSampler + sizeof(GLvec2) * 128; // fog + + if (!uniform_block_data.lighting_lut_dirty_any && !uniform_block_data.fog_lut_dirty) { + return; + } + + u8* buffer; + GLintptr offset; + bool invalidate; + std::size_t bytes_used = 0; + glBindBuffer(GL_TEXTURE_BUFFER, texture_lf_buffer.GetHandle()); + std::tie(buffer, offset, invalidate) = texture_lf_buffer.Map(max_size, sizeof(GLvec4)); + + // Sync the lighting luts + if (uniform_block_data.lighting_lut_dirty_any || invalidate) { + for (unsigned index = 0; index < uniform_block_data.lighting_lut_dirty.size(); index++) { + if (uniform_block_data.lighting_lut_dirty[index] || invalidate) { + std::array new_data; + const auto& source_lut = Pica::g_state.lighting.luts[index]; + std::transform(source_lut.begin(), source_lut.end(), new_data.begin(), + [](const auto& entry) { + return GLvec2{entry.ToFloat(), entry.DiffToFloat()}; + }); + + if (new_data != lighting_lut_data[index] || invalidate) { + lighting_lut_data[index] = new_data; + std::memcpy(buffer + bytes_used, new_data.data(), + new_data.size() * sizeof(GLvec2)); + uniform_block_data.data.lighting_lut_offset[index / 4][index % 4] = + static_cast((offset + bytes_used) / sizeof(GLvec2)); + uniform_block_data.dirty = true; + bytes_used += new_data.size() * sizeof(GLvec2); + } + uniform_block_data.lighting_lut_dirty[index] = false; + } + } + uniform_block_data.lighting_lut_dirty_any = false; + } + + // Sync the fog lut + if (uniform_block_data.fog_lut_dirty || invalidate) { + std::array new_data; + + std::transform(Pica::g_state.fog.lut.begin(), Pica::g_state.fog.lut.end(), new_data.begin(), + [](const auto& entry) { + return GLvec2{entry.ToFloat(), entry.DiffToFloat()}; + }); + + if (new_data != fog_lut_data || invalidate) { + fog_lut_data = new_data; + std::memcpy(buffer + bytes_used, new_data.data(), new_data.size() * sizeof(GLvec2)); + 
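+ // Record the element offset of the fog LUT inside the mapped texel buffer so the fragment shader can index it via fog_lut_offset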
uniform_block_data.data.fog_lut_offset = + static_cast<GLint>((offset + bytes_used) / sizeof(GLvec2)); + uniform_block_data.dirty = true; + bytes_used += new_data.size() * sizeof(GLvec2); + } + uniform_block_data.fog_lut_dirty = false; + } + + texture_lf_buffer.Unmap(bytes_used); +} + +void Rasterizer::SyncAndUploadLUTs() { + constexpr std::size_t max_size = sizeof(GLvec2) * 128 * 3 + // proctex: noise + color + alpha + sizeof(GLvec4) * 256 + // proctex + sizeof(GLvec4) * 256; // proctex diff + + if (!uniform_block_data.proctex_noise_lut_dirty && + !uniform_block_data.proctex_color_map_dirty && + !uniform_block_data.proctex_alpha_map_dirty && !uniform_block_data.proctex_lut_dirty && + !uniform_block_data.proctex_diff_lut_dirty) { + return; + } + + u8* buffer; + GLintptr offset; + bool invalidate; + std::size_t bytes_used = 0; + glBindBuffer(GL_TEXTURE_BUFFER, texture_buffer.GetHandle()); + std::tie(buffer, offset, invalidate) = texture_buffer.Map(max_size, sizeof(GLvec4)); + + // helper function for SyncProcTexNoiseLUT/ColorMap/AlphaMap + auto SyncProcTexValueLUT = [this, buffer, offset, invalidate, &bytes_used]( + const std::array<Pica::State::ProcTex::ValueEntry, 128>& lut, + std::array<GLvec2, 128>& lut_data, GLint& lut_offset) { + std::array<GLvec2, 128> new_data; + std::transform(lut.begin(), lut.end(), new_data.begin(), [](const auto& entry) { + return GLvec2{entry.ToFloat(), entry.DiffToFloat()}; + }); + + if (new_data != lut_data || invalidate) { + lut_data = new_data; + std::memcpy(buffer + bytes_used, new_data.data(), new_data.size() * sizeof(GLvec2)); + lut_offset = static_cast<GLint>((offset + bytes_used) / sizeof(GLvec2)); + uniform_block_data.dirty = true; + bytes_used += new_data.size() * sizeof(GLvec2); + } + }; + + // Sync the proctex noise lut + if (uniform_block_data.proctex_noise_lut_dirty || invalidate) { + SyncProcTexValueLUT(Pica::g_state.proctex.noise_table, proctex_noise_lut_data, + uniform_block_data.data.proctex_noise_lut_offset); + uniform_block_data.proctex_noise_lut_dirty = false; + } + + // Sync the proctex color map + if (uniform_block_data.proctex_color_map_dirty || invalidate) { + SyncProcTexValueLUT(Pica::g_state.proctex.color_map_table, proctex_color_map_data, + uniform_block_data.data.proctex_color_map_offset); + uniform_block_data.proctex_color_map_dirty = false; + } + + // Sync the proctex alpha map + if (uniform_block_data.proctex_alpha_map_dirty || invalidate) { + SyncProcTexValueLUT(Pica::g_state.proctex.alpha_map_table, proctex_alpha_map_data, + uniform_block_data.data.proctex_alpha_map_offset); + uniform_block_data.proctex_alpha_map_dirty = false; + } + + // Sync the proctex lut + if (uniform_block_data.proctex_lut_dirty || invalidate) { + std::array<GLvec4, 256> new_data; + + std::transform(Pica::g_state.proctex.color_table.begin(), + Pica::g_state.proctex.color_table.end(), new_data.begin(), + [](const auto& entry) { + auto rgba = entry.ToVector() / 255.0f; + return GLvec4{rgba.r(), rgba.g(), rgba.b(), rgba.a()}; + }); + + if (new_data != proctex_lut_data || invalidate) { + proctex_lut_data = new_data; + std::memcpy(buffer + bytes_used, new_data.data(), new_data.size() * sizeof(GLvec4)); + uniform_block_data.data.proctex_lut_offset = + static_cast<GLint>((offset + bytes_used) / sizeof(GLvec4)); + uniform_block_data.dirty = true; + bytes_used += new_data.size() * sizeof(GLvec4); + } + uniform_block_data.proctex_lut_dirty = false; + } + + // Sync the proctex difference lut + if (uniform_block_data.proctex_diff_lut_dirty || invalidate) { + std::array<GLvec4, 256> new_data; + + std::transform(Pica::g_state.proctex.color_diff_table.begin(), +
Pica::g_state.proctex.color_diff_table.end(), new_data.begin(), + [](const auto& entry) { + auto rgba = entry.ToVector() / 255.0f; + return GLvec4{rgba.r(), rgba.g(), rgba.b(), rgba.a()}; + }); + + if (new_data != proctex_diff_lut_data || invalidate) { + proctex_diff_lut_data = new_data; + std::memcpy(buffer + bytes_used, new_data.data(), new_data.size() * sizeof(GLvec4)); + uniform_block_data.data.proctex_diff_lut_offset = + static_cast((offset + bytes_used) / sizeof(GLvec4)); + uniform_block_data.dirty = true; + bytes_used += new_data.size() * sizeof(GLvec4); + } + uniform_block_data.proctex_diff_lut_dirty = false; + } + + texture_buffer.Unmap(bytes_used); +} + +void Rasterizer::UploadUniforms(bool accelerate_draw) { + // glBindBufferRange below also changes the generic buffer binding point, so we sync the state + // first + state.draw.uniform_buffer = uniform_buffer.GetHandle(); + state.Apply(); + + bool sync_vs = accelerate_draw; + bool sync_fs = uniform_block_data.dirty; + + if (!sync_vs && !sync_fs) + return; + + std::size_t uniform_size = uniform_size_aligned_vs + uniform_size_aligned_fs; + std::size_t used_bytes = 0; + u8* uniforms; + GLintptr offset; + bool invalidate; + std::tie(uniforms, offset, invalidate) = + uniform_buffer.Map(uniform_size, uniform_buffer_alignment); + + if (sync_vs) { + VSUniformData vs_uniforms; + vs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.vs, Pica::g_state.vs); + std::memcpy(uniforms + used_bytes, &vs_uniforms, sizeof(vs_uniforms)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast(UniformBindings::VS), + uniform_buffer.GetHandle(), offset + used_bytes, sizeof(VSUniformData)); + used_bytes += uniform_size_aligned_vs; + } + + if (sync_fs || invalidate) { + std::memcpy(uniforms + used_bytes, &uniform_block_data.data, sizeof(UniformData)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast(UniformBindings::Common), + uniform_buffer.GetHandle(), offset + used_bytes, sizeof(UniformData)); + uniform_block_data.dirty = false; + used_bytes += uniform_size_aligned_fs; + } + + uniform_buffer.Unmap(used_bytes); +} + +} // namespace OpenGL diff --git a/src/video_core/common/rasterizer.h b/src/video_core/common/rasterizer.h new file mode 100644 index 000000000..9296366b9 --- /dev/null +++ b/src/video_core/common/rasterizer.h @@ -0,0 +1,244 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
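+// +// Backend-agnostic hardware rasterizer: tracks PICA register state, keeps the shared uniform block and +// LUT data in sync, and issues draws through the active video backend.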
+ +#pragma once + +#include +#include +#include +#include "common/vector_math.h" +#include "video_core/regs_lighting.h" +#include "video_core/regs_texturing.h" +#include "video_core/common/rasterizer_cache.h" +#include "video_core/common/pipeline.h" +#include "video_core/shader/shader.h" + +namespace Frontend { +class EmuWindow; +} + +namespace VideoCore { +class ShaderProgramManager; + +/// Structure that the hardware rendered vertices are composed of +struct HardwareVertex { + HardwareVertex() = default; + HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion); + + // Returns the pipeline vertex layout of the vertex + constexpr static VertexLayout GetVertexLayout(); + + Common::Vec4f position; + Common::Vec4f color; + Common::Vec2f tex_coord0; + Common::Vec2f tex_coord1; + Common::Vec2f tex_coord2; + float tex_coord0_w; + Common::Vec4f normquat; + Common::Vec3f view; +}; + +class BackendBase; + +class Rasterizer { +public: + explicit Rasterizer(Frontend::EmuWindow& emu_window, std::unique_ptr& backend); + ~Rasterizer(); + + //void LoadDiskResources(const std::atomic_bool& stop_loading, + // const VideoCore::DiskResourceLoadCallback& callback); + + void AddTriangle(const Pica::Shader::OutputVertex& v0, const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2); + void DrawTriangles(); + void NotifyPicaRegisterChanged(u32 id); + void FlushAll(); + void FlushRegion(PAddr addr, u32 size); + void InvalidateRegion(PAddr addr, u32 size); + void FlushAndInvalidateRegion(PAddr addr, u32 size); + void ClearAll(bool flush); + bool AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config); + bool AccelerateTextureCopy(const GPU::Regs::DisplayTransferConfig& config); + bool AccelerateFill(const GPU::Regs::MemoryFillConfig& config); + bool AccelerateDisplay(const GPU::Regs::FramebufferConfig& config, PAddr framebuffer_addr, + u32 pixel_stride, ScreenInfo& screen_info); + bool AccelerateDrawBatch(bool is_indexed); + + /// Syncs entire status to match PICA registers + void SyncEntireState(); + +private: + /// Syncs the clip enabled status to match the PICA register + void SyncClipEnabled(); + + /// Syncs the clip coefficients to match the PICA register + void SyncClipCoef(); + + /// Sets the OpenGL shader in accordance with the current PICA register state + void SetShader(); + + /// Syncs the cull mode to match the PICA register + void SyncCullMode(); + + /// Syncs the depth scale to match the PICA register + void SyncDepthScale(); + + /// Syncs the depth offset to match the PICA register + void SyncDepthOffset(); + + /// Syncs the blend enabled status to match the PICA register + void SyncBlendEnabled(); + + /// Syncs the blend functions to match the PICA register + void SyncBlendFuncs(); + + /// Syncs the blend color to match the PICA register + void SyncBlendColor(); + + /// Syncs the fog states to match the PICA register + void SyncFogColor(); + + /// Sync the procedural texture noise configuration to match the PICA register + void SyncProcTexNoise(); + + /// Sync the procedural texture bias configuration to match the PICA register + void SyncProcTexBias(); + + /// Syncs the alpha test states to match the PICA register + void SyncAlphaTest(); + + /// Syncs the logic op states to match the PICA register + void SyncLogicOp(); + + /// Syncs the color write mask to match the PICA register state + void SyncColorWriteMask(); + + /// Syncs the stencil write mask to match the PICA register state + void SyncStencilWriteMask(); + + /// Syncs the depth 
write mask to match the PICA register state + void SyncDepthWriteMask(); + + /// Syncs the stencil test states to match the PICA register + void SyncStencilTest(); + + /// Syncs the depth test states to match the PICA register + void SyncDepthTest(); + + /// Syncs the TEV combiner color buffer to match the PICA register + void SyncCombinerColor(); + + /// Syncs the TEV constant color to match the PICA register + void SyncTevConstColor(std::size_t tev_index, const Pica::TexturingRegs::TevStageConfig& tev_stage); + + /// Syncs the lighting global ambient color to match the PICA register + void SyncGlobalAmbient(); + + /// Syncs the specified light's specular 0 color to match the PICA register + void SyncLightSpecular0(int light_index); + + /// Syncs the specified light's specular 1 color to match the PICA register + void SyncLightSpecular1(int light_index); + + /// Syncs the specified light's diffuse color to match the PICA register + void SyncLightDiffuse(int light_index); + + /// Syncs the specified light's ambient color to match the PICA register + void SyncLightAmbient(int light_index); + + /// Syncs the specified light's position to match the PICA register + void SyncLightPosition(int light_index); + + /// Syncs the specified spot light direction to match the PICA register + void SyncLightSpotDirection(int light_index); + + /// Syncs the specified light's distance attenuation bias to match the PICA register + void SyncLightDistanceAttenuationBias(int light_index); + + /// Syncs the specified light's distance attenuation scale to match the PICA register + void SyncLightDistanceAttenuationScale(int light_index); + + /// Syncs the shadow rendering bias to match the PICA register + void SyncShadowBias(); + + /// Syncs the shadow texture bias to match the PICA register + void SyncShadowTextureBias(); + + /// Syncs and uploads the lighting, fog and proctex LUTs + void SyncAndUploadLUTs(); + void SyncAndUploadLUTsLF(); + + /// Upload the uniform blocks to the uniform buffer object + void UploadUniforms(bool accelerate_draw); + + /// Generic draw function for DrawTriangles and AccelerateDrawBatch + bool Draw(bool accelerate, bool is_indexed); + + /// Internal implementation for AccelerateDrawBatch + bool AccelerateDrawBatchInternal(bool is_indexed); + + struct VertexArrayInfo { + u32 vs_input_index_min; + u32 vs_input_index_max; + u32 vs_input_size; + }; + + /// Retrieve the range and the size of the input vertex + VertexArrayInfo AnalyzeVertexArray(bool is_indexed); + + /// Setup vertex array for AccelerateDrawBatch + void SetupVertexArray(u8* array_ptr, u32 buffer_offset, u32 vs_input_index_min, u32 vs_input_index_max); + +private: + std::unique_ptr<BackendBase>& backend; + RasterizerCache res_cache; + std::vector<HardwareVertex> vertex_batch; + bool shader_dirty = true; + + struct { + UniformData data; + std::array<bool, Pica::LightingRegs::NumLightingSampler> lighting_lut_dirty{true}; + bool lighting_lut_dirty_any = true; + bool fog_lut_dirty = true; + bool proctex_noise_lut_dirty = true; + bool proctex_color_map_dirty = true; + bool proctex_alpha_map_dirty = true; + bool proctex_lut_dirty = true; + bool proctex_diff_lut_dirty = true; + bool dirty = true; + } uniform_block_data{}; + + std::unique_ptr<ShaderProgramManager> shader_program_manager; + + // Clear texture for placeholder purposes + TextureHandle clear_texture; + + // Uniform alignment + std::array<bool, 16> hw_vao_enabled_attributes{}; + std::size_t uniform_size_aligned_vs = 0; + std::size_t uniform_size_aligned_fs = 0; + + // Rasterizer used buffers (vertex, index, uniform, lut) + BufferHandle vertex_buffer, index_buffer, uniform_buffer; + BufferHandle texel_buffer_lut_lf, texel_buffer_lut; + + // Pica lighting data + std::array<std::array<GLvec2, 256>, Pica::LightingRegs::NumLightingSampler> lighting_lut_data{}; + std::array<GLvec2, 128> fog_lut_data{}; + std::array<GLvec2, 128> proctex_noise_lut_data{}; + std::array<GLvec2, 128> proctex_color_map_data{}; + std::array<GLvec2, 128> proctex_alpha_map_data{}; + std::array<GLvec4, 256> proctex_lut_data{}; + std::array<GLvec4, 256> proctex_diff_lut_data{}; + + // Texture unit sampler cache + SamplerInfo texture_cube_sampler; + std::array<SamplerInfo, 3> texture_samplers; + std::unordered_map sampler_cache; + + // TODO: Remove this + bool allow_shadow = false; +}; + +} // namespace VideoCore diff --git a/src/video_core/common/rasterizer_cache.cpp b/src/video_core/common/rasterizer_cache.cpp index 8764c8707..22ec43f58 100644 --- a/src/video_core/common/rasterizer_cache.cpp +++ b/src/video_core/common/rasterizer_cache.cpp @@ -1671,8 +1671,9 @@ void RasterizerCache::UpdatePagesCachedCount(PAddr addr, u32 size, int delta) { // Interval maps will erase segments if count reaches 0, so if delta is negative we have to // subtract after iterating const auto pages_interval = PageMap::interval_type::right_open(page_start, page_end); - if (delta > 0) + if (delta > 0) { cached_pages.add({pages_interval, delta}); + } for (const auto& pair : RangeFromInterval(cached_pages, pages_interval)) { const auto interval = pair.first & pages_interval; diff --git a/src/video_core/common/shader.h b/src/video_core/common/shader.h index 6020322b9..7742f96fd 100644 --- a/src/video_core/common/shader.h +++ b/src/video_core/common/shader.h @@ -36,11 +36,6 @@ public: /// Compiles the shader source code virtual bool Compile(ShaderOptimization level) = 0; - /// Returns the API specific shader bytecode - std::string_view GetSource() const { - return source; - } - /// Returns the name given the shader module std::string_view GetName() const { return name; @@ -54,7 +49,7 @@ public: protected: std::string_view name = "None"; ShaderStage stage = ShaderStage::Undefined; - std::string source; + std::string source = ""; } using ShaderHandle = IntrusivePtr; diff --git a/src/video_core/common/shader_disk_cache.cpp b/src/video_core/common/shader_disk_cache.cpp new file mode 100644 index 000000000..13a93723e --- /dev/null +++ b/src/video_core/common/shader_disk_cache.cpp @@ -0,0 +1,574 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
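+// +// On-disk shader cache: the transferable file stores raw PICA shader configurations, while the +// precompiled file stores decompiled shader sources and program binary dumps keyed by the same +// unique identifiers.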
+ +#include +#include +#include "common/assert.h" +#include "common/common_paths.h" +#include "common/file_util.h" +#include "common/logging/log.h" +#include "common/scm_rev.h" +#include "common/zstd_compression.h" +#include "core/core.h" +#include "core/hle/kernel/process.h" +#include "core/settings.h" +#include "video_core/common/shader_disk_cache.h" + +namespace VideoCore { + +using ShaderCacheVersionHash = std::array<u8, 64>; + +enum class TransferableEntryKind : u32 { + Raw = 0, +}; + +enum class PrecompiledEntryKind : u32 { + Decompiled = 0, + Dump = 1, +}; + +constexpr u32 NativeVersion = 1; + +ShaderCacheVersionHash GetShaderCacheVersionHash() { + ShaderCacheVersionHash hash{}; + const std::size_t length = std::min(std::strlen(Common::g_shader_cache_version), hash.size()); + std::memcpy(hash.data(), Common::g_shader_cache_version, length); + return hash; +} + +bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) { + if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) || + file.ReadBytes(&program_type, sizeof(u32)) != sizeof(u32)) { + return false; + } + + u64 reg_array_len{}; + if (file.ReadBytes(&reg_array_len, sizeof(u64)) != sizeof(u64)) { + return false; + } + + if (file.ReadArray(config.reg_array.data(), reg_array_len) != reg_array_len) { + return false; + } + + // Read in type specific configuration + if (program_type == ProgramType::VertexShader) { + u64 code_len{}; + if (file.ReadBytes(&code_len, sizeof(u64)) != sizeof(u64)) { + return false; + } + + program_code.resize(code_len); + if (file.ReadArray(program_code.data(), code_len) != code_len) { + return false; + } + } + + return true; +} + +bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { + if (file.WriteObject(unique_identifier) != 1 || + file.WriteObject(static_cast<u32>(program_type)) != 1) { + return false; + } + + // Just for future proofing, save the sizes of the array to the file + const std::size_t reg_array_len = Pica::Regs::NUM_REGS; + if (file.WriteObject(static_cast<u64>(reg_array_len)) != 1) { + return false; + } + if (file.WriteArray(config.reg_array.data(), reg_array_len) != reg_array_len) { + return false; + } + + if (program_type == ProgramType::VertexShader) { + const std::size_t code_len = program_code.size(); + if (file.WriteObject(static_cast<u64>(code_len)) != 1) { + return false; + } + if (file.WriteArray(program_code.data(), code_len) != code_len) { + return false; + } + } + return true; +} + +ShaderDiskCache::ShaderDiskCache(std::unique_ptr<BackendBase>& backend) : backend(backend) { + +} + +std::optional<std::vector<ShaderDiskCacheRaw>> ShaderDiskCache::LoadTransferable() { + const bool has_title_id = GetProgramID() != 0; + if (!Settings::values.use_hw_shader || !Settings::values.use_disk_shader_cache || + !has_title_id) { + return std::nullopt; + } + tried_to_load = true; + + FileUtil::IOFile file{GetTransferablePath(), "rb"}; + if (!file.IsOpen()) { + LOG_INFO(Render_Vulkan, "No transferable shader cache found for game with title id={}", + GetTitleID()); + return std::nullopt; + } + + u32 version{}; + if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { + LOG_ERROR(Render_Vulkan, "Failed to get transferable cache version for title id={} - skipping", + GetTitleID()); + return std::nullopt; + } + + if (version < NativeVersion) { + LOG_INFO(Render_Vulkan, "Transferable shader cache is old - removing"); + file.Close(); + InvalidateAll(); + + return std::nullopt; + } + if (version > NativeVersion) { + LOG_WARNING(Render_Vulkan, "Transferable shader cache was generated with a newer version " + "of the emulator - skipping"); +
return std::nullopt; + } + + // Version is valid, load the shaders + std::vector<ShaderDiskCacheRaw> raws; + while (file.Tell() < file.GetSize()) { + TransferableEntryKind kind{}; + if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { + LOG_ERROR(Render_Vulkan, "Failed to read transferable file - skipping"); + return std::nullopt; + } + + switch (kind) { + case TransferableEntryKind::Raw: { + ShaderDiskCacheRaw entry; + if (!entry.Load(file)) { + LOG_ERROR(Render_Vulkan, "Failed to load transferable raw entry - skipping"); + return std::nullopt; + } + transferable.emplace(entry.GetUniqueIdentifier(), ShaderDiskCacheRaw{}); + raws.push_back(std::move(entry)); + break; + } + default: + LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={} - skipping", + kind); + return std::nullopt; + } + } + + LOG_INFO(Render_OpenGL, "Found a transferable disk cache with {} entries", raws.size()); + return {std::move(raws)}; +} + +std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap> +ShaderDiskCache::LoadPrecompiled(bool compressed) { + if (!IsUsable()) + return {}; + + FileUtil::IOFile file(GetPrecompiledPath(), "rb"); + if (!file.IsOpen()) { + LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}", + GetTitleID()); + return {}; + } + + const std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>> result = LoadPrecompiledFile(file, compressed); + if (!result.has_value()) { + LOG_INFO(Render_OpenGL, + "Failed to load precompiled cache for game with title id={} - removing", + GetTitleID()); + file.Close(); + InvalidatePrecompiled(); + return {}; + } + + return result.value(); +} + +std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>> +ShaderDiskCache::LoadPrecompiledFile(FileUtil::IOFile& file, bool compressed) { + // Read compressed file from disk and decompress to virtual precompiled cache file + std::vector<u8> precompiled_file(file.GetSize()); + file.ReadBytes(precompiled_file.data(), precompiled_file.size()); + + if (compressed) { + const auto decompressed = Common::Compression::DecompressDataZSTD(precompiled_file); + SaveArrayToPrecompiled(decompressed.data(), decompressed.size()); + } else { + SaveArrayToPrecompiled(precompiled_file.data(), precompiled_file.size()); + } + + decompressed_precompiled_cache_offset = 0; + + ShaderCacheVersionHash file_hash{}; + if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { + return std::nullopt; + } + if (GetShaderCacheVersionHash() != file_hash) { + LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); + return std::nullopt; + } + + std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled; + ShaderDumpsMap dumps; + while (decompressed_precompiled_cache_offset < decompressed_precompiled_cache.size()) { + PrecompiledEntryKind kind{}; + if (!LoadObjectFromPrecompiled(kind)) { + return std::nullopt; + } + + switch (kind) { + case PrecompiledEntryKind::Decompiled: { + u64 unique_identifier{}; + if (!LoadObjectFromPrecompiled(unique_identifier)) { + return std::nullopt; + } + + std::optional<ShaderDiskCacheDecompiled> entry = LoadDecompiledEntry(); + if (!entry) { + return std::nullopt; + } + + decompiled.insert({unique_identifier, std::move(*entry)}); + break; + } + case PrecompiledEntryKind::Dump: { + u64 unique_identifier; + if (!LoadObjectFromPrecompiled(unique_identifier)) { + return std::nullopt; + } + + ShaderDiskCacheDump dump; + if (!LoadObjectFromPrecompiled(dump.binary_format)) { + return std::nullopt; + } + + u32 binary_length{}; + if (!LoadObjectFromPrecompiled(binary_length)) { + return std::nullopt; + } + + dump.binary.resize(binary_length); + if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { +
return std::nullopt; + } + + dumps.insert({unique_identifier, dump}); + break; + } + default: + return std::nullopt; + } + } + + LOG_INFO(Render_OpenGL, + "Found a precompiled disk cache with {} decompiled entries and {} binary entries", + decompiled.size(), dumps.size()); + return {{decompiled, dumps}}; +} + +std::optional ShaderDiskCache::LoadDecompiledEntry() { + + bool sanitize_mul; + if (!LoadObjectFromPrecompiled(sanitize_mul)) { + return std::nullopt; + } + + u32 code_size{}; + if (!LoadObjectFromPrecompiled(code_size)) { + return std::nullopt; + } + + std::string code(code_size, '\0'); + if (!LoadArrayFromPrecompiled(code.data(), code.size())) { + return std::nullopt; + } + + const ShaderDiskCacheDecompiled entry = { + .result = std::move(code), + .sanitize_mul = sanitize_mul + }; + + return entry; +} + +void ShaderDiskCache::SaveDecompiledToFile(FileUtil::IOFile& file, u64 unique_identifier, + const std::string& result, + bool sanitize_mul) { + if (!IsUsable()) + return; + + if (file.WriteObject(static_cast(PrecompiledEntryKind::Decompiled)) != 1 || + file.WriteObject(unique_identifier) != 1 || file.WriteObject(sanitize_mul) != 1 || + file.WriteObject(static_cast(result.size())) != 1 || + file.WriteArray(result.data(), result.size()) != result.size()) { + LOG_ERROR(Render_OpenGL, "Failed to save decompiled cache entry - removing"); + + file.Close(); + InvalidatePrecompiled(); + } +} + +bool ShaderDiskCache::SaveDecompiledToCache(u64 unique_identifier, const std::string& result, + bool sanitize_mul) { + if (!SaveObjectToPrecompiled(static_cast(PrecompiledEntryKind::Decompiled)) || + !SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(sanitize_mul) || + !SaveObjectToPrecompiled(static_cast(result.size())) || + !SaveArrayToPrecompiled(result.data(), result.size())) { + return false; + } + + return true; +} + +void ShaderDiskCache::InvalidateAll() { + if (!FileUtil::Delete(GetTransferablePath())) { + LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}", + GetTransferablePath()); + } + InvalidatePrecompiled(); +} + +void ShaderDiskCache::InvalidatePrecompiled() { + // Clear virtual precompiled cache file + decompressed_precompiled_cache.resize(0); + + if (!FileUtil::Delete(GetPrecompiledPath())) { + LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath()); + } +} + +void ShaderDiskCache::SaveRaw(const ShaderDiskCacheRaw& entry) { + if (!IsUsable()) + return; + + const u64 id = entry.GetUniqueIdentifier(); + if (transferable.find(id) != transferable.end()) { + // The shader already exists + return; + } + + FileUtil::IOFile file = AppendTransferableFile(); + if (!file.IsOpen()) + return; + if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) { + LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry - removing"); + file.Close(); + InvalidateAll(); + return; + } + + transferable.insert({id, entry}); +} + +void ShaderDiskCache::SaveDecompiled(u64 unique_identifier, const std::string& code, bool sanitize_mul) { + if (!IsUsable()) + return; + + if (decompressed_precompiled_cache.empty()) { + SavePrecompiledHeaderToVirtualPrecompiledCache(); + } + + if (!SaveDecompiledToCache(unique_identifier, code, sanitize_mul)) { + LOG_ERROR(Render_OpenGL, + "Failed to save decompiled entry to the precompiled file - removing"); + InvalidatePrecompiled(); + } +} + +void ShaderDiskCache::SaveDump(u64 unique_identifier, ShaderHandle shader) { + if (!IsUsable()) + return; + + GLint binary_length{}; + 
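// Query the driver for the linked program binary and append it to the virtual
+    // precompiled cache as (kind, unique id, binary format, binary length, binary data).
+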
glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length); + + GLenum binary_format{}; + std::vector binary(binary_length); + glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); + + if (!SaveObjectToPrecompiled(static_cast(PrecompiledEntryKind::Dump)) || + !SaveObjectToPrecompiled(unique_identifier) || + !SaveObjectToPrecompiled(static_cast(binary_format)) || + !SaveObjectToPrecompiled(static_cast(binary_length)) || + !SaveArrayToPrecompiled(binary.data(), binary.size())) { + LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing", + unique_identifier); + InvalidatePrecompiled(); + return; + } +} + +void ShaderDiskCache::SaveDumpToFile(u64 unique_identifier, ShaderHandle shader, bool sanitize_mul) { + if (!IsUsable()) + return; + + FileUtil::IOFile file = AppendPrecompiledFile(); + if (!file.IsOpen()) + return; + + GLint binary_length{}; + glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length); + + GLenum binary_format{}; + std::vector binary(binary_length); + glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); + + if (file.WriteObject(static_cast(PrecompiledEntryKind::Dump)) != 1 || + file.WriteObject(unique_identifier) != 1 || + file.WriteObject(static_cast(binary_format)) != 1 || + file.WriteObject(static_cast(binary_length)) != 1 || + file.WriteArray(binary.data(), binary.size()) != binary.size()) { + LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing", + unique_identifier); + InvalidatePrecompiled(); + return; + } + + // SaveDecompiled is used only to store the accurate multiplication setting, a better way is to + // probably change the header in SaveDump + SaveDecompiledToFile(file, unique_identifier, {}, sanitize_mul); +} + +bool ShaderDiskCache::IsUsable() const { + return tried_to_load && Settings::values.use_disk_shader_cache; +} + +FileUtil::IOFile ShaderDiskCache::AppendTransferableFile() { + if (!EnsureDirectories()) + return {}; + + const auto transferable_path{GetTransferablePath()}; + const bool existed = FileUtil::Exists(transferable_path); + + FileUtil::IOFile file(transferable_path, "ab"); + if (!file.IsOpen()) { + LOG_ERROR(Render_OpenGL, "Failed to open transferable cache in path={}", transferable_path); + return {}; + } + if (!existed || file.GetSize() == 0) { + // If the file didn't exist, write its version + if (file.WriteObject(NativeVersion) != 1) { + LOG_ERROR(Render_OpenGL, "Failed to write transferable cache version in path={}", + transferable_path); + return {}; + } + } + return file; +} + +FileUtil::IOFile ShaderDiskCache::AppendPrecompiledFile() { + if (!EnsureDirectories()) + return {}; + + const auto precompiled_path{GetPrecompiledPath()}; + const bool existed = FileUtil::Exists(precompiled_path); + + FileUtil::IOFile file(precompiled_path, "ab"); + if (!file.IsOpen()) { + LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); + return {}; + } + if (!existed || file.GetSize() == 0) { + // If the file didn't exist, write its version + const auto hash{GetShaderCacheVersionHash()}; + if (file.WriteArray(hash.data(), hash.size()) != hash.size()) { + LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", + precompiled_path); + return {}; + } + } + return file; +} + +void ShaderDiskCache::SavePrecompiledHeaderToVirtualPrecompiledCache() { + const ShaderCacheVersionHash hash = GetShaderCacheVersionHash(); + if (!SaveArrayToPrecompiled(hash.data(), 
hash.size())) { + LOG_ERROR(Render_OpenGL, + "Failed to write precompiled cache version hash to virtual precompiled cache file"); + } +} + +void ShaderDiskCache::SaveVirtualPrecompiledFile() { + decompressed_precompiled_cache_offset = 0; + const std::vector& compressed = Common::Compression::CompressDataZSTDDefault( + decompressed_precompiled_cache.data(), decompressed_precompiled_cache.size()); + + const auto precompiled_path{GetPrecompiledPath()}; + FileUtil::IOFile file(precompiled_path, "wb"); + + if (!file.IsOpen()) { + LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); + return; + } + if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) { + LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", + precompiled_path); + return; + } +} + +bool ShaderDiskCache::EnsureDirectories() const { + const auto CreateDir = [](const std::string& dir) { + if (!FileUtil::CreateDir(dir)) { + LOG_ERROR(Render_OpenGL, "Failed to create directory={}", dir); + return false; + } + return true; + }; + + return CreateDir(FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir)) && + CreateDir(GetBaseDir()) && CreateDir(GetTransferableDir()) && + CreateDir(GetPrecompiledDir()) && CreateDir(GetPrecompiledShaderDir()); +} + +std::string ShaderDiskCache::GetTransferablePath() { + return FileUtil::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); +} + +std::string ShaderDiskCache::GetPrecompiledPath() { + return FileUtil::SanitizePath(GetPrecompiledShaderDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); +} + +std::string ShaderDiskCache::GetTransferableDir() const { + return GetBaseDir() + DIR_SEP "transferable"; +} + +std::string ShaderDiskCache::GetPrecompiledDir() const { + return GetBaseDir() + DIR_SEP "precompiled"; +} + +std::string ShaderDiskCache::GetPrecompiledShaderDir() const { + return GetPrecompiledDir() + DIR_SEP "separable"; +} + +std::string ShaderDiskCache::GetBaseDir() const { + return FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir) + DIR_SEP "opengl"; +} + +u64 ShaderDiskCache::GetProgramID() { + // Skip games without title id + if (program_id != 0) { + return program_id; + } + if (Core::System::GetInstance().GetAppLoader().ReadProgramId(program_id) != + Loader::ResultStatus::Success) { + return 0; + } + return program_id; +} + +std::string ShaderDiskCache::GetTitleID() { + if (!title_id.empty()) { + return title_id; + } + title_id = fmt::format("{:016X}", GetProgramID()); + return title_id; +} + +} // namespace VideoCore diff --git a/src/video_core/common/shader_disk_cache.h b/src/video_core/common/shader_disk_cache.h new file mode 100644 index 000000000..b9cecf9e2 --- /dev/null +++ b/src/video_core/common/shader_disk_cache.h @@ -0,0 +1,225 @@ +// Copyright 2022 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
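+//
+// Rough on-disk layout handled by this cache (see the implementation in
+// shader_disk_cache.cpp; field order is an implementation detail, not a stable format):
+//   transferable/<title_id>.bin:
+//       cache version, then repeated { TransferableEntryKind, raw entry }
+//   precompiled/separable/<title_id>.bin (optionally ZSTD-compressed):
+//       shader cache version hash, then repeated { PrecompiledEntryKind,
+//       decompiled entry or program binary dump }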
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "video_core/regs.h" +#include "video_core/common/shader.h" + +namespace Core { +class System; +} + +namespace FileUtil { +class IOFile; +} + +namespace VideoCore { + +enum class ProgramType : u32 { + VertexShader = 0, + GeometryShader = 1, + FragmentShader = 2 +}; + +// Describes a shader how it's used by the Pica GPU +class ShaderDiskCacheRaw { +public: + ShaderDiskCacheRaw() = default; + ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, Pica::Regs config, + std::vector program_code) : unique_identifier(unique_identifier), + program_type(program_type), config(config), program_code(program_code) {} + ~ShaderDiskCacheRaw() = default; + + bool Load(FileUtil::IOFile& file); + bool Save(FileUtil::IOFile& file) const; + + // Returns the unique hash of the program code and pica registers + u64 GetUniqueIdentifier() const { + return unique_identifier; + } + + // Returns the shader program type + ProgramType GetProgramType() const { + return program_type; + } + + // Returns an immutable span to the program code + std::span GetProgramCode() const { + return program_code; + } + + // Returns the pica register state used to generate the program code + const Pica::Regs& GetRawShaderConfig() const { + return config; + } + +private: + u64 unique_identifier = 0; + ProgramType program_type{}; + Pica::Regs config{}; + std::vector program_code{}; +}; + +// Contains decompiled data from a shader +struct ShaderDiskCacheDecompiled { + std::string result; + bool sanitize_mul; +}; + +// Contains an OpenGL dumped binary program +struct ShaderDiskCacheDump { + //GLenum binary_format; + std::vector binary; +}; + +using ShaderDecompiledMap = std::unordered_map; +using ShaderDumpsMap = std::unordered_map; + +class BackendBase; + +class ShaderDiskCache { +public: + ShaderDiskCache(std::unique_ptr& backend); + ~ShaderDiskCache() = default; + + /// Loads transferable cache. If file has a old version or on failure, it deletes the file. + std::optional> LoadTransferable(); + + /// Loads current game's precompiled cache. Invalidates on failure. + std::pair LoadPrecompiled(bool compressed); + + /// Removes the transferable (and precompiled) cache file. + void InvalidateAll(); + + /// Removes the precompiled cache file and clears virtual precompiled cache file. + void InvalidatePrecompiled(); + + /// Saves a raw dump to the transferable file. Checks for collisions. + void SaveRaw(const ShaderDiskCacheRaw& entry); + + /// Saves a decompiled entry to the precompiled file. Does not check for collisions. + void SaveDecompiled(u64 unique_identifier, const std::string& code, bool sanitize_mul); + + /// Saves a dump entry to the precompiled file. Does not check for collisions. + void SaveDump(u64 unique_identifier, ShaderHandle shader); + + /// Saves a dump entry to the precompiled file. Does not check for collisions. + void SaveDumpToFile(u64 unique_identifier, ShaderHandle shader, bool sanitize_mul); + + /// Serializes virtual precompiled shader cache file to real file + void SaveVirtualPrecompiledFile(); + +private: + /// Loads the transferable cache. Returns empty on failure. + std::optional> LoadPrecompiledFile( + FileUtil::IOFile& file, bool compressed); + + /// Loads a decompiled cache entry from m_precompiled_cache_virtual_file. + /// Returns empty on failure. + std::optional LoadDecompiledEntry(); + + /// Saves a decompiled entry to the passed file. Does not check for collisions. 
+    void SaveDecompiledToFile(FileUtil::IOFile& file, u64 unique_identifier,
+                              const std::string& code, bool sanitize_mul);
+
+    /// Saves a decompiled entry to the virtual precompiled cache. Does not check for collisions.
+    bool SaveDecompiledToCache(u64 unique_identifier, const std::string& code, bool sanitize_mul);
+
+    /// Returns whether the cache can be used
+    bool IsUsable() const;
+
+    /// Opens current game's transferable file and writes its header if it doesn't exist
+    FileUtil::IOFile AppendTransferableFile();
+
+    /// Save precompiled header to the virtual precompiled cache
+    void SavePrecompiledHeaderToVirtualPrecompiledCache();
+
+    /// Create shader disk cache directories. Returns true on success.
+    bool EnsureDirectories() const;
+
+    /// Gets current game's transferable file path
+    std::string GetTransferablePath();
+
+    /// Gets current game's precompiled file path
+    std::string GetPrecompiledPath();
+
+    /// Get user's transferable directory path
+    std::string GetTransferableDir() const;
+
+    /// Get user's precompiled directory path
+    std::string GetPrecompiledDir() const;
+
+    std::string GetPrecompiledShaderDir() const;
+
+    /// Get user's shader directory path
+    std::string GetBaseDir() const;
+
+    /// Get current game's title id as u64
+    u64 GetProgramID();
+
+    /// Get current game's title id
+    std::string GetTitleID();
+
+    template <typename T>
+    bool SaveArrayToPrecompiled(const T* data, std::size_t length) {
+        const u8* data_view = reinterpret_cast<const u8*>(data);
+        decompressed_precompiled_cache.insert(decompressed_precompiled_cache.end(), &data_view[0],
+                                              &data_view[length * sizeof(T)]);
+        decompressed_precompiled_cache_offset += length * sizeof(T);
+        return true;
+    }
+
+    template <typename T>
+    bool LoadArrayFromPrecompiled(T* data, std::size_t length) {
+        u8* data_view = reinterpret_cast<u8*>(data);
+        std::copy_n(decompressed_precompiled_cache.data() + decompressed_precompiled_cache_offset,
+                    length * sizeof(T), data_view);
+        decompressed_precompiled_cache_offset += length * sizeof(T);
+        return true;
+    }
+
+    template <typename T>
+    bool SaveObjectToPrecompiled(const T& object) {
+        return SaveArrayToPrecompiled(&object, 1);
+    }
+
+    bool SaveObjectToPrecompiled(bool object) {
+        const auto value = static_cast<u8>(object);
+        return SaveArrayToPrecompiled(&value, 1);
+    }
+
+    template <typename T>
+    bool LoadObjectFromPrecompiled(T& object) {
+        return LoadArrayFromPrecompiled(&object, 1);
+    }
+
+private:
+    std::unique_ptr<BackendBase>& backend;
+
+    // Stores whole precompiled cache which will be read from or saved to the precompiled cache file
+    std::vector<u8> decompressed_precompiled_cache;
+
+    // Stores the current offset of the precompiled cache file for IO purposes
+    std::size_t decompressed_precompiled_cache_offset = 0;
+
+    // Stored transferable shaders
+    std::unordered_map<u64, ShaderDiskCacheRaw> transferable;
+
+    // The cache has been loaded at boot
+    bool tried_to_load{};
+
+    u64 program_id{};
+    std::string title_id;
+
+    FileUtil::IOFile AppendPrecompiledFile();
+};
+
+} // namespace VideoCore
diff --git a/src/video_core/common/shader_gen.h b/src/video_core/common/shader_gen.h
index 334f44ce7..ee4b2279e 100644
--- a/src/video_core/common/shader_gen.h
+++ b/src/video_core/common/shader_gen.h
@@ -199,20 +199,20 @@ public:
      * @param separable_shader generates shader that can be used for separate shader object
      * @returns String of the shader source code
      */
-    virtual std::string GenerateTrivialVertexShader(bool separable_shader) = 0;
+    virtual std::string GenerateTrivialVertexShader() = 0;
 
     /**
      * Generates the GLSL vertex shader program source code for the given VS program
     * @returns String of the shader source code
     */
-    virtual std::string GenerateVertexShader(const Pica::Shader::ShaderSetup& setup, const PicaVSConfig& config,
-                                             bool separable_shader) = 0;
+    virtual std::string GenerateVertexShader(const Pica::Shader::ShaderSetup& setup,
+                                             const PicaVSConfig& config) = 0;
 
     /**
      * Generates the GLSL fixed geometry shader program source code for non-GS PICA pipeline
      * @returns String of the shader source code
      */
-    virtual std::string GenerateFixedGeometryShader(const PicaFixedGSConfig& config, bool separable_shader) = 0;
+    virtual std::string GenerateFixedGeometryShader(const PicaFixedGSConfig& config) = 0;
 
     /**
      * Generates the GLSL fragment shader program source code for the current Pica state
@@ -221,7 +221,7 @@ public:
      * @param separable_shader generates shader that can be used for separate shader object
      * @returns String of the shader source code
      */
-    virtual std::string GenerateFragmentShader(const PicaFSConfig& config, bool separable_shader) = 0;
+    virtual std::string GenerateFragmentShader(const PicaFSConfig& config) = 0;
 };
 
 } // namespace VideoCore
diff --git a/src/video_core/common/shader_runtime_cache.h b/src/video_core/common/shader_runtime_cache.h
new file mode 100644
index 000000000..f6758cbf1
--- /dev/null
+++ b/src/video_core/common/shader_runtime_cache.h
@@ -0,0 +1,118 @@
+// Copyright 2022 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include
+#include
+#include
+#include "video_core/common/backend.h"
+#include "video_core/common/shader_gen.h"
+
+namespace VideoCore {
+
+using ShaderCacheResult = std::tuple<ShaderHandle, std::optional<std::string>>;
+
+template <typename KeyType>
+using ShaderGenerator = std::string (ShaderGeneratorBase::*)(const KeyType&);
+
+template <typename KeyType, ShaderGenerator<KeyType> CodeGenerator, ShaderStage stage>
+class ShaderCache {
+public:
+    ShaderCache(std::unique_ptr<BackendBase>& backend,
+                std::unique_ptr<ShaderGeneratorBase>& generator) :
+        backend(backend), generator(generator) {}
+    ~ShaderCache() = default;
+
+    // Returns a shader handle generated from the provided config
+    ShaderCacheResult Get(const KeyType& config) {
+        auto [iter, new_shader] = shaders.emplace(config, ShaderHandle{});
+        ShaderHandle& shader = iter->second;
+
+        if (new_shader) {
+            auto result = (generator.get()->*CodeGenerator)(config);
+            shader = backend->CreateShader(stage, "Cached shader", result);
+            shader->Compile(ShaderOptimization::Debug); // TODO: Change this
+
+            return std::make_tuple(shader, result);
+        }
+
+        return std::make_tuple(shader, std::nullopt);
+    }
+
+    void Inject(const KeyType& key, ShaderHandle&& shader) {
+        shaders.emplace(key, std::move(shader));
+    }
+
+private:
+    std::unique_ptr<BackendBase>& backend;
+    std::unique_ptr<ShaderGeneratorBase>& generator;
+    std::unordered_map<KeyType, ShaderHandle> shaders;
+};
+
+template <typename KeyType>
+using PicaShaderGenerator = std::string (ShaderGeneratorBase::*)(const Pica::Shader::ShaderSetup&,
+                                                                 const KeyType&);
+
+/**
+ * This is a cache designed for shaders translated from PICA shaders. The first cache matches the
+ * config structure like a normal cache does. On cache miss, the second cache matches the generated
+ * GLSL code. The configuration is like this because there might be leftover code in the PICA shader
+ * program buffer from the previous shader, which is hashed into the config, resulting in several
+ * different config values from the same shader program.
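+ *
+ * Hypothetical usage sketch (the local variable names and the call site below are
+ * illustrative only, not taken from the actual rasterizer code):
+ *
+ *     PicaVertexShaders vs_cache{backend, generator};
+ *     auto [handle, code] = vs_cache.Get(vs_config, setup);
+ *     if (code) {
+ *         // First time this program was seen: persist the generated GLSL
+ *         disk_cache.SaveDecompiled(unique_identifier, *code, sanitize_mul);
+ *     }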
+ */
+template <typename KeyType, PicaShaderGenerator<KeyType> CodeGenerator, ShaderStage stage>
+class ShaderDoubleCache {
+public:
+    ShaderDoubleCache(std::unique_ptr<BackendBase>& backend,
+                      std::unique_ptr<ShaderGeneratorBase>& generator) :
+        backend(backend), generator(generator) {}
+    ~ShaderDoubleCache() = default;
+
+    ShaderCacheResult Get(const KeyType& key, const Pica::Shader::ShaderSetup& setup) {
+        if (auto map_iter = shader_map.find(key); map_iter == shader_map.end()) {
+            std::string program = (generator.get()->*CodeGenerator)(setup, key);
+            auto [iter, new_shader] = shader_cache.emplace(program, ShaderHandle{});
+            ShaderHandle& shader = iter->second;
+
+            if (new_shader) {
+                shader = backend->CreateShader(stage, "Cached shader", program);
+                shader->Compile(ShaderOptimization::Debug); // TODO: Change this
+            }
+
+            shader_map[key] = &shader;
+            return std::make_tuple(shader, std::move(program));
+        } else {
+            return std::make_tuple(*map_iter->second, std::nullopt);
+        }
+    }
+
+    void Inject(const KeyType& key, std::string decomp, ShaderHandle&& program) {
+        const auto iter = shader_cache.emplace(std::move(decomp), std::move(program)).first;
+
+        ShaderHandle& cached_shader = iter->second;
+        shader_map.insert_or_assign(key, &cached_shader);
+    }
+
+private:
+    std::unique_ptr<BackendBase>& backend;
+    std::unique_ptr<ShaderGeneratorBase>& generator;
+    std::unordered_map<KeyType, ShaderHandle*> shader_map;
+    std::unordered_map<std::string, ShaderHandle> shader_cache;
+};
+
+// Define shader cache types for convenience
+using FragmentShaders = ShaderCache<PicaFSConfig, &ShaderGeneratorBase::GenerateFragmentShader,
+                                    ShaderStage::Fragment>;
+
+using PicaVertexShaders = ShaderDoubleCache<PicaVSConfig, &ShaderGeneratorBase::GenerateVertexShader,
+                                            ShaderStage::Vertex>;
+
+using FixedGeometryShaders = ShaderCache<PicaFixedGSConfig, &ShaderGeneratorBase::GenerateFixedGeometryShader,
+                                         ShaderStage::Geometry>;
+
+} // namespace VideoCore
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 1a3ea1d73..f427bb4c8 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -66,6 +66,7 @@ bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) {
     if (file.ReadBytes(&code_len, sizeof(u64)) != sizeof(u64)) {
         return false;
     }
+
     program_code.resize(code_len);
     if (file.ReadArray(program_code.data(), code_len) != code_len) {
         return false;
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 5a2faeb9a..a8d3f10d3 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -41,16 +41,17 @@ using ProgramCode = std::vector<u32>;
 using ShaderDecompiledMap = std::unordered_map<u64, ShaderDiskCacheDecompiled>;
 using ShaderDumpsMap = std::unordered_map<u64, ShaderDiskCacheDump>;
 
-/// Describes a shader how it's used by the guest GPU
+// Describes how a shader is used by the guest GPU
 class ShaderDiskCacheRaw {
 public:
-    explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
-                                RawShaderConfig config, ProgramCode program_code);
     ShaderDiskCacheRaw() = default;
+    ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
+                       Pica::Regs config, std::vector<u32> program_code) :
+        unique_identifier(unique_identifier), program_type(program_type), config(config),
+        program_code(program_code) {}
     ~ShaderDiskCacheRaw() = default;
 
     bool Load(FileUtil::IOFile& file);
-
     bool Save(FileUtil::IOFile& file) const;
 
     u64 GetUniqueIdentifier() const {
@@ -61,19 +62,19 @@ public:
         return program_type;
     }
 
-    const ProgramCode& GetProgramCode() const {
+    const std::vector<u32>& GetProgramCode() const {
         return program_code;
     }
 
-    const RawShaderConfig& GetRawShaderConfig() const {
+    const Pica::Regs& GetRawShaderConfig() const {
         return config;
     }
 
private:
-    u64 unique_identifier{};
+    u64 unique_identifier = 0;
     ProgramType
program_type{}; - RawShaderConfig config{}; - ProgramCode program_code{}; + Pica::Regs config{}; + std::vector program_code{}; }; /// Contains decompiled data from a shader diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index fd9c01b07..af219031c 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -393,7 +393,7 @@ bool ShaderProgramManager::UseProgrammableVertexShader(const Pica::Regs& regs, // Save VS to the disk cache if its a new shader if (result) { auto& disk_cache = impl->disk_cache; - ProgramCode program_code{setup.program_code.begin(), setup.program_code.end()}; + std::vector program_code{setup.program_code.begin(), setup.program_code.end()}; program_code.insert(program_code.end(), setup.swizzle_data.begin(), setup.swizzle_data.end()); const u64 unique_identifier = GetUniqueIdentifier(regs, program_code); @@ -715,6 +715,7 @@ void ShaderProgramManager::LoadDiskCache(const std::atomic_bool& stop_loading, contexts[i] = emu_window.CreateSharedContext(); threads[i] = std::thread(LoadRawSepareble, contexts[i].get(), start, end); } + for (auto& thread : threads) { thread.join(); } diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 39c1392f2..895d0eb65 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include "video_core/rasterizer_interface.h" #include "video_core/regs_lighting.h" @@ -22,39 +23,39 @@ namespace OpenGL { enum class UniformBindings : GLuint { Common, VS, GS }; struct LightSrc { - alignas(16) GLvec3 specular_0; - alignas(16) GLvec3 specular_1; - alignas(16) GLvec3 diffuse; - alignas(16) GLvec3 ambient; - alignas(16) GLvec3 position; - alignas(16) GLvec3 spot_direction; // negated - GLfloat dist_atten_bias; - GLfloat dist_atten_scale; + alignas(16) Common::Vec3f specular_0; + alignas(16) Common::Vec3f specular_1; + alignas(16) Common::Vec3f diffuse; + alignas(16) Common::Vec3f ambient; + alignas(16) Common::Vec3f position; + alignas(16) Common::Vec3f spot_direction; // negated + float dist_atten_bias; + float dist_atten_scale; }; -/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned +// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned // NOTE: Always keep a vec4 at the end. The GL spec is not clear wether the alignment at // the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not. // Not following that rule will cause problems on some AMD drivers. 
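+// (For example, if the block ended in a vec3, the 4 bytes of trailing padding may or may
+// not be counted in UNIFORM_BLOCK_DATA_SIZE depending on the driver, so the host-side
+// struct and the GL-reported block size could disagree; ending on a vec4 avoids the
+// ambiguity.)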
struct UniformData { - GLint framebuffer_scale; - GLint alphatest_ref; - GLfloat depth_scale; - GLfloat depth_offset; - GLfloat shadow_bias_constant; - GLfloat shadow_bias_linear; - GLint scissor_x1; - GLint scissor_y1; - GLint scissor_x2; - GLint scissor_y2; - GLint fog_lut_offset; - GLint proctex_noise_lut_offset; - GLint proctex_color_map_offset; - GLint proctex_alpha_map_offset; - GLint proctex_lut_offset; - GLint proctex_diff_lut_offset; - GLfloat proctex_bias; - GLint shadow_texture_bias; + int framebuffer_scale; + int alphatest_ref; + float depth_scale; + float depth_offset; + float shadow_bias_constant; + float shadow_bias_linear; + int scissor_x1; + int scissor_y1; + int scissor_x2; + int scissor_y2; + int fog_lut_offset; + int proctex_noise_lut_offset; + int proctex_color_map_offset; + int proctex_alpha_map_offset; + int proctex_lut_offset; + int proctex_diff_lut_offset; + float proctex_bias; + int shadow_texture_bias; alignas(16) GLivec4 lighting_lut_offset[Pica::LightingRegs::NumLightingSampler / 4]; alignas(16) GLvec3 fog_color; alignas(8) GLvec2 proctex_noise_f; @@ -62,29 +63,29 @@ struct UniformData { alignas(8) GLvec2 proctex_noise_p; alignas(16) GLvec3 lighting_global_ambient; LightSrc light_src[8]; - alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages - alignas(16) GLvec4 tev_combiner_buffer_color; - alignas(16) GLvec4 clip_coef; + alignas(16) Common::Vec4f const_color[6]; // A vec4 color for each of the six tev stages + alignas(16) Common::Vec4f tev_combiner_buffer_color; + alignas(16) Common::Vec4f clip_coef; }; -static_assert( - sizeof(UniformData) == 0x4F0, - "The size of the UniformData structure has changed, update the structure in the shader"); +static_assert(sizeof(UniformData) == 0x4F0, + "The size of the UniformData structure has changed, update the structure in the shader"); static_assert(sizeof(UniformData) < 16384, "UniformData structure must be less than 16kb as per the OpenGL spec"); -/// Uniform struct for the Uniform Buffer Object that contains PICA vertex/geometry shader uniforms. + +// Uniform struct for the Uniform Buffer Object that contains PICA vertex/geometry shader uniforms. // NOTE: the same rule from UniformData also applies here. struct PicaUniformsData { void SetFromRegs(const Pica::ShaderRegs& regs, const Pica::Shader::ShaderSetup& setup); struct BoolAligned { - alignas(16) GLint b; + alignas(16) int b; }; - + GLvec4 std::array bools; - alignas(16) std::array i; - alignas(16) std::array f; + alignas(16) std::array i; + alignas(16) std::array f; }; struct VSUniformData {