From 19617f32c8ca999e9b564a60554031476085190a Mon Sep 17 00:00:00 2001 From: GPUCode Date: Tue, 28 Feb 2023 23:35:10 +0200 Subject: [PATCH] custom_tex_manager: Multithread custom texture loading and decode * Each texture has an atomic flag to signal to the backend when decoding is finished * Don't store the file data as well to conserve RAM. --- src/common/image_util.cpp | 24 +- src/common/image_util.h | 4 +- src/common/thread_worker.h | 4 + .../rasterizer_cache/custom_tex_manager.cpp | 205 ++++++++++++------ .../rasterizer_cache/custom_tex_manager.h | 21 +- .../rasterizer_cache/rasterizer_cache.h | 4 +- src/video_core/rasterizer_cache/utils.h | 9 + .../renderer_opengl/gl_texture_runtime.cpp | 4 + src/video_core/renderer_vulkan/vk_common.h | 2 +- .../renderer_vulkan/vk_texture_runtime.cpp | 5 + 10 files changed, 194 insertions(+), 88 deletions(-) diff --git a/src/common/image_util.cpp b/src/common/image_util.cpp index 14639f486..f4291303d 100644 --- a/src/common/image_util.cpp +++ b/src/common/image_util.cpp @@ -72,11 +72,11 @@ bool DecodePNG(std::span png_data, std::span out_data) { return true; } -bool ParseDDSKTX(std::span in_data, std::vector& out_data, u32& width, u32& height, +bool ParseDDSKTX(std::span dds_data, size_t& decoded_size, u32& width, u32& height, ddsktx_format& format) { ddsktx_texture_info tc{}; - const int size = static_cast(in_data.size()); - if (!ddsktx_parse(&tc, in_data.data(), size, nullptr)) { + const int size = static_cast(dds_data.size()); + if (!ddsktx_parse(&tc, dds_data.data(), size, nullptr)) { return false; } @@ -85,9 +85,23 @@ bool ParseDDSKTX(std::span in_data, std::vector& out_data, u32& wi format = tc.format; ddsktx_sub_data sub_data{}; - ddsktx_get_sub(&tc, &sub_data, in_data.data(), size, 0, 0, 0); + ddsktx_get_sub(&tc, &sub_data, dds_data.data(), size, 0, 0, 0); + decoded_size = sub_data.size_bytes; - out_data.resize(sub_data.size_bytes); + return true; +} + +bool LoadDDSKTX(std::span dds_data, std::span out_data) { + ddsktx_texture_info tc{}; + const int size = static_cast(dds_data.size()); + if (!ddsktx_parse(&tc, dds_data.data(), size, nullptr)) { + return false; + } + + ddsktx_sub_data sub_data{}; + ddsktx_get_sub(&tc, &sub_data, dds_data.data(), size, 0, 0, 0); + + ASSERT(out_data.size() == sub_data.size_bytes); std::memcpy(out_data.data(), sub_data.buff, sub_data.size_bytes); return true; diff --git a/src/common/image_util.h b/src/common/image_util.h index d0b92ce07..b7e4bc063 100644 --- a/src/common/image_util.h +++ b/src/common/image_util.h @@ -13,9 +13,11 @@ bool ParsePNG(std::span png_data, size_t& decoded_size, u32& width, u3 bool DecodePNG(std::span png_data, std::span out_data); -bool ParseDDSKTX(std::span in_data, std::vector& out_data, u32& width, u32& height, +bool ParseDDSKTX(std::span dds_data, size_t& decoded_size, u32& width, u32& height, ddsktx_format& format); +bool LoadDDSKTX(std::span dds_data, std::span out_data); + bool EncodePNG(const std::string& out_path, std::span in_data, u32 width, u32 height, s32 level = 6); diff --git a/src/common/thread_worker.h b/src/common/thread_worker.h index a7dfc379c..fef30452c 100644 --- a/src/common/thread_worker.h +++ b/src/common/thread_worker.h @@ -99,6 +99,10 @@ public: }); } + const std::size_t NumWorkers() const noexcept { + return threads.size(); + } + private: std::queue requests; std::mutex queue_mutex; diff --git a/src/video_core/rasterizer_cache/custom_tex_manager.cpp b/src/video_core/rasterizer_cache/custom_tex_manager.cpp index fdac8dff2..722ecfc13 100644 --- a/src/video_core/rasterizer_cache/custom_tex_manager.cpp +++ b/src/video_core/rasterizer_cache/custom_tex_manager.cpp @@ -55,9 +55,7 @@ CustomPixelFormat ToCustomPixelFormat(ddsktx_format format) { } // Anonymous namespace -CustomTexManager::CustomTexManager(Core::System& system_) - : system{system_}, workers{std::max(std::thread::hardware_concurrency(), 2U) - 1, - "Hires processing"} {} +CustomTexManager::CustomTexManager(Core::System& system_) : system{system_} {} CustomTexManager::~CustomTexManager() = default; @@ -66,58 +64,91 @@ void CustomTexManager::FindCustomTextures() { return; } + // If custom textures isn't enabled we don't want to create the thread pool + // so don't do it in the constructor, do it here instead. + workers = std::make_unique( + std::max(std::thread::hardware_concurrency(), 2U) - 1, "Custom textures"); + // Custom textures are currently stored as // [TitleID]/tex1_[width]x[height]_[64-bit hash]_[format].png - using namespace FileUtil; - const u64 program_id = system.Kernel().GetCurrentProcess()->codeset->program_id; const std::string load_path = - fmt::format("{}textures/{:016X}/", GetUserPath(UserPath::LoadDir), program_id); + fmt::format("{}textures/{:016X}/", GetUserPath(FileUtil::UserPath::LoadDir), program_id); // Create the directory if it did not exist - if (!Exists(load_path)) { - CreateFullPath(load_path); + if (!FileUtil::Exists(load_path)) { + FileUtil::CreateFullPath(load_path); } - FSTEntry texture_dir; - std::vector textures; + FileUtil::FSTEntry texture_dir; + std::vector textures; // 64 nested folders should be plenty for most cases - ScanDirectoryTree(load_path, texture_dir, 64); - GetAllFilesFromNestedEntries(texture_dir, textures); + FileUtil::ScanDirectoryTree(load_path, texture_dir, 64); + FileUtil::GetAllFilesFromNestedEntries(texture_dir, textures); - u32 width{}; - u32 height{}; - u32 format{}; - unsigned long long hash{}; - std::string ext(3, ' '); + // Reserve space for all the textures in the folder + const size_t num_textures = textures.size(); + custom_textures.resize(num_textures); - for (const FSTEntry& file : textures) { - const std::string& path = file.physicalName; - if (file.isDirectory || !file.virtualName.starts_with("tex1_")) { + const auto load = [&](u32 begin, u32 end) { + u32 width{}; + u32 height{}; + u32 format{}; + unsigned long long hash{}; + std::string ext(3, ' '); + + for (u32 i = begin; i < end; i++) { + const auto& file = textures[i]; + const std::string& path = file.physicalName; + if (file.isDirectory || !file.virtualName.starts_with("tex1_")) { + continue; + } + + // Parse the texture filename. We only really care about the hash, + // the rest should be queried from the file itself. + if (std::sscanf(file.virtualName.c_str(), "tex1_%ux%u_%llX_%u.%s", &width, &height, + &hash, &format, ext.data()) != 5) { + continue; + } + + custom_textures[i] = std::make_unique(); + CustomTexture& texture = *custom_textures[i]; + + // Fill in relevant information + texture.file_format = MakeFileFormat(ext); + texture.hash = hash; + texture.path = path; + + // Query the file for the rest + QueryTexture(texture); + } + }; + + const std::size_t num_workers{workers->NumWorkers()}; + const std::size_t bucket_size{num_textures / num_workers}; + + for (std::size_t i = 0; i < num_workers; ++i) { + const bool is_last_worker = i + 1 == num_workers; + const std::size_t start{bucket_size * i}; + const std::size_t end{is_last_worker ? num_textures : start + bucket_size}; + workers->QueueWork([start, end, &load]() { load(start, end); }); + } + + workers->WaitForRequests(); + + // Assign each texture to the hash map + for (const auto& texture : custom_textures) { + if (!texture) { continue; } - - // Parse the texture filename. We only really care about the hash, - // the rest should be queried from the file itself. - if (std::sscanf(file.virtualName.c_str(), "tex1_%ux%u_%llX_%u.%s", &width, &height, &hash, - &format, ext.data()) != 5) { - continue; - } - - auto [it, new_texture] = custom_textures.try_emplace(hash); + const unsigned long long hash = texture->hash; + auto [it, new_texture] = custom_texture_map.try_emplace(hash); if (!new_texture) { - LOG_ERROR(Render, "Textures {} and {} conflict, ignoring!", custom_textures[hash].path, - path); + LOG_ERROR(Render, "Textures {} and {} conflict, ignoring!", + custom_texture_map[hash]->path, texture->path); continue; } - - auto& texture = it->second; - texture.file_format = MakeFileFormat(ext); - texture.path = path; - - // Query the required information from the file and load it. - // Since this doesn't involve any decoding it shouldn't consume too much RAM. - LoadTexture(texture); + it->second = texture.get(); } textures_loaded = true; @@ -134,7 +165,6 @@ u64 CustomTexManager::ComputeHash(const SurfaceParams& params, std::span dat // this must be done... const auto decoded = std::span{temp_buffer.data(), decoded_size}; DecodeTexture(params, params.addr, params.end, data, decoded); - return ComputeHash64(decoded.data(), decoded_size); } @@ -185,65 +215,98 @@ void CustomTexManager::DumpTexture(const SurfaceParams& params, u32 level, std:: EncodePNG(dump_path, decoded, width, height); }; - workers.QueueWork(std::move(dump)); + workers->QueueWork(std::move(dump)); dumped_textures.insert(data_hash); } -const Texture& CustomTexManager::GetTexture(u64 data_hash) { - auto it = custom_textures.find(data_hash); - if (it == custom_textures.end()) { +CustomTexture& CustomTexManager::GetTexture(u64 data_hash) { + auto it = custom_texture_map.find(data_hash); + if (it == custom_texture_map.end()) { LOG_WARNING(Render, "Unable to find replacement for surface with hash {:016X}", data_hash); return dummy_texture; } - LOG_DEBUG(Render, "Assigning {} to surface with hash {:016X}", it->second.path, data_hash); - return it->second; + CustomTexture& texture = *it->second; + LOG_DEBUG(Render, "Assigning {} to surface with hash {:016X}", texture.path, data_hash); + + return texture; } -void CustomTexManager::DecodeToStaging(const Texture& texture, const StagingData& staging) { - switch (texture.file_format) { - case CustomFileFormat::PNG: - if (!DecodePNG(texture.data, staging.mapped)) { - LOG_ERROR(Render, "Failed to decode png {}", texture.path); - } - if (compatibility_mode) { - const u32 stride = texture.width * 4; - FlipTexture(staging.mapped, texture.width, texture.height, stride); - } - break; - case CustomFileFormat::DDS: - case CustomFileFormat::KTX: - // Compressed formats don't need CPU decoding +void CustomTexManager::DecodeToStaging(CustomTexture& texture, StagingData& staging) { + if (texture.decoded) { + // Nothing to do here, just copy over the data + ASSERT_MSG(staging.size == texture.staging_size, + "Incorrect staging size for custom texture with hash {:016X}", texture.hash); std::memcpy(staging.mapped.data(), texture.data.data(), texture.data.size()); - break; + return; } + + // Set an atomic flag in staging data so the backend can wait until the data is finished + staging.flag = &texture.flag; + + const auto decode = [this, &texture, mapped = staging.mapped]() { + // Read the file this is potentially the most expensive step + FileUtil::IOFile file{texture.path, "rb"}; + ScratchBuffer file_data{file.GetSize()}; + file.ReadBytes(file_data.Data(), file.GetSize()); + + // Resize the decoded data buffer + std::vector& decoded_data = texture.data; + decoded_data.resize(texture.staging_size); + + // Decode + switch (texture.file_format) { + case CustomFileFormat::PNG: + if (!DecodePNG(file_data.Span(), decoded_data)) { + LOG_ERROR(Render, "Failed to decode png {}", texture.path); + } + if (compatibility_mode) { + const u32 stride = texture.width * 4; + FlipTexture(decoded_data, texture.width, texture.height, stride); + } + break; + case CustomFileFormat::DDS: + case CustomFileFormat::KTX: + // Compressed formats don't need CPU decoding and must be pre-flippede + LoadDDSKTX(file_data.Span(), decoded_data); + break; + } + + // Copy it over to the staging memory + texture.decoded = true; + std::memcpy(mapped.data(), decoded_data.data(), decoded_data.size()); + + // Notify the backend that decode is done + texture.flag.test_and_set(); + texture.flag.notify_all(); + }; + + workers->QueueWork(std::move(decode)); } -void CustomTexManager::LoadTexture(Texture& texture) { - std::vector& data = texture.data; - +void CustomTexManager::QueryTexture(CustomTexture& texture) { // Read the file - auto file = FileUtil::IOFile(texture.path, "rb"); - data.resize(file.GetSize()); - file.ReadBytes(data.data(), file.GetSize()); + FileUtil::IOFile file{texture.path, "rb"}; + ScratchBuffer data{file.GetSize()}; + file.ReadBytes(data.Data(), file.GetSize()); // Parse it based on the file extension switch (texture.file_format) { case CustomFileFormat::PNG: - texture.format = CustomPixelFormat::RGBA8; // Check for other formats too? - if (!ParsePNG(data, texture.staging_size, texture.width, texture.height)) { + if (!ParsePNG(data.Span(), texture.staging_size, texture.width, texture.height)) { LOG_ERROR(Render, "Failed to parse png file {}", texture.path); return; } + texture.format = CustomPixelFormat::RGBA8; // Check for other formats too? break; case CustomFileFormat::DDS: case CustomFileFormat::KTX: ddsktx_format format{}; - if (!ParseDDSKTX(data, texture.data, texture.width, texture.height, format)) { + if (!ParseDDSKTX(data.Span(), texture.staging_size, texture.width, texture.height, + format)) { LOG_ERROR(Render, "Failed to parse dds/ktx file {}", texture.path); return; } - texture.staging_size = texture.data.size(); texture.format = ToCustomPixelFormat(format); break; } diff --git a/src/video_core/rasterizer_cache/custom_tex_manager.h b/src/video_core/rasterizer_cache/custom_tex_manager.h index 4ab366d4d..7061dbeab 100644 --- a/src/video_core/rasterizer_cache/custom_tex_manager.h +++ b/src/video_core/rasterizer_cache/custom_tex_manager.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -27,17 +28,20 @@ enum class CustomFileFormat : u32 { KTX = 2, }; -struct Texture { +struct CustomTexture { u32 width; u32 height; + unsigned long long hash{}; CustomPixelFormat format; CustomFileFormat file_format; std::string path; std::size_t staging_size; std::vector data; + std::atomic_flag flag; + bool decoded = false; operator bool() const noexcept { - return !data.empty(); + return hash != 0; } }; @@ -56,10 +60,10 @@ public: void DumpTexture(const SurfaceParams& params, u32 level, std::span data); /// Returns the custom texture handle assigned to the provided data hash - const Texture& GetTexture(u64 data_hash); + CustomTexture& GetTexture(u64 data_hash); /// Decodes the data in texture to a consumable format - void DecodeToStaging(const Texture& texture, const StagingData& staging); + void DecodeToStaging(CustomTexture& texture, StagingData& staging); bool CompatibilityMode() const noexcept { return compatibility_mode; @@ -67,15 +71,16 @@ public: private: /// Fills the texture structure with information from the file in path - void LoadTexture(Texture& texture); + void QueryTexture(CustomTexture& texture); private: Core::System& system; - Common::ThreadWorker workers; + std::unique_ptr workers; std::unordered_set dumped_textures; - std::unordered_map custom_textures; + std::unordered_map custom_texture_map; + std::vector> custom_textures; std::vector temp_buffer; - Texture dummy_texture{}; + CustomTexture dummy_texture{}; bool textures_loaded{}; bool compatibility_mode{true}; }; diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h index 7e88f245e..d4feffe0e 100644 --- a/src/video_core/rasterizer_cache/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache/rasterizer_cache.h @@ -966,7 +966,7 @@ bool RasterizerCache::UploadCustomSurface(Surface& surface, const SurfacePara const u32 level = surface.LevelOf(load_info.addr); const bool is_base_level = level == 0; const u64 hash = custom_tex_manager.ComputeHash(load_info, upload_data); - const Texture& texture = custom_tex_manager.GetTexture(hash); + CustomTexture& texture = custom_tex_manager.GetTexture(hash); // The old texture pack system did not support mipmaps so older packs might do // wonky things. For example many packs have mipmaps larger than the base @@ -995,7 +995,7 @@ bool RasterizerCache::UploadCustomSurface(Surface& surface, const SurfacePara // Copy and decode the custom texture to the staging buffer const u32 custom_size = static_cast(texture.staging_size); - const StagingData staging = runtime.FindStaging(custom_size, true); + StagingData staging = runtime.FindStaging(custom_size, true); custom_tex_manager.DecodeToStaging(texture, staging); // Upload surface diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h index b1f7dc043..b7e63691d 100644 --- a/src/video_core/rasterizer_cache/utils.h +++ b/src/video_core/rasterizer_cache/utils.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include "common/hash.h" @@ -84,6 +85,14 @@ struct StagingData { u32 size = 0; std::span mapped{}; u64 buffer_offset = 0; + const std::atomic_flag* flag{}; + + void Wait() const noexcept { + if (!flag) { + return; + } + flag->wait(false); + } }; struct TextureCubeConfig { diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.cpp b/src/video_core/renderer_opengl/gl_texture_runtime.cpp index 288a78df5..3466f802c 100644 --- a/src/video_core/renderer_opengl/gl_texture_runtime.cpp +++ b/src/video_core/renderer_opengl/gl_texture_runtime.cpp @@ -378,6 +378,10 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, Handle()); + // Wait for the buffer if a decode is pending, this isn't very optimal + // but this kind of threading is very hard in gl + staging.Wait(); + const auto& tuple = alloc.tuple; if (is_custom && custom_format != VideoCore::CustomPixelFormat::RGBA8) { glCompressedTexSubImage2D(GL_TEXTURE_2D, upload.texture_level, rect.left, rect.bottom, diff --git a/src/video_core/renderer_vulkan/vk_common.h b/src/video_core/renderer_vulkan/vk_common.h index d5b5d6412..ca5300ad3 100644 --- a/src/video_core/renderer_vulkan/vk_common.h +++ b/src/video_core/renderer_vulkan/vk_common.h @@ -9,7 +9,7 @@ // Include vulkan-hpp header #define VK_ENABLE_BETA_EXTENSIONS -#define VK_NO_PROTOTYPES 1 +#define VK_NO_PROTOTYPES #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 #define VULKAN_HPP_NO_CONSTRUCTORS #define VULKAN_HPP_NO_STRUCT_SETTERS diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 58e5356fe..89475488a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -907,6 +907,11 @@ void Surface::Upload(const VideoCore::BufferTextureCopy& upload, const StagingDa cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, params.pipeline_flags, vk::DependencyFlagBits::eByRegion, {}, {}, write_barrier); + + // Wait for a decode to finish if one is pending. Normally this isn't + // needed until we actually submit the command buffer but it's safer to do it now + // to prevent the stream buffer from reclaiming our space before we are done with it. + staging.Wait(); }); runtime->upload_buffer.Commit(staging.size);