DMA & InlineToMemory Engines Rework.
This commit is contained in:
		| @@ -24,4 +24,12 @@ template <class ForwardIt, class T, class Compare = std::less<>> | ||||
|     return first != last && !comp(value, *first) ? first : last; | ||||
| } | ||||
|  | ||||
/// Folds a pack of values into a single accumulator.
///
/// Starting from a copy of `initial_value`, every argument is combined in the order it was
/// passed: `value = func(value, arg)`. Despite the name, the traversal therefore runs from the
/// first argument to the last.
///
/// @param initial_value Seed of the accumulator; returned unchanged when `args` is empty.
/// @param func          Callable invoked as `func(accumulator, argument)`; its result is assigned
///                      back to the accumulator.
/// @param args          Values to combine, applied left to right.
/// @return The accumulator after every argument has been applied.
template <typename T, typename Func, typename... Args>
T FoldRight(T initial_value, Func&& func, Args&&... args) {
    T value{initial_value};
    // A generic lambda is used instead of an explicit template parameter list: naming that
    // parameter `T` would redeclare the enclosing template parameter, which is ill-formed.
    const auto high_func = [&value, &func](auto x) { value = func(value, x); };
    (std::invoke(high_func, std::forward<Args>(args)), ...);
    return value;
}
|  | ||||
| } // namespace Common | ||||
|   | ||||
| @@ -126,7 +126,7 @@ public: | ||||
|  | ||||
|     void DownloadMemory(VAddr cpu_addr, u64 size); | ||||
|  | ||||
|     bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer); | ||||
|     bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); | ||||
|  | ||||
|     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); | ||||
|  | ||||
| @@ -1685,7 +1685,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, | ||||
|  | ||||
| template <class P> | ||||
| bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, | ||||
|                                   std::span<u8> inlined_buffer) { | ||||
|                                   std::span<const u8> inlined_buffer) { | ||||
|     const bool is_dirty = IsRegionRegistered(dest_address, copy_size); | ||||
|     if (!is_dirty) { | ||||
|         return false; | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
|  | ||||
| #include <cstring> | ||||
|  | ||||
| #include "common/algorithm.h" | ||||
| #include "common/assert.h" | ||||
| #include "video_core/engines/engine_upload.h" | ||||
| #include "video_core/memory_manager.h" | ||||
| @@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) { | ||||
|     if (!is_last_call) { | ||||
|         return; | ||||
|     } | ||||
|     ProcessData(inner_buffer); | ||||
| } | ||||
|  | ||||
| void State::ProcessData(const u32* data, size_t num_data) { | ||||
|     std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32)); | ||||
|     ProcessData(read_buffer); | ||||
| } | ||||
|  | ||||
| void State::ProcessData(std::span<const u8> read_buffer) { | ||||
|     const GPUVAddr address{regs.dest.Address()}; | ||||
|     if (is_linear) { | ||||
|         rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer); | ||||
|         if (regs.line_count == 1) { | ||||
|             rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer); | ||||
|         } else { | ||||
|             for (u32 line = 0; line < regs.line_count; ++line) { | ||||
|                 const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch; | ||||
|                 memory_manager.WriteBlockUnsafe( | ||||
|                     dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in, | ||||
|                     regs.line_length_in); | ||||
|             } | ||||
|             memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count); | ||||
|         } | ||||
|     } else { | ||||
|         UNIMPLEMENTED_IF(regs.dest.z != 0); | ||||
|         UNIMPLEMENTED_IF(regs.dest.depth != 1); | ||||
|         UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0); | ||||
|         UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0); | ||||
|         u32 width = regs.dest.width; | ||||
|         u32 x_elements = regs.line_length_in; | ||||
|         u32 x_offset = regs.dest.x; | ||||
|         const u32 bpp_shift = Common::FoldRight( | ||||
|             4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, | ||||
|             width, x_elements, x_offset, static_cast<u32>(address)); | ||||
|         width >>= bpp_shift; | ||||
|         x_elements >>= bpp_shift; | ||||
|         x_offset >>= bpp_shift; | ||||
|         const u32 bytes_per_pixel = 1U << bpp_shift; | ||||
|         const std::size_t dst_size = Tegra::Texture::CalculateSize( | ||||
|             true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0); | ||||
|             true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth, | ||||
|             regs.dest.BlockHeight(), regs.dest.BlockDepth()); | ||||
|         tmp_buffer.resize(dst_size); | ||||
|         memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); | ||||
|         Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, | ||||
|                                       regs.dest.BlockHeight(), copy_size, inner_buffer.data(), | ||||
|                                       tmp_buffer.data()); | ||||
|         Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width, | ||||
|                                        regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, | ||||
|                                        x_elements, regs.line_count, regs.dest.BlockHeight(), | ||||
|                                        regs.dest.BlockDepth(), regs.line_length_in); | ||||
|         memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| #include <span> | ||||
| #include <vector> | ||||
| #include "common/bit_field.h" | ||||
| #include "common/common_types.h" | ||||
| @@ -33,7 +34,7 @@ struct Registers { | ||||
|         u32 width; | ||||
|         u32 height; | ||||
|         u32 depth; | ||||
|         u32 z; | ||||
|         u32 layer; | ||||
|         u32 x; | ||||
|         u32 y; | ||||
|  | ||||
| @@ -62,11 +63,14 @@ public: | ||||
|  | ||||
|     void ProcessExec(bool is_linear_); | ||||
|     void ProcessData(u32 data, bool is_last_call); | ||||
|     void ProcessData(const u32* data, size_t num_data); | ||||
|  | ||||
|     /// Binds a rasterizer to this engine. | ||||
|     void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); | ||||
|  | ||||
| private: | ||||
|     void ProcessData(std::span<const u8> read_buffer); | ||||
|  | ||||
|     u32 write_offset = 0; | ||||
|     u32 copy_size = 0; | ||||
|     std::vector<u8> inner_buffer; | ||||
|   | ||||
| @@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal | ||||
|     } | ||||
|     case KEPLER_COMPUTE_REG_INDEX(data_upload): { | ||||
|         upload_state.ProcessData(method_argument, is_last_call); | ||||
|         if (is_last_call) { | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
|     case KEPLER_COMPUTE_REG_INDEX(launch): | ||||
| @@ -50,8 +48,15 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal | ||||
|  | ||||
| void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount, | ||||
|                                     u32 methods_pending) { | ||||
|     for (std::size_t i = 0; i < amount; i++) { | ||||
|         CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); | ||||
|     switch (method) { | ||||
|     case KEPLER_COMPUTE_REG_INDEX(data_upload): | ||||
|         upload_state.ProcessData(base_start, static_cast<size_t>(amount)); | ||||
|         return; | ||||
|     default: | ||||
|         for (std::size_t i = 0; i < amount; i++) { | ||||
|             CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call | ||||
|     } | ||||
|     case KEPLERMEMORY_REG_INDEX(data): { | ||||
|         upload_state.ProcessData(method_argument, is_last_call); | ||||
|         if (is_last_call) { | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
|     } | ||||
| @@ -42,8 +40,15 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call | ||||
|  | ||||
| void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount, | ||||
|                                    u32 methods_pending) { | ||||
|     for (std::size_t i = 0; i < amount; i++) { | ||||
|         CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); | ||||
|     switch (method) { | ||||
|     case KEPLERMEMORY_REG_INDEX(data): | ||||
|         upload_state.ProcessData(base_start, static_cast<size_t>(amount)); | ||||
|         return; | ||||
|     default: | ||||
|         for (std::size_t i = 0; i < amount; i++) { | ||||
|             CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); | ||||
|         } | ||||
|         break; | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -239,8 +239,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume | ||||
|         return upload_state.ProcessExec(regs.exec_upload.linear != 0); | ||||
|     case MAXWELL3D_REG_INDEX(data_upload): | ||||
|         upload_state.ProcessData(argument, is_last_call); | ||||
|         if (is_last_call) { | ||||
|         } | ||||
|         return; | ||||
|     case MAXWELL3D_REG_INDEX(fragment_barrier): | ||||
|         return rasterizer->FragmentBarrier(); | ||||
| @@ -316,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, | ||||
|     case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: | ||||
|         ProcessCBMultiData(base_start, amount); | ||||
|         break; | ||||
|     case MAXWELL3D_REG_INDEX(data_upload): | ||||
|         upload_state.ProcessData(base_start, static_cast<size_t>(amount)); | ||||
|         return; | ||||
|     default: | ||||
|         for (std::size_t i = 0; i < amount; i++) { | ||||
|             CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project | ||||
| // SPDX-License-Identifier: GPL-2.0-or-later | ||||
|  | ||||
| #include "common/algorithm.h" | ||||
| #include "common/assert.h" | ||||
| #include "common/logging/log.h" | ||||
| #include "common/microprofile.h" | ||||
| @@ -54,8 +55,6 @@ void MaxwellDMA::Launch() { | ||||
|     const LaunchDMA& launch = regs.launch_dma; | ||||
|     ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); | ||||
|     ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); | ||||
|     ASSERT(regs.dst_params.origin.x == 0); | ||||
|     ASSERT(regs.dst_params.origin.y == 0); | ||||
|  | ||||
|     const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; | ||||
|     const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; | ||||
| @@ -121,12 +120,13 @@ void MaxwellDMA::CopyPitchToPitch() { | ||||
|  | ||||
| void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); | ||||
|     UNIMPLEMENTED_IF(regs.src_params.layer != 0); | ||||
|  | ||||
|     const bool is_remapping = regs.launch_dma.remap_enable != 0; | ||||
|  | ||||
|     // Optimized path for micro copies. | ||||
|     const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; | ||||
|     if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && | ||||
|     if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && | ||||
|         regs.src_params.height > GOB_SIZE_Y) { | ||||
|         FastCopyBlockLinearToPitch(); | ||||
|         return; | ||||
| @@ -134,10 +134,27 @@ void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|  | ||||
|     // Deswizzle the input and copy it over. | ||||
|     UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); | ||||
|     const u32 bytes_per_pixel = | ||||
|         regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1; | ||||
|     const Parameters& src_params = regs.src_params; | ||||
|     const u32 width = src_params.width; | ||||
|  | ||||
|     const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; | ||||
|     const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; | ||||
|  | ||||
|     const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size; | ||||
|  | ||||
|     u32 width = src_params.width; | ||||
|     u32 x_elements = regs.line_length_in; | ||||
|     u32 x_offset = src_params.origin.x; | ||||
|     u32 bpp_shift = 0U; | ||||
|     if (!is_remapping) { | ||||
|         bpp_shift = Common::FoldRight( | ||||
|             4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, | ||||
|             width, x_elements, x_offset, static_cast<u32>(regs.offset_in)); | ||||
|         width >>= bpp_shift; | ||||
|         x_elements >>= bpp_shift; | ||||
|         x_offset >>= bpp_shift; | ||||
|     } | ||||
|  | ||||
|     const u32 bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     const u32 height = src_params.height; | ||||
|     const u32 depth = src_params.depth; | ||||
|     const u32 block_height = src_params.block_size.height; | ||||
| @@ -155,30 +172,46 @@ void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); | ||||
|     memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
|  | ||||
|     UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, | ||||
|                      block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(), | ||||
|                      read_buffer.data()); | ||||
|     UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, | ||||
|                      src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, | ||||
|                      regs.pitch_out); | ||||
|  | ||||
|     memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
| } | ||||
|  | ||||
| void MaxwellDMA::CopyPitchToBlockLinear() { | ||||
|     UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); | ||||
|     UNIMPLEMENTED_IF(regs.dst_params.layer != 0); | ||||
|     UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); | ||||
|  | ||||
|     const bool is_remapping = regs.launch_dma.remap_enable != 0; | ||||
|     const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; | ||||
|     const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; | ||||
|  | ||||
|     const auto& dst_params = regs.dst_params; | ||||
|     const u32 bytes_per_pixel = | ||||
|         regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1; | ||||
|     const u32 width = dst_params.width; | ||||
|  | ||||
|     const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size; | ||||
|  | ||||
|     u32 width = dst_params.width; | ||||
|     u32 x_elements = regs.line_length_in; | ||||
|     u32 x_offset = dst_params.origin.x; | ||||
|     u32 bpp_shift = 0U; | ||||
|     if (!is_remapping) { | ||||
|         bpp_shift = Common::FoldRight( | ||||
|             4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, | ||||
|             width, x_elements, x_offset, static_cast<u32>(regs.offset_out)); | ||||
|         width >>= bpp_shift; | ||||
|         x_elements >>= bpp_shift; | ||||
|         x_offset >>= bpp_shift; | ||||
|     } | ||||
|  | ||||
|     const u32 bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     const u32 height = dst_params.height; | ||||
|     const u32 depth = dst_params.depth; | ||||
|     const u32 block_height = dst_params.block_size.height; | ||||
|     const u32 block_depth = dst_params.block_size.depth; | ||||
|     const size_t dst_size = | ||||
|         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); | ||||
|     const size_t dst_layer_size = | ||||
|         CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); | ||||
|  | ||||
|     const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; | ||||
|  | ||||
|     if (read_buffer.size() < src_size) { | ||||
| @@ -188,32 +221,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() { | ||||
|         write_buffer.resize(dst_size); | ||||
|     } | ||||
|  | ||||
|     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); | ||||
|     if (Settings::IsGPULevelExtreme()) { | ||||
|         memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); | ||||
|         memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     } else { | ||||
|         memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); | ||||
|         memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     } | ||||
|  | ||||
|     // If the input is linear and the output is tiled, swizzle the input and copy it over. | ||||
|     if (regs.dst_params.block_size.depth > 0) { | ||||
|         ASSERT(dst_params.layer == 0); | ||||
|         SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, | ||||
|                             bytes_per_pixel, block_height, block_depth, dst_params.origin.x, | ||||
|                             dst_params.origin.y, write_buffer.data(), read_buffer.data()); | ||||
|     } else { | ||||
|         SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel, | ||||
|                        write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(), | ||||
|                        block_height, dst_params.origin.x, dst_params.origin.y); | ||||
|     } | ||||
|     SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, | ||||
|                    dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, | ||||
|                    regs.pitch_in); | ||||
|  | ||||
|     memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
| } | ||||
|  | ||||
| void MaxwellDMA::FastCopyBlockLinearToPitch() { | ||||
|     const u32 bytes_per_pixel = | ||||
|         regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1; | ||||
|     const u32 bytes_per_pixel = 1U; | ||||
|     const size_t src_size = GOB_SIZE; | ||||
|     const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; | ||||
|     u32 pos_x = regs.src_params.origin.x; | ||||
| @@ -239,9 +263,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { | ||||
|         memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     } | ||||
|  | ||||
|     UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, | ||||
|                      bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y, | ||||
|                      write_buffer.data(), read_buffer.data()); | ||||
|     UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width, | ||||
|                      regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count, | ||||
|                      regs.src_params.block_size.height, regs.src_params.block_size.depth, | ||||
|                      regs.pitch_out); | ||||
|  | ||||
|     memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
| } | ||||
|   | ||||
| @@ -189,10 +189,16 @@ public: | ||||
|             BitField<4, 3, Swizzle> dst_y; | ||||
|             BitField<8, 3, Swizzle> dst_z; | ||||
|             BitField<12, 3, Swizzle> dst_w; | ||||
|             BitField<0, 12, u32> dst_components_raw; | ||||
|             BitField<16, 2, u32> component_size_minus_one; | ||||
|             BitField<20, 2, u32> num_src_components_minus_one; | ||||
|             BitField<24, 2, u32> num_dst_components_minus_one; | ||||
|         }; | ||||
|  | ||||
|         Swizzle GetComponent(size_t i) { | ||||
|             const u32 raw = dst_components_raw; | ||||
|             return static_cast<Swizzle>((raw >> (i * 3)) & 0x7); | ||||
|         } | ||||
|     }; | ||||
|     static_assert(sizeof(RemapConst) == 12); | ||||
|  | ||||
|   | ||||
| @@ -156,8 +156,9 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { | ||||
|         const u32 block_height = static_cast<u32>(config.block_linear_height_log2); | ||||
|         const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); | ||||
|         luma_buffer.resize(size); | ||||
|         Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), | ||||
|                                 converted_frame_buf_addr, block_height, 0, 0); | ||||
|         std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height); | ||||
|         Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, | ||||
|                                 0, 0, width, height, block_height, 0, width * 4); | ||||
|  | ||||
|         host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); | ||||
|     } else { | ||||
|   | ||||
| @@ -462,6 +462,97 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const { | ||||
|     MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages); | ||||
| } | ||||
|  | ||||
| bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const { | ||||
|     bool result = false; | ||||
|     auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, | ||||
|                           [[maybe_unused]] std::size_t offset, | ||||
|                           [[maybe_unused]] std::size_t copy_amount) { return false; }; | ||||
|  | ||||
|     auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset; | ||||
|         result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount); | ||||
|         return result; | ||||
|     }; | ||||
|     auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset; | ||||
|         result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount); | ||||
|         return result; | ||||
|     }; | ||||
|     auto check_short_pages = [&](std::size_t page_index, std::size_t offset, | ||||
|                                  std::size_t copy_amount) { | ||||
|         GPUVAddr base = (page_index << big_page_bits) + offset; | ||||
|         MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing); | ||||
|         return result; | ||||
|     }; | ||||
|     MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, check_short_pages); | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const { | ||||
|     std::optional<VAddr> old_page_addr{}; | ||||
|     size_t range_so_far = 0; | ||||
|     bool result{false}; | ||||
|     auto fail = [&]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset, | ||||
|                     std::size_t copy_amount) { | ||||
|         result = true; | ||||
|         return true; | ||||
|     }; | ||||
|     auto short_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset; | ||||
|         if (old_page_addr && *old_page_addr != cpu_addr_base) { | ||||
|             result = true; | ||||
|             return true; | ||||
|         } | ||||
|         range_so_far += copy_amount; | ||||
|         old_page_addr = {cpu_addr_base + copy_amount}; | ||||
|         return false; | ||||
|     }; | ||||
|     auto big_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset; | ||||
|         if (old_page_addr && *old_page_addr != cpu_addr_base) { | ||||
|             return true; | ||||
|         } | ||||
|         range_so_far += copy_amount; | ||||
|         old_page_addr = {cpu_addr_base + copy_amount}; | ||||
|         return false; | ||||
|     }; | ||||
|     auto check_short_pages = [&](std::size_t page_index, std::size_t offset, | ||||
|                                  std::size_t copy_amount) { | ||||
|         GPUVAddr base = (page_index << big_page_bits) + offset; | ||||
|         MemoryOperation<false>(base, copy_amount, short_check, fail, fail); | ||||
|         return result; | ||||
|     }; | ||||
|     MemoryOperation<true>(gpu_addr, size, big_check, fail, check_short_pages); | ||||
|     return range_so_far; | ||||
| } | ||||
|  | ||||
| void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const { | ||||
|     auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, | ||||
|                           [[maybe_unused]] std::size_t offset, | ||||
|                           [[maybe_unused]] std::size_t copy_amount) {}; | ||||
|  | ||||
|     auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset; | ||||
|         rasterizer->InvalidateRegion(cpu_addr_base, copy_amount); | ||||
|     }; | ||||
|     auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { | ||||
|         const VAddr cpu_addr_base = | ||||
|             (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset; | ||||
|         rasterizer->InvalidateRegion(cpu_addr_base, copy_amount); | ||||
|     }; | ||||
|     auto invalidate_short_pages = [&](std::size_t page_index, std::size_t offset, | ||||
|                                       std::size_t copy_amount) { | ||||
|         GPUVAddr base = (page_index << big_page_bits) + offset; | ||||
|         MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing); | ||||
|     }; | ||||
|     MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, invalidate_short_pages); | ||||
| } | ||||
|  | ||||
| void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) { | ||||
|     std::vector<u8> tmp_buffer(size); | ||||
|     ReadBlock(gpu_src_addr, tmp_buffer.data(), size); | ||||
|   | ||||
| @@ -104,6 +104,12 @@ public: | ||||
|  | ||||
|     void FlushRegion(GPUVAddr gpu_addr, size_t size) const; | ||||
|  | ||||
|     void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const; | ||||
|  | ||||
|     bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const; | ||||
|  | ||||
|     size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const; | ||||
|  | ||||
| private: | ||||
|     template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped> | ||||
|     inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, | ||||
|   | ||||
| @@ -129,7 +129,7 @@ public: | ||||
|     [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0; | ||||
|  | ||||
|     virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, | ||||
|                                           std::span<u8> memory) = 0; | ||||
|                                           std::span<const u8> memory) = 0; | ||||
|  | ||||
|     /// Attempt to use a faster method to display the framebuffer to screen | ||||
|     [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, | ||||
|   | ||||
| @@ -476,7 +476,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() | ||||
| } | ||||
|  | ||||
| void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, | ||||
|                                                 std::span<u8> memory) { | ||||
|                                                 std::span<const u8> memory) { | ||||
|     auto cpu_addr = gpu_memory->GpuToCpuAddress(address); | ||||
|     if (!cpu_addr) [[unlikely]] { | ||||
|         gpu_memory->WriteBlock(address, memory.data(), copy_size); | ||||
|   | ||||
| @@ -99,7 +99,7 @@ public: | ||||
|                                const Tegra::Engines::Fermi2D::Config& copy_config) override; | ||||
|     Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; | ||||
|     void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, | ||||
|                                   std::span<u8> memory) override; | ||||
|                                   std::span<const u8> memory) override; | ||||
|     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, | ||||
|                            u32 pixel_stride) override; | ||||
|     void LoadDiskResources(u64 title_id, std::stop_token stop_loading, | ||||
|   | ||||
| @@ -26,8 +26,6 @@ | ||||
|  | ||||
| namespace Vulkan { | ||||
|  | ||||
| using Tegra::Texture::SWIZZLE_TABLE; | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; | ||||
|   | ||||
| @@ -548,7 +548,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() | ||||
| } | ||||
|  | ||||
| void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, | ||||
|                                                 std::span<u8> memory) { | ||||
|                                                 std::span<const u8> memory) { | ||||
|     auto cpu_addr = gpu_memory->GpuToCpuAddress(address); | ||||
|     if (!cpu_addr) [[unlikely]] { | ||||
|         gpu_memory->WriteBlock(address, memory.data(), copy_size); | ||||
|   | ||||
| @@ -95,7 +95,7 @@ public: | ||||
|                                const Tegra::Engines::Fermi2D::Config& copy_config) override; | ||||
|     Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; | ||||
|     void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, | ||||
|                                   std::span<u8> memory) override; | ||||
|                                   std::span<const u8> memory) override; | ||||
|     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, | ||||
|                            u32 pixel_stride) override; | ||||
|     void LoadDiskResources(u64 title_id, std::stop_token stop_loading, | ||||
|   | ||||
| @@ -517,7 +517,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr | ||||
|     const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block; | ||||
|  | ||||
|     UNIMPLEMENTED_IF(info.tile_width_spacing > 0); | ||||
|  | ||||
|     UNIMPLEMENTED_IF(copy.image_offset.x != 0); | ||||
|     UNIMPLEMENTED_IF(copy.image_offset.y != 0); | ||||
|     UNIMPLEMENTED_IF(copy.image_offset.z != 0); | ||||
|   | ||||
| @@ -89,6 +89,69 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 | ||||
|     } | ||||
| } | ||||
|  | ||||
// Copies a rectangular region between a pitch-linear buffer and a block-linear (GOB swizzled)
// buffer, BYTES_PER_PIXEL bytes at a time.
//
// Direction (note the template flag reads opposite to what its name suggests):
//   TO_LINEAR == true : 'output' is indexed by the swizzled offset, 'input' by the linear offset.
//   TO_LINEAR == false: 'output' is indexed by the linear offset, 'input' by the swizzled offset.
//
// width        - surface width in pixels; used to derive the swizzled row stride
// height       - surface height in lines; used for the slice size and the linear slice stride
// depth        - number of slices iterated
// origin_x     - first column, in pixels, on the swizzled side
// origin_y     - first line on the swizzled side
// extent_x     - number of pixels copied per line
// num_lines    - total number of lines to copy, consumed across slices
// block_height - used as a shift amount (log2) when computing block sizes
// block_depth  - used as a shift amount (log2) when computing block sizes
// pitch_linear - bytes per line of the linear buffer
//
// The origin is applied only to the swizzled coordinates; the linear offset always starts at
// line 0 / column 0 of each slice.
| template <bool TO_LINEAR, u32 BYTES_PER_PIXEL> | ||||
| void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, | ||||
|                         u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 num_lines, | ||||
|                         u32 block_height, u32 block_depth, u32 pitch_linear) { | ||||
|     // The origin of the transformation can be configured here, leave it as zero as the current API | ||||
|     // doesn't expose it. | ||||
|     static constexpr u32 origin_z = 0; | ||||
|  | ||||
|     // Pitch (bytes per line) of the linear buffer, taken directly from 'pitch_linear'. | ||||
|     // The row stride of the swizzled surface is derived separately from 'width' below. | ||||
|     const u32 pitch = pitch_linear; | ||||
|     const u32 stride = Common::AlignUpLog2(width * BYTES_PER_PIXEL, GOB_SIZE_X_SHIFT); | ||||
|  | ||||
      // Size in bytes of one row of blocks and of one group of 2^block_depth slices.
|     const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); | ||||
|     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); | ||||
|     const u32 slice_size = | ||||
|         Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size; | ||||
|  | ||||
|     const u32 block_height_mask = (1U << block_height) - 1; | ||||
|     const u32 block_depth_mask = (1U << block_depth) - 1; | ||||
|     const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth; | ||||
|  | ||||
      // 'num_lines' is a budget shared by all slices; each slice takes at most 'extent_y' lines.
      // NOTE(review): 'height - origin_y' is unsigned and wraps if origin_y > height - confirm
      // callers guarantee origin_y <= height.
|     u32 unprocessed_lines = num_lines; | ||||
|     u32 extent_y = std::min(num_lines, height - origin_y); | ||||
|  | ||||
|     for (u32 slice = 0; slice < depth; ++slice) { | ||||
|         const u32 z = slice + origin_z; | ||||
|         const u32 offset_z = (z >> block_depth) * slice_size + | ||||
|                              ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height)); | ||||
|         const u32 lines_in_y = std::min(unprocessed_lines, extent_y); | ||||
|         for (u32 line = 0; line < lines_in_y; ++line) { | ||||
|             const u32 y = line + origin_y; | ||||
|             const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y); | ||||
|  | ||||
|             const u32 block_y = y >> GOB_SIZE_Y_SHIFT; | ||||
|             const u32 offset_y = (block_y >> block_height) * block_size + | ||||
|                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT); | ||||
|  | ||||
              // swizzled_x is advanced by BYTES_PER_PIXEL in the loop increment instead of being
              // recomputed with pdep for every column.
|             u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL); | ||||
|             for (u32 column = 0; column < extent_x; | ||||
|                  ++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) { | ||||
|                 const u32 x = (column + origin_x) * BYTES_PER_PIXEL; | ||||
|                 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; | ||||
|  | ||||
|                 const u32 base_swizzled_offset = offset_z + offset_y + offset_x; | ||||
|                 const u32 swizzled_offset = base_swizzled_offset + (swizzled_x | swizzled_y); | ||||
|  | ||||
                  // Linear side: slices are 'pitch * height' bytes apart and ignore the origin.
|                 const u32 unswizzled_offset = | ||||
|                     slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL; | ||||
|  | ||||
|                 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; | ||||
|                 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; | ||||
|  | ||||
|                 std::memcpy(dst, src, BYTES_PER_PIXEL); | ||||
|             } | ||||
|         } | ||||
          // Stop as soon as the requested number of lines has been copied.
|         unprocessed_lines -= lines_in_y; | ||||
|         if (unprocessed_lines == 0) { | ||||
|             return; | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| template <bool TO_LINEAR> | ||||
| void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||
|              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||
| @@ -111,97 +174,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||
|     } | ||||
| } | ||||
|  | ||||
// Copies a subrectangle from a pitch-linear buffer into a swizzled buffer.
//
// subrect_width    - pixels copied per line
// subrect_height   - number of lines copied
// source_pitch     - bytes per line of 'unswizzled_data'
// swizzled_width   - width in pixels of the swizzled surface (determines its width in GOBs)
// swizzled_data    - destination, written at swizzled offsets
// unswizzled_data  - source, read at 'line * source_pitch + x * BYTES_PER_PIXEL'
// block_height_bit - log2 of the block height (the height is computed as 1 << block_height_bit)
// offset_x         - destination column offset, in pixels
// offset_y         - destination line offset
//
// No bounds checking is performed on either pointer.
| template <u32 BYTES_PER_PIXEL> | ||||
| void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, | ||||
|                     u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit, | ||||
|                     u32 offset_x, u32 offset_y) { | ||||
|     const u32 block_height = 1U << block_height_bit; | ||||
      // Width of the swizzled surface in GOBs, rounded up.
|     const u32 image_width_in_gobs = | ||||
|         (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X; | ||||
|     for (u32 line = 0; line < subrect_height; ++line) { | ||||
|         const u32 dst_y = line + offset_y; | ||||
          // First term selects the row of blocks, second term the GOB row inside that block.
|         const u32 gob_address_y = | ||||
|             (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + | ||||
|             ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; | ||||
|  | ||||
|         const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(dst_y); | ||||
          // swizzled_x is advanced by BYTES_PER_PIXEL in the loop increment below.
|         u32 swizzled_x = pdep<SWIZZLE_X_BITS>(offset_x * BYTES_PER_PIXEL); | ||||
|         for (u32 x = 0; x < subrect_width; | ||||
|              ++x, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) { | ||||
|             const u32 dst_x = x + offset_x; | ||||
|             const u32 gob_address = | ||||
|                 gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height; | ||||
|             const u32 swizzled_offset = gob_address + (swizzled_x | swizzled_y); | ||||
              // The source offset ignores offset_x / offset_y: the subrect starts at the source origin.
|             const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL; | ||||
|  | ||||
|             const u8* const source_line = unswizzled_data + unswizzled_offset; | ||||
|             u8* const dest_addr = swizzled_data + swizzled_offset; | ||||
|             std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
// Copies a subrectangle from a swizzled buffer into a pitch-linear buffer.
//
// line_length_in - pixels copied per line
// line_count     - number of lines copied
// pitch          - bytes per line of 'output'
// width          - width in pixels of the swizzled surface (determines its width in GOBs)
// block_height   - used as a shift amount (log2) when computing block sizes
// origin_x       - source column offset, in pixels
// origin_y       - source line offset
// output         - destination, written at 'line * pitch + column * BYTES_PER_PIXEL'
// input          - source, read at swizzled offsets
//
// No bounds checking is performed on either pointer.
| template <u32 BYTES_PER_PIXEL> | ||||
| void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height, | ||||
|                       u32 origin_x, u32 origin_y, u8* output, const u8* input) { | ||||
|     const u32 stride = width * BYTES_PER_PIXEL; | ||||
|     const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; | ||||
      // Size in bytes of one full row of blocks.
|     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); | ||||
|  | ||||
|     const u32 block_height_mask = (1U << block_height) - 1; | ||||
|     const u32 x_shift = GOB_SIZE_SHIFT + block_height; | ||||
|  | ||||
|     for (u32 line = 0; line < line_count; ++line) { | ||||
|         const u32 src_y = line + origin_y; | ||||
|         const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(src_y); | ||||
|  | ||||
          // First term selects the row of blocks, second term the GOB row inside that block.
|         const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT; | ||||
|         const u32 src_offset_y = (block_y >> block_height) * block_size + | ||||
|                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT); | ||||
|  | ||||
          // swizzled_x is advanced by BYTES_PER_PIXEL in the loop increment below.
|         u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL); | ||||
|         for (u32 column = 0; column < line_length_in; | ||||
|              ++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) { | ||||
|             const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL; | ||||
|             const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift; | ||||
|  | ||||
|             const u32 swizzled_offset = src_offset_y + src_offset_x + (swizzled_x | swizzled_y); | ||||
              // The destination offset ignores the origin: output starts at line 0 / column 0.
|             const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL; | ||||
|  | ||||
|             std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
// Copies a pitch-linear 2D array ('input') into a swizzled surface ('output') whose block size
// accounts for both block_height and block_depth.
//
// line_length_in - elements copied per line
// line_count     - number of lines copied
// pitch          - bytes per line of 'input'
// width          - width in pixels of the swizzled surface (determines its width in GOBs)
// height         - not read by this implementation
// block_height   - used as a shift amount (log2)
// block_depth    - used as a shift amount (log2)
// origin_x/y     - only 0 is supported; any other value hits UNIMPLEMENTED_IF
//
// NOTE(review): the destination x position advances by one byte per element
// (incrpdep<SWIZZLE_X_BITS, 1> and 'x / GOB_SIZE_X') while BYTES_PER_PIXEL bytes are copied
// each time - confirm this is intended for BYTES_PER_PIXEL > 1.
| template <u32 BYTES_PER_PIXEL> | ||||
| void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, | ||||
|                          u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output, | ||||
|                          const u8* input) { | ||||
|     UNIMPLEMENTED_IF(origin_x > 0); | ||||
|     UNIMPLEMENTED_IF(origin_y > 0); | ||||
|  | ||||
|     const u32 stride = width * BYTES_PER_PIXEL; | ||||
|     const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; | ||||
      // Size in bytes of one row of blocks, including the depth dimension of each block.
|     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); | ||||
|  | ||||
|     const u32 block_height_mask = (1U << block_height) - 1; | ||||
|     const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth; | ||||
|  | ||||
|     for (u32 line = 0; line < line_count; ++line) { | ||||
|         const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(line); | ||||
|         const u32 block_y = line / GOB_SIZE_Y; | ||||
|         const u32 dst_offset_y = | ||||
|             (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; | ||||
|  | ||||
|         u32 swizzled_x = 0; | ||||
|         for (u32 x = 0; x < line_length_in; ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) { | ||||
|             const u32 dst_offset = | ||||
|                 ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + (swizzled_x | swizzled_y); | ||||
|             const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch; | ||||
|             std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| } // Anonymous namespace | ||||
|  | ||||
| void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||
| @@ -218,15 +190,15 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p | ||||
|                   stride_alignment); | ||||
| } | ||||
|  | ||||
| void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, | ||||
|                     u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, | ||||
|                     u32 block_height_bit, u32 offset_x, u32 offset_y) { | ||||
| void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||
|                     u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y, | ||||
|                     u32 block_height, u32 block_depth, u32 pitch_linear) { | ||||
|     switch (bytes_per_pixel) { | ||||
| #define BPP_CASE(x)                                                                                \ | ||||
|     case x:                                                                                        \ | ||||
|         return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width,      \ | ||||
|                                  swizzled_data, unswizzled_data, block_height_bit, offset_x,       \ | ||||
|                                  offset_y); | ||||
|         return SwizzleSubrectImpl<true, x>(output, input, width, height, depth, origin_x,          \ | ||||
|                                            origin_y, extent_x, extent_y, block_height,             \ | ||||
|                                            block_depth, pitch_linear); | ||||
|         BPP_CASE(1) | ||||
|         BPP_CASE(2) | ||||
|         BPP_CASE(3) | ||||
| @@ -241,13 +213,15 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 | ||||
|     } | ||||
| } | ||||
|  | ||||
| void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, | ||||
|                       u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { | ||||
| void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||
|                       u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, | ||||
|                       u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear) { | ||||
|     switch (bytes_per_pixel) { | ||||
| #define BPP_CASE(x)                                                                                \ | ||||
|     case x:                                                                                        \ | ||||
|         return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height,         \ | ||||
|                                    origin_x, origin_y, output, input); | ||||
|         return SwizzleSubrectImpl<false, x>(output, input, width, height, depth, origin_x,         \ | ||||
|                                             origin_y, extent_x, extent_y, block_height,            \ | ||||
|                                             block_depth, pitch_linear); | ||||
|         BPP_CASE(1) | ||||
|         BPP_CASE(2) | ||||
|         BPP_CASE(3) | ||||
| @@ -262,55 +236,6 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, | ||||
|     } | ||||
| } | ||||
|  | ||||
// Runtime dispatcher for the SwizzleSliceToVoxel<BYTES_PER_PIXEL> template.
// Selects the instantiation matching 'bytes_per_pixel' (1, 2, 3, 4, 6, 8, 12 or 16) and forwards
// every other argument unchanged. Any other value triggers ASSERT_MSG and performs no copy.
| void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, | ||||
|                          u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, | ||||
|                          u32 origin_y, u8* output, const u8* input) { | ||||
|     switch (bytes_per_pixel) { | ||||
      // BPP_CASE(x) expands to a 'case x:' that returns from the matching template instantiation.
| #define BPP_CASE(x)                                                                                \ | ||||
|     case x:                                                                                        \ | ||||
|         return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height,            \ | ||||
|                                       block_height, block_depth, origin_x, origin_y, output,       \ | ||||
|                                       input); | ||||
|         BPP_CASE(1) | ||||
|         BPP_CASE(2) | ||||
|         BPP_CASE(3) | ||||
|         BPP_CASE(4) | ||||
|         BPP_CASE(6) | ||||
|         BPP_CASE(8) | ||||
|         BPP_CASE(12) | ||||
|         BPP_CASE(16) | ||||
| #undef BPP_CASE | ||||
|     default: | ||||
|         ASSERT_MSG(false, "Invalid bytes_per_pixel={}", bytes_per_pixel); | ||||
|     } | ||||
| } | ||||
|  | ||||
// Writes up to 'copy_size' bytes, read sequentially from 'source_data', into 'swizzle_data' at
// swizzled positions, one byte per step.
//
// width            - exclusive upper bound for x; also used (unscaled) to compute the width in GOBs
// height           - exclusive upper bound for y
// dst_x            - first x of every row (each row restarts at dst_x, not at 0)
// dst_y            - first y
// block_height_bit - log2 of the block height (the height is computed as 1 << block_height_bit)
// copy_size        - maximum number of bytes copied; copying also stops when y reaches 'height'
// source_data      - linear source bytes
// swizzle_data     - destination, written at swizzled offsets
//
// No bounds checking is performed on either pointer.
| void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, | ||||
|                    const u32 block_height_bit, const std::size_t copy_size, const u8* source_data, | ||||
|                    u8* swizzle_data) { | ||||
|     const u32 block_height = 1U << block_height_bit; | ||||
|     const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X}; | ||||
      // Number of source bytes consumed so far; doubles as the read index into source_data.
|     std::size_t count = 0; | ||||
|     for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { | ||||
          // First term selects the row of blocks, second term the GOB row inside that block.
|         const std::size_t gob_address_y = | ||||
|             (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + | ||||
|             ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; | ||||
|         const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(static_cast<u32>(y)); | ||||
|         u32 swizzled_x = pdep<SWIZZLE_X_BITS>(dst_x); | ||||
|         for (std::size_t x = dst_x; x < width && count < copy_size; | ||||
|              ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) { | ||||
|             const std::size_t gob_address = | ||||
|                 gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; | ||||
|             const std::size_t swizzled_offset = gob_address + (swizzled_x | swizzled_y); | ||||
|             const u8* source_line = source_data + count; | ||||
|             u8* dest_addr = swizzle_data + swizzled_offset; | ||||
|             count++; | ||||
|  | ||||
|             *dest_addr = *source_line; | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, | ||||
|                           u32 block_height, u32 block_depth) { | ||||
|     if (tiled) { | ||||
|   | ||||
| @@ -40,7 +40,6 @@ constexpr SwizzleTable MakeSwizzleTable() { | ||||
|     } | ||||
|     return table; | ||||
| } | ||||
| constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable(); | ||||
|  | ||||
| /// Unswizzles a block linear texture into linear memory. | ||||
| void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||
| @@ -57,34 +56,14 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height | ||||
|                           u32 block_height, u32 block_depth); | ||||
|  | ||||
| /// Copies an untiled subrectangle into a tiled surface. | ||||
| void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, | ||||
|                     u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, | ||||
|                     u32 block_height_bit, u32 offset_x, u32 offset_y); | ||||
/// @param output          Destination bytes, written at swizzled offsets
/// @param input           Source bytes, read with a pitch of pitch_linear bytes per line
/// @param bytes_per_pixel Number of bytes used per pixel
/// @param width           Width of the swizzled surface
/// @param height          Height of the swizzled surface
/// @param depth           Number of slices to process
/// @param origin_x        Column offset in pixels inside the swizzled surface
/// @param origin_y        Row offset inside the swizzled surface
/// @param extent_x        Number of pixels to copy per line
/// @param extent_y        Number of lines to copy
/// @param block_height    Block height shift
/// @param block_depth     Block depth shift
/// @param pitch_linear    Number of bytes per line of the linear buffer
| void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||
|                     u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y, | ||||
|                     u32 block_height, u32 block_depth, u32 pitch_linear); | ||||
|  | ||||
| /// Copies a tiled subrectangle into a linear surface. | ||||
| void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, | ||||
|                       u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input); | ||||
|  | ||||
| /// @brief Swizzles a 2D array of pixels into a 3D texture | ||||
| /// @param line_length_in  Number of pixels per line | ||||
| /// @param line_count      Number of lines | ||||
| /// @param pitch           Number of bytes per line | ||||
| /// @param width           Width of the swizzled texture | ||||
| /// @param height          Height of the swizzled texture | ||||
| /// @param bytes_per_pixel Number of bytes used per pixel | ||||
| /// @param block_height    Block height shift | ||||
| /// @param block_depth     Block depth shift | ||||
| /// @param origin_x        Column offset in pixels of the swizzled texture | ||||
| /// @param origin_y        Row offset in pixels of the swizzled texture | ||||
| /// @param output          Pointer to the pixels of the swizzled texture | ||||
| /// @param input           Pointer to the 2D array of pixels used as input | ||||
| /// @pre input and output points to an array large enough to hold the number of bytes used | ||||
| void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, | ||||
|                          u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, | ||||
|                          u32 origin_y, u8* output, const u8* input); | ||||
|  | ||||
| void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, | ||||
|                    std::size_t copy_size, const u8* source_data, u8* swizzle_data); | ||||
/// @param output          Destination bytes, written with a pitch of pitch_linear bytes per line
/// @param input           Source bytes, read at swizzled offsets
/// @param bytes_per_pixel Number of bytes used per pixel
/// @param width           Width of the swizzled surface
/// @param height          Height of the swizzled surface
/// @param depth           Number of slices to process
/// @param origin_x        Column offset in pixels inside the swizzled surface
/// @param origin_y        Row offset inside the swizzled surface
/// @param extent_x        Number of pixels to copy per line
/// @param extent_y        Number of lines to copy
/// @param block_height    Block height shift
/// @param block_depth     Block depth shift
/// @param pitch_linear    Number of bytes per line of the linear buffer
| void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||
|                       u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, | ||||
|                       u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear); | ||||
|  | ||||
| /// Obtains the offset of the gob for positions 'dst_x' & 'dst_y' | ||||
| u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user