diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 61c9bcea2..f0a5cbd0b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -21,7 +21,7 @@ namespace Vulkan { -constexpr u32 VERTEX_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u32 VERTEX_BUFFER_SIZE = 64 * 1024 * 1024; constexpr u32 INDEX_BUFFER_SIZE = 16 * 1024 * 1024; constexpr u32 UNIFORM_BUFFER_SIZE = 16 * 1024 * 1024; constexpr u32 TEXTURE_BUFFER_SIZE = 16 * 1024 * 1024; @@ -177,7 +177,7 @@ void RasterizerVulkan::SyncFixedState() { void RasterizerVulkan::SetupVertexArray(u32 vs_input_size, u32 vs_input_index_min, u32 vs_input_index_max) { - auto [array_ptr, array_offset, invalidate] = vertex_buffer.Map(vs_input_size, 4); + auto [array_ptr, array_offset, invalidate] = vertex_buffer.Map(vs_input_size); /** * The Nintendo 3DS has 12 attribute loaders which are used to tell the GPU @@ -402,7 +402,7 @@ bool RasterizerVulkan::AccelerateDrawBatchInternal(bool is_indexed) { regs.pipeline.index_array.offset); // Upload index buffer data to the GPU - auto [index_ptr, index_offset, _] = index_buffer.Map(index_buffer_size, 4); + auto [index_ptr, index_offset, _] = index_buffer.Map(index_buffer_size); std::memcpy(index_ptr, index_data, index_buffer_size); index_buffer.Commit(index_buffer_size); @@ -744,7 +744,7 @@ bool RasterizerVulkan::Draw(bool accelerate, bool is_indexed) { const u32 vertex_size = vertices * sizeof(HardwareVertex); // Copy vertex data - auto [array_ptr, offset, _] = vertex_buffer.Map(vertex_size, sizeof(HardwareVertex)); + auto [array_ptr, offset, _] = vertex_buffer.Map(vertex_size); std::memcpy(array_ptr, vertex_batch.data() + base_vertex, vertex_size); vertex_buffer.Commit(vertex_size); @@ -1266,7 +1266,7 @@ void RasterizerVulkan::SyncAndUploadLUTsLF() { } std::size_t bytes_used = 0; - auto [buffer, offset, invalidate] = texture_lf_buffer.Map(max_size, 
sizeof(Common::Vec4f)); + auto [buffer, offset, invalidate] = texture_lf_buffer.Map(max_size); // Sync the lighting luts if (uniform_block_data.lighting_lut_dirty_any || invalidate) { @@ -1332,7 +1332,7 @@ void RasterizerVulkan::SyncAndUploadLUTs() { } std::size_t bytes_used = 0; - auto [buffer, offset, invalidate] = texture_buffer.Map(max_size, sizeof(Common::Vec4f)); + auto [buffer, offset, invalidate] = texture_buffer.Map(max_size); // helper function for SyncProcTexNoiseLUT/ColorMap/AlphaMap auto SyncProcTexValueLUT = @@ -1434,8 +1434,7 @@ void RasterizerVulkan::UploadUniforms(bool accelerate_draw) { u32 used_bytes = 0; const u32 uniform_size = static_cast<u32>(uniform_size_aligned_vs + uniform_size_aligned_fs); - auto [uniforms, offset, invalidate] = - uniform_buffer.Map(uniform_size, static_cast<u32>(uniform_buffer_alignment)); + auto [uniforms, offset, invalidate] = uniform_buffer.Map(uniform_size); if (sync_vs) { Pica::Shader::VSUniformData vs_uniforms; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 7c17ba3fa..cee2e378a 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -80,15 +80,16 @@ StagingBuffer::~StagingBuffer() { vmaDestroyBuffer(instance.GetAllocator(), static_cast<VkBuffer>(buffer), allocation); } -StreamBuffer::StreamBuffer(const Instance& instance, Scheduler& scheduler, u32 size, bool readback) - : instance{instance}, scheduler{scheduler}, staging{instance, size, readback}, total_size{size}, - bucket_size{size / BUCKET_COUNT}, readback{readback} {} +StreamBuffer::StreamBuffer(const Instance& instance, Scheduler& scheduler, u32 size, + bool readback) + : instance{instance}, scheduler{scheduler}, staging{instance, size, readback}, + total_size{size}, bucket_size{size / BUCKET_COUNT}, readback{readback} {} StreamBuffer::StreamBuffer(const Instance& instance, Scheduler& scheduler, u32 size, vk::BufferUsageFlagBits 
usage, std::span<const vk::Format> view_formats, bool readback) - : instance{instance}, scheduler{scheduler}, staging{instance, size, readback}, usage{usage}, - total_size{size}, bucket_size{size / BUCKET_COUNT}, readback{readback} { + : instance{instance}, scheduler{scheduler}, staging{instance, size, readback}, + usage{usage}, total_size{size}, bucket_size{size / BUCKET_COUNT}, readback{readback} { const vk::BufferCreateInfo buffer_info = { .size = total_size, .usage = usage | vk::BufferUsageFlagBits::eTransferDst}; @@ -128,57 +129,51 @@ StreamBuffer::~StreamBuffer() { } } -std::tuple<u8*, u32, bool> StreamBuffer::Map(u32 size, u32 alignment) { - ASSERT(size <= total_size && alignment <= total_size); +std::tuple<u8*, u32, bool> StreamBuffer::Map(u32 size) { + ASSERT(size <= total_size); + size = Common::AlignUp(size, 16); - if (alignment > 0) { - buffer_offset = Common::AlignUp(buffer_offset, alignment); - } - - bool invalidate = false; - const u32 new_offset = buffer_offset + size; - if (u32 new_index = new_offset / bucket_size; new_index != bucket_index) { - if (new_index >= BUCKET_COUNT) { - if (readback) { - Invalidate(); - } else { - Flush(); - } - buffer_offset = 0; - flush_offset = 0; - new_index = 0; - invalidate = true; - } - ticks[bucket_index] = scheduler.CurrentTick(); - scheduler.Wait(ticks[new_index]); - bucket_index = new_index; + Bucket& bucket = buckets[bucket_index]; + + // If we reach bucket boundaries move over to the next one + if (bucket.cursor + size > bucket_size) { + bucket.gpu_tick = scheduler.CurrentTick(); + MoveNextBucket(); + return Map(size); } + const bool invalidate = std::exchange(bucket.invalid, false); + const u32 buffer_offset = bucket_index * bucket_size + bucket.cursor; u8* mapped = reinterpret_cast<u8*>(staging.mapped.data() + buffer_offset); + return std::make_tuple(mapped, buffer_offset, invalidate); } void StreamBuffer::Commit(u32 size) { - buffer_offset += size; + size = Common::AlignUp(size, 16); + buckets[bucket_index].cursor += size; } void StreamBuffer::Flush() { if 
(readback) { + LOG_WARNING(Render_Vulkan, "Cannot flush read only buffer"); return; } - const u32 flush_size = buffer_offset - flush_offset; - ASSERT(flush_size <= total_size); - ASSERT(flush_offset + flush_size <= total_size); + Bucket& bucket = buckets[bucket_index]; + const u32 flush_start = bucket_index * bucket_size + bucket.flush_cursor; + const u32 flush_size = bucket.cursor - bucket.flush_cursor; + ASSERT(flush_size <= bucket_size); + ASSERT(flush_start + flush_size <= total_size); if (flush_size > 0) [[likely]] { + // Ensure all staging writes are visible to the host memory domain VmaAllocator allocator = instance.GetAllocator(); - vmaFlushAllocation(allocator, staging.allocation, flush_offset, flush_size); + vmaFlushAllocation(allocator, staging.allocation, flush_start, flush_size); if (gpu_buffer) { - scheduler.Record([this, flush_offset = flush_offset, - flush_size](vk::CommandBuffer, vk::CommandBuffer upload_cmdbuf) { + scheduler.Record([this, flush_start, flush_size](vk::CommandBuffer, vk::CommandBuffer upload_cmdbuf) { const vk::BufferCopy copy_region = { - .srcOffset = flush_offset, .dstOffset = flush_offset, .size = flush_size}; + .srcOffset = flush_start, .dstOffset = flush_start, .size = flush_size}; upload_cmdbuf.copyBuffer(staging.buffer, gpu_buffer, copy_region); @@ -188,15 +183,15 @@ void StreamBuffer::Flush() { .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = gpu_buffer, - .offset = flush_offset, + .offset = flush_start, .size = flush_size}; - upload_cmdbuf.pipelineBarrier( - vk::PipelineStageFlagBits::eTransfer, MakePipelineStage(usage), - vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, {}); + upload_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, MakePipelineStage(usage), + vk::DependencyFlagBits::eByRegion, {}, buffer_barrier, + {}); }); } - flush_offset = buffer_offset; + bucket.flush_cursor += flush_size; } } @@ -205,15 +200,33 @@ void 
StreamBuffer::Invalidate() { return; } - const u32 flush_size = buffer_offset - flush_offset; - ASSERT(flush_size <= total_size); - ASSERT(flush_offset + flush_size <= total_size); + Bucket& bucket = buckets[bucket_index]; + const u32 flush_start = bucket_index * bucket_size + bucket.flush_cursor; + const u32 flush_size = bucket.cursor - bucket.flush_cursor; + ASSERT(flush_size <= bucket_size); if (flush_size > 0) [[likely]] { + // Ensure the staging memory can be read by the host VmaAllocator allocator = instance.GetAllocator(); - vmaInvalidateAllocation(allocator, staging.allocation, flush_offset, flush_size); - flush_offset = buffer_offset; + vmaInvalidateAllocation(allocator, staging.allocation, flush_start, flush_size); + bucket.flush_cursor += flush_size; } } +void StreamBuffer::MoveNextBucket() { + // Flush and Invalidate are bucket local operations for simplicity so perform them here + if (readback) { + Invalidate(); + } else { + Flush(); + } + + bucket_index = (bucket_index + 1) % BUCKET_COUNT; + Bucket& next_bucket = buckets[bucket_index]; + scheduler.Wait(next_bucket.gpu_tick); + next_bucket.cursor = 0; + next_bucket.flush_cursor = 0; + next_bucket.invalid = true; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 83e0808e0..9941e8add 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -29,11 +29,11 @@ struct StagingBuffer { class StreamBuffer { static constexpr u32 MAX_BUFFER_VIEWS = 3; - static constexpr u32 BUCKET_COUNT = 8; - + static constexpr u32 BUCKET_COUNT = 4; public: /// Staging only constructor - StreamBuffer(const Instance& instance, Scheduler& scheduler, u32 size, bool readback = false); + StreamBuffer(const Instance& instance, Scheduler& scheduler, u32 size, + bool readback = false); /// Staging + GPU streaming constructor StreamBuffer(const Instance& instance, Scheduler& 
scheduler, u32 size, vk::BufferUsageFlagBits usage, std::span<const vk::Format> views, @@ -44,7 +44,7 @@ public: StreamBuffer& operator=(const StreamBuffer&) = delete; /// Maps aligned staging memory of size bytes - std::tuple<u8*, u32, bool> Map(u32 size, u32 alignment = 0); + std::tuple<u8*, u32, bool> Map(u32 size); /// Commits size bytes from the currently mapped staging memory void Commit(u32 size = 0); @@ -71,6 +71,17 @@ public: return views[index]; } +private: + /// Moves to the next bucket + void MoveNextBucket(); + + struct Bucket { + bool invalid = false; + u32 gpu_tick = 0; + u32 cursor = 0; + u32 flush_cursor = 0; + }; + private: const Instance& instance; Scheduler& scheduler; @@ -79,14 +90,12 @@ private: VmaAllocation allocation{}; vk::BufferUsageFlagBits usage; std::array<vk::BufferView, MAX_BUFFER_VIEWS> views{}; + std::array<Bucket, BUCKET_COUNT> buckets; std::size_t view_count = 0; u32 total_size = 0; u32 bucket_size = 0; - u32 buffer_offset = 0; - u32 flush_offset = 0; u32 bucket_index = 0; bool readback = false; - std::array<u64, BUCKET_COUNT> ticks{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp index 82c8bdec2..32b0401fc 100644 --- a/src/video_core/renderer_vulkan/vk_texture_runtime.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_runtime.cpp @@ -111,7 +111,7 @@ StagingData TextureRuntime::FindStaging(u32 size, bool upload) { // Depth uploads require 4 byte alignment, doesn't hurt to do it for everyone auto& buffer = upload ? upload_buffer : download_buffer; - auto [data, offset, invalidate] = buffer.Map(size, 4); + auto [data, offset, invalidate] = buffer.Map(size); return StagingData{.buffer = buffer.GetStagingHandle(), .size = size,