Accelerate DMA: Use the texture cache's asynchronous downloads to perform the
copies to host memory. WIP (work in progress).
This commit is contained in:
		| @@ -1287,8 +1287,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, | ||||
|     } | ||||
|     const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); | ||||
|     static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; | ||||
|     const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing | ||||
|                                          : VideoCommon::ObtainBufferOperation::MarkAsWritten; | ||||
|     const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; | ||||
|     const auto [buffer, offset] = | ||||
|         buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); | ||||
|  | ||||
| @@ -1299,7 +1298,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, | ||||
|     if constexpr (IS_IMAGE_UPLOAD) { | ||||
|         image->UploadMemory(buffer->Handle(), offset, copy_span); | ||||
|     } else { | ||||
|         texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span); | ||||
|         texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, | ||||
|                                               buffer_operand.address, buffer_size); | ||||
|     } | ||||
|     return true; | ||||
| } | ||||
|   | ||||
| @@ -781,8 +781,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, | ||||
|     } | ||||
|     const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); | ||||
|     static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; | ||||
|     const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing | ||||
|                                          : VideoCommon::ObtainBufferOperation::MarkAsWritten; | ||||
|     const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; | ||||
|     const auto [buffer, offset] = | ||||
|         buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); | ||||
|  | ||||
| @@ -793,7 +792,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, | ||||
|     if constexpr (IS_IMAGE_UPLOAD) { | ||||
|         image->UploadMemory(buffer->Handle(), offset, copy_span); | ||||
|     } else { | ||||
|         texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span); | ||||
|         texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, | ||||
|                                               buffer_operand.address, buffer_size); | ||||
|     } | ||||
|     return true; | ||||
| } | ||||
|   | ||||
| @@ -1342,17 +1342,19 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag | ||||
|     UploadMemory(map.buffer, map.offset, copies); | ||||
| } | ||||
|  | ||||
| void Image::DownloadMemory(std::span<VkBuffer> buffers_span, VkDeviceSize offset, | ||||
| void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceSize> offsets_span, | ||||
|                            std::span<const VideoCommon::BufferImageCopy> copies) { | ||||
|     const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); | ||||
|     if (is_rescaled) { | ||||
|         ScaleDown(); | ||||
|     } | ||||
|     boost::container::small_vector<VkBuffer, 1> buffers_vector{}; | ||||
|     for (auto& buffer : buffers_span) { | ||||
|         buffers_vector.push_back(buffer); | ||||
|     boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies; | ||||
|     for (size_t index = 0; index < buffers_span.size(); index++) { | ||||
|         buffers_vector.emplace_back(buffers_span[index]); | ||||
|         vk_copies.emplace_back( | ||||
|             TransformBufferImageCopies(copies, offsets_span[index], aspect_mask)); | ||||
|     } | ||||
|     std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); | ||||
|     scheduler->RequestOutsideRenderPassOperationContext(); | ||||
|     scheduler->Record([buffers = std::move(buffers_vector), image = *original_image, | ||||
|                        aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { | ||||
| @@ -1377,9 +1379,9 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, VkDeviceSize offset | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, | ||||
|                                0, read_barrier); | ||||
|  | ||||
|         for (auto buffer : buffers) { | ||||
|             cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, | ||||
|                                      vk_copies); | ||||
|         for (size_t index = 0; index < buffers.size(); index++) { | ||||
|             cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index], | ||||
|                                      vk_copies[index]); | ||||
|         } | ||||
|  | ||||
|         const VkMemoryBarrier memory_write_barrier{ | ||||
| @@ -1418,7 +1420,10 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferIm | ||||
|     std::array buffers{ | ||||
|         map.buffer, | ||||
|     }; | ||||
|     DownloadMemory(buffers, map.offset, copies); | ||||
|     std::array offsets{ | ||||
|         map.offset, | ||||
|     }; | ||||
|     DownloadMemory(buffers, offsets, copies); | ||||
| } | ||||
|  | ||||
| bool Image::IsRescaled() const noexcept { | ||||
|   | ||||
| @@ -138,7 +138,7 @@ public: | ||||
|     void UploadMemory(const StagingBufferRef& map, | ||||
|                       std::span<const VideoCommon::BufferImageCopy> copies); | ||||
|  | ||||
|     void DownloadMemory(std::span<VkBuffer> buffers, VkDeviceSize offset, | ||||
|     void DownloadMemory(std::span<VkBuffer> buffers, std::span<VkDeviceSize> offsets, | ||||
|                         std::span<const VideoCommon::BufferImageCopy> copies); | ||||
|  | ||||
|     void DownloadMemory(const StagingBufferRef& map, | ||||
|   | ||||
| @@ -661,27 +661,40 @@ template <class P> | ||||
| void TextureCache<P>::CommitAsyncFlushes() { | ||||
|     // This is intentionally passing the value by copy | ||||
|     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||||
|         const std::span<const ImageId> download_ids = uncommitted_downloads; | ||||
|         auto& download_ids = uncommitted_downloads; | ||||
|         if (download_ids.empty()) { | ||||
|             committed_downloads.emplace_back(std::move(uncommitted_downloads)); | ||||
|             uncommitted_downloads.clear(); | ||||
|             async_buffers.emplace_back(std::optional<AsyncBuffer>{}); | ||||
|             async_buffers.emplace_back(std::move(uncommitted_async_buffers)); | ||||
|             uncommitted_async_buffers.clear(); | ||||
|             return; | ||||
|         } | ||||
|         size_t total_size_bytes = 0; | ||||
|         for (const ImageId image_id : download_ids) { | ||||
|             total_size_bytes += slot_images[image_id].unswizzled_size_bytes; | ||||
|         size_t last_async_buffer_id = uncommitted_async_buffers.size(); | ||||
|         bool any_none_dma = false; | ||||
|         for (PendingDownload& download_info : download_ids) { | ||||
|             if (download_info.is_swizzle) { | ||||
|                 total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes; | ||||
|                 any_none_dma = true; | ||||
|                 download_info.async_buffer_id = last_async_buffer_id; | ||||
|             } | ||||
|         } | ||||
|         auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); | ||||
|         for (const ImageId image_id : download_ids) { | ||||
|             Image& image = slot_images[image_id]; | ||||
|             const auto copies = FullDownloadCopies(image.info); | ||||
|             image.DownloadMemory(download_map, copies); | ||||
|             download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); | ||||
|         if (any_none_dma) { | ||||
|             auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); | ||||
|             for (const PendingDownload& download_info : download_ids) { | ||||
|                 if (download_info.is_swizzle) { | ||||
|                     Image& image = slot_images[download_info.object_id]; | ||||
|                     const auto copies = FullDownloadCopies(image.info); | ||||
|                     image.DownloadMemory(download_map, copies); | ||||
|                     download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); | ||||
|                 } | ||||
|             } | ||||
|             uncommitted_async_buffers.emplace_back(download_map); | ||||
|         } | ||||
|         async_buffers.emplace_back(download_map); | ||||
|     } | ||||
|     committed_downloads.emplace_back(std::move(uncommitted_downloads)); | ||||
|     async_buffers.emplace_back(std::move(uncommitted_async_buffers)); | ||||
|     uncommitted_async_buffers.clear(); | ||||
|     uncommitted_downloads.clear(); | ||||
| } | ||||
|  | ||||
| @@ -691,39 +704,57 @@ void TextureCache<P>::PopAsyncFlushes() { | ||||
|         return; | ||||
|     } | ||||
|     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||||
|         const std::span<const ImageId> download_ids = committed_downloads.front(); | ||||
|         const auto& download_ids = committed_downloads.front(); | ||||
|         if (download_ids.empty()) { | ||||
|             committed_downloads.pop_front(); | ||||
|             async_buffers.pop_front(); | ||||
|             return; | ||||
|         } | ||||
|         auto download_map = *async_buffers.front(); | ||||
|         std::span<u8> download_span = download_map.mapped_span; | ||||
|         auto download_map = std::move(async_buffers.front()); | ||||
|         for (size_t i = download_ids.size(); i > 0; i--) { | ||||
|             const ImageBase& image = slot_images[download_ids[i - 1]]; | ||||
|             const auto copies = FullDownloadCopies(image.info); | ||||
|             download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); | ||||
|             std::span<u8> download_span_alt = download_span.subspan(download_map.offset); | ||||
|             SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt, | ||||
|                          swizzle_data_buffer); | ||||
|             auto& download_info = download_ids[i - 1]; | ||||
|             auto& download_buffer = download_map[download_info.async_buffer_id]; | ||||
|             if (download_info.is_swizzle) { | ||||
|                 const ImageBase& image = slot_images[download_info.object_id]; | ||||
|                 const auto copies = FullDownloadCopies(image.info); | ||||
|                 download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); | ||||
|                 std::span<u8> download_span = | ||||
|                     download_buffer.mapped_span.subspan(download_buffer.offset); | ||||
|                 SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, | ||||
|                              swizzle_data_buffer); | ||||
|             } else { | ||||
|                 const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id]; | ||||
|                 std::span<u8> download_span = | ||||
|                     download_buffer.mapped_span.subspan(download_buffer.offset); | ||||
|                 gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(), | ||||
|                                              buffer_info.size); | ||||
|                 slot_buffer_downloads.erase(download_info.object_id); | ||||
|             } | ||||
|         } | ||||
|         for (auto& download_buffer : download_map) { | ||||
|             runtime.FreeDeferredStagingBuffer(download_buffer); | ||||
|         } | ||||
|         runtime.FreeDeferredStagingBuffer(download_map); | ||||
|         committed_downloads.pop_front(); | ||||
|         async_buffers.pop_front(); | ||||
|     } else { | ||||
|         const std::span<const ImageId> download_ids = committed_downloads.front(); | ||||
|         const auto& download_ids = committed_downloads.front(); | ||||
|         if (download_ids.empty()) { | ||||
|             committed_downloads.pop_front(); | ||||
|             return; | ||||
|         } | ||||
|         size_t total_size_bytes = 0; | ||||
|         for (const ImageId image_id : download_ids) { | ||||
|             total_size_bytes += slot_images[image_id].unswizzled_size_bytes; | ||||
|         for (const PendingDownload& download_info : download_ids) { | ||||
|             if (download_info.is_swizzle) { | ||||
|                 total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes; | ||||
|             } | ||||
|         } | ||||
|         auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); | ||||
|         const size_t original_offset = download_map.offset; | ||||
|         for (const ImageId image_id : download_ids) { | ||||
|             Image& image = slot_images[image_id]; | ||||
|         for (const PendingDownload& download_info : download_ids) { | ||||
|             if (download_info.is_swizzle) { | ||||
|                 continue; | ||||
|             } | ||||
|             Image& image = slot_images[download_info.object_id]; | ||||
|             const auto copies = FullDownloadCopies(image.info); | ||||
|             image.DownloadMemory(download_map, copies); | ||||
|             download_map.offset += image.unswizzled_size_bytes; | ||||
| @@ -732,8 +763,11 @@ void TextureCache<P>::PopAsyncFlushes() { | ||||
|         runtime.Finish(); | ||||
|         download_map.offset = original_offset; | ||||
|         std::span<u8> download_span = download_map.mapped_span; | ||||
|         for (const ImageId image_id : download_ids) { | ||||
|             const ImageBase& image = slot_images[image_id]; | ||||
|         for (const PendingDownload& download_info : download_ids) { | ||||
|             if (download_info.is_swizzle) { | ||||
|                 continue; | ||||
|             } | ||||
|             const ImageBase& image = slot_images[download_info.object_id]; | ||||
|             const auto copies = FullDownloadCopies(image.info); | ||||
|             SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, | ||||
|                          swizzle_data_buffer); | ||||
| @@ -836,11 +870,27 @@ std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::Dm | ||||
| template <class P> | ||||
| void TextureCache<P>::DownloadImageIntoBuffer( | ||||
|     typename TextureCache<P>::Image* image, typename TextureCache<P>::BufferType buffer, | ||||
|     size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { | ||||
|     std::array buffers{ | ||||
|         buffer, | ||||
|     }; | ||||
|     image->DownloadMemory(buffers, buffer_offset, copies); | ||||
|     size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies, GPUVAddr address, size_t size) { | ||||
|     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { | ||||
|         auto slot = slot_buffer_downloads.insert(address, size); | ||||
|         uncommitted_downloads.emplace_back(false, uncommitted_async_buffers.size(), slot); | ||||
|         auto download_map = runtime.DownloadStagingBuffer(size, true); | ||||
|         uncommitted_async_buffers.emplace_back(download_map); | ||||
|         std::array buffers{ | ||||
|             buffer, | ||||
|             download_map.buffer, | ||||
|         }; | ||||
|         std::array buffer_offsets{ | ||||
|             buffer_offset, | ||||
|             download_map.offset, | ||||
|         }; | ||||
|         image->DownloadMemory(buffers, buffer_offsets, copies); | ||||
|     } else { | ||||
|         std::array buffers{ | ||||
|             buffer, | ||||
|         }; | ||||
|         image->DownloadMemory(buffers, buffer_offset, copies); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template <class P> | ||||
| @@ -2219,7 +2269,7 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) | ||||
|     if (new_id) { | ||||
|         const ImageViewBase& old_view = slot_image_views[new_id]; | ||||
|         if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { | ||||
|             uncommitted_downloads.push_back(old_view.image_id); | ||||
|             uncommitted_downloads.emplace_back(true, 0, old_view.image_id); | ||||
|         } | ||||
|     } | ||||
|     *old_id = new_id; | ||||
|   | ||||
| @@ -217,7 +217,8 @@ public: | ||||
|         const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image); | ||||
|  | ||||
|     void DownloadImageIntoBuffer(Image* image, BufferType buffer, size_t buffer_offset, | ||||
|                                  std::span<const VideoCommon::BufferImageCopy> copies); | ||||
|                                  std::span<const VideoCommon::BufferImageCopy> copies, | ||||
|                                  GPUVAddr address = 0, size_t size = 0); | ||||
|  | ||||
|     /// Return true when a CPU region is modified from the GPU | ||||
|     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||||
| @@ -428,17 +429,31 @@ private: | ||||
|     u64 critical_memory; | ||||
|     size_t critical_gc; | ||||
|  | ||||
|     struct BufferDownload { | ||||
|         GPUVAddr address; | ||||
|         size_t size; | ||||
|     }; | ||||
|  | ||||
|     struct PendingDownload { | ||||
|         bool is_swizzle; | ||||
|         size_t async_buffer_id; | ||||
|         SlotId object_id; | ||||
|     }; | ||||
|  | ||||
|     SlotVector<Image> slot_images; | ||||
|     SlotVector<ImageMapView> slot_map_views; | ||||
|     SlotVector<ImageView> slot_image_views; | ||||
|     SlotVector<ImageAlloc> slot_image_allocs; | ||||
|     SlotVector<Sampler> slot_samplers; | ||||
|     SlotVector<Framebuffer> slot_framebuffers; | ||||
|     SlotVector<BufferDownload> slot_buffer_downloads; | ||||
|  | ||||
|     // TODO: This data structure is not optimal and it should be reworked | ||||
|     std::vector<ImageId> uncommitted_downloads; | ||||
|     std::deque<std::vector<ImageId>> committed_downloads; | ||||
|     std::deque<std::optional<AsyncBuffer>> async_buffers; | ||||
|  | ||||
|     std::vector<PendingDownload> uncommitted_downloads; | ||||
|     std::deque<std::vector<PendingDownload>> committed_downloads; | ||||
|     std::vector<AsyncBuffer> uncommitted_async_buffers; | ||||
|     std::deque<std::vector<AsyncBuffer>> async_buffers; | ||||
|  | ||||
|     struct LRUItemParams { | ||||
|         using ObjectType = ImageId; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user