decoders: Optimize swizzle copy performance (#6790)
This makes UnswizzleTexture, the main bottleneck in NVDEC video decoding, up to two times faster.
This commit is contained in:
		| @@ -18,9 +18,9 @@ | ||||
|  | ||||
| namespace Tegra::Texture { | ||||
| namespace { | ||||
| template <bool TO_LINEAR> | ||||
| void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||
|              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||
| template <bool TO_LINEAR, u32 BYTES_PER_PIXEL> | ||||
| void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth, | ||||
|                  u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||
|     // The origin of the transformation can be configured here, leave it as zero as the current API | ||||
|     // doesn't expose it. | ||||
|     static constexpr u32 origin_x = 0; | ||||
| @@ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||
|     static constexpr u32 origin_z = 0; | ||||
|  | ||||
|     // We can configure here a custom pitch | ||||
|     // As it's not exposed 'width * bpp' will be the expected pitch. | ||||
|     const u32 pitch = width * bytes_per_pixel; | ||||
|     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; | ||||
|     // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch. | ||||
|     const u32 pitch = width * BYTES_PER_PIXEL; | ||||
|     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL; | ||||
|  | ||||
|     const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); | ||||
|     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); | ||||
| @@ -54,14 +54,14 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||
|                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT); | ||||
|  | ||||
|             for (u32 column = 0; column < width; ++column) { | ||||
|                 const u32 x = (column + origin_x) * bytes_per_pixel; | ||||
|                 const u32 x = (column + origin_x) * BYTES_PER_PIXEL; | ||||
|                 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; | ||||
|  | ||||
|                 const u32 base_swizzled_offset = offset_z + offset_y + offset_x; | ||||
|                 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; | ||||
|  | ||||
|                 const u32 unswizzled_offset = | ||||
|                     slice * pitch * height + line * pitch + column * bytes_per_pixel; | ||||
|                     slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL; | ||||
|  | ||||
|                 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); | ||||
|                     offset >= input.size()) { | ||||
| @@ -73,11 +73,45 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||
|  | ||||
|                 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; | ||||
|                 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; | ||||
|                 std::memcpy(dst, src, bytes_per_pixel); | ||||
|  | ||||
|                 std::memcpy(dst, src, BYTES_PER_PIXEL); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
// Runtime-to-compile-time dispatcher for the swizzle copy.
//
// Selects the SwizzleImpl<TO_LINEAR, N> instantiation whose BYTES_PER_PIXEL
// template argument equals the runtime `bytes_per_pixel`, and forwards every
// other argument unchanged. With the pixel size as a template constant, the
// per-pixel std::memcpy and the offset multiplications inside SwizzleImpl use a
// compile-time value instead of a runtime one.
//
// Handled pixel sizes: 1, 2, 3, 4, 6, 8, 12 and 16 bytes. Any other value
// reaches UNREACHABLE_MSG and no copy is performed by this function.
//
// Parameters:
//   output, input     - destination and source byte spans, forwarded as-is.
//   bytes_per_pixel   - size of one pixel in bytes; must be one of the handled
//                       sizes listed above.
//   width, height, depth, block_height, block_depth, stride_alignment
//                     - forwarded unchanged to SwizzleImpl.
| template <bool TO_LINEAR> | ||||
| void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||
|              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||
|     switch (bytes_per_pixel) { | ||||
|     case 1: | ||||
|         return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 2: | ||||
|         return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 3: | ||||
|         return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 4: | ||||
|         return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 6: | ||||
|         return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 8: | ||||
|         return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height, | ||||
|                                          block_depth, stride_alignment); | ||||
|     case 12: | ||||
|         return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height, | ||||
|                                           block_depth, stride_alignment); | ||||
|     case 16: | ||||
|         return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height, | ||||
|                                           block_depth, stride_alignment); | ||||
|     default: | ||||
|         UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); | ||||
|     } | ||||
| } | ||||
| } // Anonymous namespace | ||||
|  | ||||
| void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user