renderer_opengl: Accelerate ASTC texture decoding with a compute shader

ASTC texture decoding is currently handled by a CPU decoder for GPU's without native ASTC decoding support (most desktop GPUs). This is the cause for noticeable performance degradation in titles which use the format extensively. This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support.
2021-02-13 15:50:12 -05:00
parent 3b85ac2ac4
commit 2985e5e94c
6 changed files with 1600 additions and 4 deletions
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -307,7 +307,8 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4

 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime,
                                    const VideoCommon::ImageInfo& info) {
-    // Disable accelerated uploads for now as they don't implement swizzled uploads
+    return (!runtime.HasNativeASTC() && IsPixelFormatASTC(info.format));
+    // Disable other accelerated uploads for now as they don't implement swizzled uploads
    return false;
    switch (info.type) {
    case ImageType::e2D:
@@ -567,6 +568,9 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,

 void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
                                                std::span<const SwizzleParameters> swizzles) {
+    if (IsPixelFormatASTC(image.info.format)) {
+        return util_shaders.ASTCDecode(image, map, swizzles);
+    }
    switch (image.info.type) {
    case ImageType::e2D:
        return util_shaders.BlockLinearUpload2D(image, map, swizzles);
@@ -599,6 +603,10 @@ FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal
    }
 }

+bool TextureCacheRuntime::HasNativeASTC() const noexcept {
+    return device.HasASTC();
+}
+
 TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_)
    : storage_flags{storage_flags_}, map_flags{map_flags_} {}

--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -95,6 +95,8 @@ public:
        return has_broken_texture_view_formats;
    }

+    bool HasNativeASTC() const noexcept;
+
 private:
    struct StagingBuffers {
        explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_);
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -3,7 +3,10 @@
 // Refer to the license.txt file included.

 #include <bit>
+#include <fstream>
 #include <span>
+#include <streambuf>
+#include <string>
 #include <string_view>

 #include <glad/glad.h>
@@ -24,11 +27,13 @@
 #include "video_core/texture_cache/accelerated_swizzle.h"
 #include "video_core/texture_cache/types.h"
 #include "video_core/texture_cache/util.h"
+#include "video_core/textures/astc.h"
 #include "video_core/textures/decoders.h"

 namespace OpenGL {

 using namespace HostShaders;
+using namespace Tegra::Texture::ASTC;

 using VideoCommon::Extent3D;
 using VideoCommon::ImageCopy;
@@ -63,13 +68,105 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
      pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
      copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
-    const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
-    swizzle_table_buffer.Create();
-    glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
+    // TODO: Load shader string as a header
+    std::string astc_path = "astc_decoder.comp";
+    std::ifstream t(astc_path);
+    std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
+    astc_decoder_program = MakeProgram(str);
+    MakeBuffers();
 }

 UtilShaders::~UtilShaders() = default;

+void UtilShaders::MakeBuffers() {
+    const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
+    swizzle_table_buffer.Create();
+    glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
+
+    astc_encodings_buffer.Create();
+    glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues,
+                         0);
+    replicate_6_to_8_buffer.Create();
+    glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE),
+                         &REPLICATE_6_BIT_TO_8_TABLE, 0);
+    replicate_7_to_8_buffer.Create();
+    glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE),
+                         &REPLICATE_7_BIT_TO_8_TABLE, 0);
+    replicate_8_to_8_buffer.Create();
+    glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE),
+                         &REPLICATE_8_BIT_TO_8_TABLE, 0);
+    replicate_byte_to_16_buffer.Create();
+    glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE),
+                         &REPLICATE_BYTE_TO_16_TABLE, 0);
+}
+
+void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
+                             std::span<const VideoCommon::SwizzleParameters> swizzles) {
+    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
+    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
+    static constexpr GLuint BINDING_ENC_BUFFER = 2;
+
+    static constexpr GLuint BINDING_6_TO_8_BUFFER = 3;
+    static constexpr GLuint BINDING_7_TO_8_BUFFER = 4;
+    static constexpr GLuint BINDING_8_TO_8_BUFFER = 5;
+    static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6;
+
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
+    static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0;
+    static constexpr GLuint LOC_BLOCK_DIMS = 1;
+    static constexpr GLuint LOC_LAYER = 2;
+
+    const Extent3D tile_size = {
+        VideoCore::Surface::DefaultBlockWidth(image.info.format),
+        VideoCore::Surface::DefaultBlockHeight(image.info.format),
+    };
+    program_manager.BindHostCompute(astc_decoder_program.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER,
+                     replicate_6_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER,
+                     replicate_7_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER,
+                     replicate_8_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER,
+                     replicate_byte_to_16_buffer.handle);
+
+    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
+    glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height);
+
+    for (u32 layer = 0; layer < image.info.resources.layers; layer++) {
+        for (const SwizzleParameters& swizzle : swizzles) {
+            glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE,
+                               layer, GL_WRITE_ONLY, GL_RGBA8);
+            const size_t input_offset = swizzle.buffer_offset + map.offset;
+            const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
+            const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+
+            glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height);
+            glUniform1ui(LOC_LAYER, layer);
+
+            // To unswizzle the ASTC data
+            const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
+            glUniform3uiv(3, 1, params.origin.data());
+            glUniform3iv(4, 1, params.destination.data());
+            glUniform1ui(5, params.bytes_per_block_log2);
+            glUniform1ui(6, params.layer_stride);
+            glUniform1ui(7, params.block_size);
+            glUniform1ui(8, params.x_shift);
+            glUniform1ui(9, params.block_height);
+            glUniform1ui(10, params.block_height_mask);
+
+            // ASTC texture data
+            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer,
+                              input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+
+            glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
+        }
+    }
+    program_manager.RestoreGuestCompute();
+}
+
 void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                                      std::span<const SwizzleParameters> swizzles) {
    static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -40,6 +40,11 @@ public:
    explicit UtilShaders(ProgramManager& program_manager);
    ~UtilShaders();

+    void MakeBuffers();
+
+    void ASTCDecode(Image& image, const ImageBufferMap& map,
+                    std::span<const VideoCommon::SwizzleParameters> swizzles);
+
    void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                             std::span<const VideoCommon::SwizzleParameters> swizzles);

@@ -59,7 +64,13 @@ private:
    ProgramManager& program_manager;

    OGLBuffer swizzle_table_buffer;
+    OGLBuffer astc_encodings_buffer;
+    OGLBuffer replicate_6_to_8_buffer;
+    OGLBuffer replicate_7_to_8_buffer;
+    OGLBuffer replicate_8_to_8_buffer;
+    OGLBuffer replicate_byte_to_16_buffer;

+    OGLProgram astc_decoder_program;
    OGLProgram block_linear_unswizzle_2d_program;
    OGLProgram block_linear_unswizzle_3d_program;
    OGLProgram pitch_unswizzle_program;
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -8,6 +8,196 @@

 namespace Tegra::Texture::ASTC {

+/// Count the number of bits set in a number.
+constexpr u32 Popcnt(u32 n) {
+    u32 c = 0;
+    for (; n; c++) {
+        n &= n - 1;
+    }
+    return c;
+}
+
+enum class IntegerEncoding { JustBits, Qus32, Trit };
+
+struct IntegerEncodedValue {
+    constexpr IntegerEncodedValue() = default;
+
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+
+    // Returns the number of bits required to encode nVals values.
+    u32 GetBitLength(u32 nVals) const {
+        u32 totalBits = num_bits * nVals;
+        if (encoding == IntegerEncoding::Trit) {
+            totalBits += (nVals * 8 + 4) / 5;
+        } else if (encoding == IntegerEncoding::Qus32) {
+            totalBits += (nVals * 7 + 2) / 3;
+        }
+        return totalBits;
+    }
+
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union {
+        u32 qus32_value = 0;
+        u32 trit_value;
+    };
+};
+
+// Returns a new instance of this struct that corresponds to the
+// can take no more than maxval values
+static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
+    while (maxVal > 0) {
+        u32 check = maxVal + 1;
+
+        // Is maxVal a power of two?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
+        }
+
+        // Is maxVal of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
+        }
+
+        // Is maxVal of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
+        }
+
+        // Apparently it can't be represented with a bounded integer sequence...
+        // just iterate.
+        maxVal--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues();
+
+// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
+// is the same as [(numBits - 1):0] and repeats all the way down.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
+        return 0;
+    }
+    if (toBit == 0) {
+        return 0;
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    IntType res = v;
+    u32 reslen = numBits;
+    while (reslen < toBit) {
+        u32 comp = 0;
+        if (numBits > toBit - reslen) {
+            u32 newshift = toBit - reslen;
+            comp = numBits - newshift;
+            numBits = newshift;
+        }
+        res = static_cast<IntType>(res << numBits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += numBits;
+    }
+    return res;
+}
+
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
+/// to the runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
+
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);