Merge pull request #3631 from ReinUsesLisp/more-astc
texture/astc: More small ASTC optimizations
This commit is contained in:
		| @@ -20,6 +20,8 @@ | ||||
| #include <cstring> | ||||
| #include <vector> | ||||
|  | ||||
| #include <boost/container/static_vector.hpp> | ||||
|  | ||||
| #include "common/common_types.h" | ||||
|  | ||||
| #include "video_core/textures/astc.h" | ||||
| @@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) { | ||||
|  | ||||
| class InputBitStream { | ||||
| public: | ||||
|     explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) | ||||
|         : m_CurByte(ptr), m_NextBit(start_offset % 8) {} | ||||
|     constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) | ||||
|         : cur_byte{ptr}, next_bit{start_offset % 8} {} | ||||
|  | ||||
|     std::size_t GetBitsRead() const { | ||||
|         return m_BitsRead; | ||||
|     constexpr std::size_t GetBitsRead() const { | ||||
|         return bits_read; | ||||
|     } | ||||
|  | ||||
|     u32 ReadBit() { | ||||
|         u32 bit = *m_CurByte >> m_NextBit++; | ||||
|         while (m_NextBit >= 8) { | ||||
|             m_NextBit -= 8; | ||||
|             m_CurByte++; | ||||
|     constexpr bool ReadBit() { | ||||
|         const bool bit = (*cur_byte >> next_bit++) & 1; | ||||
|         while (next_bit >= 8) { | ||||
|             next_bit -= 8; | ||||
|             cur_byte++; | ||||
|         } | ||||
|  | ||||
|         m_BitsRead++; | ||||
|         return bit & 1; | ||||
|         bits_read++; | ||||
|         return bit; | ||||
|     } | ||||
|  | ||||
|     u32 ReadBits(std::size_t nBits) { | ||||
|     constexpr u32 ReadBits(std::size_t nBits) { | ||||
|         u32 ret = 0; | ||||
|         for (std::size_t i = 0; i < nBits; ++i) { | ||||
|             ret |= (ReadBit() & 1) << i; | ||||
| @@ -66,7 +68,7 @@ public: | ||||
|     } | ||||
|  | ||||
|     template <std::size_t nBits> | ||||
|     u32 ReadBits() { | ||||
|     constexpr u32 ReadBits() { | ||||
|         u32 ret = 0; | ||||
|         for (std::size_t i = 0; i < nBits; ++i) { | ||||
|             ret |= (ReadBit() & 1) << i; | ||||
| @@ -75,64 +77,58 @@ public: | ||||
|     } | ||||
|  | ||||
| private: | ||||
|     const u8* m_CurByte; | ||||
|     std::size_t m_NextBit = 0; | ||||
|     std::size_t m_BitsRead = 0; | ||||
|     const u8* cur_byte; | ||||
|     std::size_t next_bit = 0; | ||||
|     std::size_t bits_read = 0; | ||||
| }; | ||||
|  | ||||
| class OutputBitStream { | ||||
| public: | ||||
|     explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) | ||||
|         : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} | ||||
|     constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0) | ||||
|         : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {} | ||||
|  | ||||
|     ~OutputBitStream() = default; | ||||
|  | ||||
|     s32 GetBitsWritten() const { | ||||
|         return m_BitsWritten; | ||||
|     constexpr std::size_t GetBitsWritten() const { | ||||
|         return bits_written; | ||||
|     } | ||||
|  | ||||
|     void WriteBitsR(u32 val, u32 nBits) { | ||||
|     constexpr void WriteBitsR(u32 val, u32 nBits) { | ||||
|         for (u32 i = 0; i < nBits; i++) { | ||||
|             WriteBit((val >> (nBits - i - 1)) & 1); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     void WriteBits(u32 val, u32 nBits) { | ||||
|     constexpr void WriteBits(u32 val, u32 nBits) { | ||||
|         for (u32 i = 0; i < nBits; i++) { | ||||
|             WriteBit((val >> i) & 1); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| private: | ||||
|     void WriteBit(s32 b) { | ||||
|  | ||||
|         if (done) | ||||
|     constexpr void WriteBit(bool b) { | ||||
|         if (bits_written >= num_bits) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         const u32 mask = 1 << m_NextBit++; | ||||
|         const u32 mask = 1 << next_bit++; | ||||
|  | ||||
|         // clear the bit | ||||
|         *m_CurByte &= static_cast<u8>(~mask); | ||||
|         *cur_byte &= static_cast<u8>(~mask); | ||||
|  | ||||
|         // Write the bit, if necessary | ||||
|         if (b) | ||||
|             *m_CurByte |= static_cast<u8>(mask); | ||||
|             *cur_byte |= static_cast<u8>(mask); | ||||
|  | ||||
|         // Next byte? | ||||
|         if (m_NextBit >= 8) { | ||||
|             m_CurByte += 1; | ||||
|             m_NextBit = 0; | ||||
|         if (next_bit >= 8) { | ||||
|             cur_byte += 1; | ||||
|             next_bit = 0; | ||||
|         } | ||||
|  | ||||
|         done = done || ++m_BitsWritten >= m_NumBits; | ||||
|     } | ||||
|  | ||||
|     s32 m_BitsWritten = 0; | ||||
|     const s32 m_NumBits; | ||||
|     u8* m_CurByte; | ||||
|     s32 m_NextBit = 0; | ||||
|  | ||||
|     bool done = false; | ||||
|     u8* cur_byte; | ||||
|     std::size_t num_bits; | ||||
|     std::size_t bits_written = 0; | ||||
|     std::size_t next_bit = 0; | ||||
| }; | ||||
|  | ||||
| template <typename IntType> | ||||
| @@ -195,9 +191,13 @@ struct IntegerEncodedValue { | ||||
|         u32 trit_value; | ||||
|     }; | ||||
| }; | ||||
| using IntegerEncodedVector = boost::container::static_vector< | ||||
|     IntegerEncodedValue, 64, | ||||
|     boost::container::static_vector_options< | ||||
|         boost::container::inplace_alignment<alignof(IntegerEncodedValue)>, | ||||
|         boost::container::throw_on_overflow<false>>::type>; | ||||
|  | ||||
| static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, | ||||
|                             u32 nBitsPerValue) { | ||||
| static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { | ||||
|     // Implement the algorithm in section C.2.12 | ||||
|     u32 m[5]; | ||||
|     u32 t[5]; | ||||
| @@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, | ||||
| static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result, | ||||
|                              u32 nBitsPerValue) { | ||||
|     // Implement the algorithm in section C.2.12 | ||||
|     u32 m[3]; | ||||
| @@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues(); | ||||
| // Fills result with the values that are encoded in the given | ||||
| // bitstream. We must know beforehand what the maximum possible | ||||
| // value is, and how many values we're decoding. | ||||
| static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits, | ||||
|                                   u32 maxRange, u32 nValues) { | ||||
| static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, | ||||
|                                   u32 nValues) { | ||||
|     // Determine encoding parameters | ||||
|     IntegerEncodedValue val = EncodingsValues[maxRange]; | ||||
|  | ||||
| @@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { | ||||
| // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] | ||||
| // is the same as [(numBits - 1):0] and repeats all the way down. | ||||
| template <typename IntType> | ||||
| static IntType Replicate(IntType val, u32 numBits, u32 toBit) { | ||||
|     if (numBits == 0) | ||||
| static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { | ||||
|     if (numBits == 0) { | ||||
|         return 0; | ||||
|     if (toBit == 0) | ||||
|     } | ||||
|     if (toBit == 0) { | ||||
|         return 0; | ||||
|     IntType v = val & static_cast<IntType>((1 << numBits) - 1); | ||||
|     } | ||||
|     const IntType v = val & static_cast<IntType>((1 << numBits) - 1); | ||||
|     IntType res = v; | ||||
|     u32 reslen = numBits; | ||||
|     while (reslen < toBit) { | ||||
| @@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) { | ||||
|     return res; | ||||
| } | ||||
|  | ||||
| static constexpr std::size_t NumReplicateEntries(u32 num_bits) { | ||||
|     return std::size_t(1) << num_bits; | ||||
| } | ||||
|  | ||||
| template <typename IntType, u32 num_bits, u32 to_bit> | ||||
| static constexpr auto MakeReplicateTable() { | ||||
|     std::array<IntType, NumReplicateEntries(num_bits)> table{}; | ||||
|     for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | ||||
|         table[value] = Replicate(value, num_bits, to_bit); | ||||
|     } | ||||
|     return table; | ||||
| } | ||||
|  | ||||
| static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | ||||
| static constexpr u32 ReplicateByteTo16(std::size_t value) { | ||||
|     return REPLICATE_BYTE_TO_16_TABLE[value]; | ||||
| } | ||||
|  | ||||
| static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>(); | ||||
| static constexpr u32 ReplicateBitTo7(std::size_t value) { | ||||
|     return REPLICATE_BIT_TO_7_TABLE[value]; | ||||
| } | ||||
|  | ||||
| static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>(); | ||||
| static constexpr u32 ReplicateBitTo9(std::size_t value) { | ||||
|     return REPLICATE_BIT_TO_9_TABLE[value]; | ||||
| } | ||||
|  | ||||
| static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>(); | ||||
| static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>(); | ||||
| static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | ||||
| static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | ||||
| static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | ||||
| static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||||
| static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||||
| static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||||
| /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback | ||||
| /// to the runtime implementation | ||||
| static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | ||||
|     switch (num_bits) { | ||||
|     case 1: | ||||
|         return REPLICATE_1_BIT_TO_8_TABLE[value]; | ||||
|     case 2: | ||||
|         return REPLICATE_2_BIT_TO_8_TABLE[value]; | ||||
|     case 3: | ||||
|         return REPLICATE_3_BIT_TO_8_TABLE[value]; | ||||
|     case 4: | ||||
|         return REPLICATE_4_BIT_TO_8_TABLE[value]; | ||||
|     case 5: | ||||
|         return REPLICATE_5_BIT_TO_8_TABLE[value]; | ||||
|     case 6: | ||||
|         return REPLICATE_6_BIT_TO_8_TABLE[value]; | ||||
|     case 7: | ||||
|         return REPLICATE_7_BIT_TO_8_TABLE[value]; | ||||
|     case 8: | ||||
|         return REPLICATE_8_BIT_TO_8_TABLE[value]; | ||||
|     default: | ||||
|         return Replicate(value, num_bits, 8); | ||||
|     } | ||||
| } | ||||
|  | ||||
| static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>(); | ||||
| static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>(); | ||||
| static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>(); | ||||
| static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>(); | ||||
| static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>(); | ||||
| static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { | ||||
|     switch (num_bits) { | ||||
|     case 1: | ||||
|         return REPLICATE_1_BIT_TO_6_TABLE[value]; | ||||
|     case 2: | ||||
|         return REPLICATE_2_BIT_TO_6_TABLE[value]; | ||||
|     case 3: | ||||
|         return REPLICATE_3_BIT_TO_6_TABLE[value]; | ||||
|     case 4: | ||||
|         return REPLICATE_4_BIT_TO_6_TABLE[value]; | ||||
|     case 5: | ||||
|         return REPLICATE_5_BIT_TO_6_TABLE[value]; | ||||
|     default: | ||||
|         return Replicate(value, num_bits, 6); | ||||
|     } | ||||
| } | ||||
|  | ||||
| class Pixel { | ||||
| protected: | ||||
|     using ChannelType = s16; | ||||
| @@ -674,10 +759,10 @@ public: | ||||
|     // significant bits when going from larger to smaller bit depth | ||||
|     // or by repeating the most significant bits when going from | ||||
|     // smaller to larger bit depths. | ||||
|     void ChangeBitDepth(const u8 (&depth)[4]) { | ||||
|     void ChangeBitDepth() { | ||||
|         for (u32 i = 0; i < 4; i++) { | ||||
|             Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); | ||||
|             m_BitDepth[i] = depth[i]; | ||||
|             Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); | ||||
|             m_BitDepth[i] = 8; | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -689,28 +774,23 @@ public: | ||||
|  | ||||
|     // Changes the bit depth of a single component. See the comment | ||||
|     // above for how we do this. | ||||
|     static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { | ||||
|         assert(newDepth <= 8); | ||||
|     static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { | ||||
|         assert(oldDepth <= 8); | ||||
|  | ||||
|         if (oldDepth == newDepth) { | ||||
|         if (oldDepth == 8) { | ||||
|             // Do nothing | ||||
|             return val; | ||||
|         } else if (oldDepth == 0 && newDepth != 0) { | ||||
|             return static_cast<ChannelType>((1 << newDepth) - 1); | ||||
|         } else if (newDepth > oldDepth) { | ||||
|             return Replicate(val, oldDepth, newDepth); | ||||
|         } else if (oldDepth == 0) { | ||||
|             return static_cast<ChannelType>((1 << 8) - 1); | ||||
|         } else if (8 > oldDepth) { | ||||
|             return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth)); | ||||
|         } else { | ||||
|             // oldDepth > newDepth | ||||
|             if (newDepth == 0) { | ||||
|                 return 0xFF; | ||||
|             } else { | ||||
|                 u8 bitsWasted = static_cast<u8>(oldDepth - newDepth); | ||||
|                 u16 v = static_cast<u16>(val); | ||||
|                 v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); | ||||
|                 v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1)); | ||||
|                 return static_cast<u8>(v); | ||||
|             } | ||||
|             const u8 bitsWasted = static_cast<u8>(oldDepth - 8); | ||||
|             u16 v = static_cast<u16>(val); | ||||
|             v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); | ||||
|             v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1)); | ||||
|             return static_cast<u8>(v); | ||||
|         } | ||||
|  | ||||
|         assert(false && "We shouldn't get here."); | ||||
| @@ -760,8 +840,7 @@ public: | ||||
|     // up in the most-significant byte. | ||||
|     u32 Pack() const { | ||||
|         Pixel eightBit(*this); | ||||
|         const u8 eightBitDepth[4] = {8, 8, 8, 8}; | ||||
|         eightBit.ChangeBitDepth(eightBitDepth); | ||||
|         eightBit.ChangeBitDepth(); | ||||
|  | ||||
|         u32 r = 0; | ||||
|         r |= eightBit.A(); | ||||
| @@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP | ||||
|     } | ||||
|  | ||||
|     // We now have enough to decode our integer sequence. | ||||
|     std::vector<IntegerEncodedValue> decodedColorValues; | ||||
|     decodedColorValues.reserve(32); | ||||
|     IntegerEncodedVector decodedColorValues; | ||||
|  | ||||
|     InputBitStream colorStream(data); | ||||
|     DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); | ||||
| @@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP | ||||
|  | ||||
|         u32 A = 0, B = 0, C = 0, D = 0; | ||||
|         // A is just the lsb replicated 9 times. | ||||
|         A = Replicate(bitval & 1, 1, 9); | ||||
|         A = ReplicateBitTo9(bitval & 1); | ||||
|  | ||||
|         switch (val.encoding) { | ||||
|         // Replicate bits | ||||
|         case IntegerEncoding::JustBits: | ||||
|             out[outIdx++] = Replicate(bitval, bitlen, 8); | ||||
|             out[outIdx++] = FastReplicateTo8(bitval, bitlen); | ||||
|             break; | ||||
|  | ||||
|         // Use algorithm in C.2.13 | ||||
| @@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { | ||||
|     u32 bitval = val.bit_value; | ||||
|     u32 bitlen = val.num_bits; | ||||
|  | ||||
|     u32 A = Replicate(bitval & 1, 1, 7); | ||||
|     u32 A = ReplicateBitTo7(bitval & 1); | ||||
|     u32 B = 0, C = 0, D = 0; | ||||
|  | ||||
|     u32 result = 0; | ||||
|     switch (val.encoding) { | ||||
|     case IntegerEncoding::JustBits: | ||||
|         result = Replicate(bitval, bitlen, 6); | ||||
|         result = FastReplicateTo6(bitval, bitlen); | ||||
|         break; | ||||
|  | ||||
|     case IntegerEncoding::Trit: { | ||||
| @@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights, | ||||
| static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, | ||||
|                                    const TexelWeightParams& params, const u32 blockWidth, | ||||
|                                    const u32 blockHeight) { | ||||
|     u32 weightIdx = 0; | ||||
| @@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 | ||||
|         static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); | ||||
|     memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); | ||||
|  | ||||
|     std::vector<IntegerEncodedValue> texelWeightValues; | ||||
|     texelWeightValues.reserve(64); | ||||
|     IntegerEncodedVector texelWeightValues; | ||||
|  | ||||
|     InputBitStream weightStream(texelWeightData); | ||||
|  | ||||
| @@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 | ||||
|             Pixel p; | ||||
|             for (u32 c = 0; c < 4; c++) { | ||||
|                 u32 C0 = endpos32s[partition][0].Component(c); | ||||
|                 C0 = Replicate(C0, 8, 16); | ||||
|                 C0 = ReplicateByteTo16(C0); | ||||
|                 u32 C1 = endpos32s[partition][1].Component(c); | ||||
|                 C1 = Replicate(C1, 8, 16); | ||||
|                 C1 = ReplicateByteTo16(C1); | ||||
|  | ||||
|                 u32 plane = 0; | ||||
|                 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user