diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 062b4f2528..365bde2f16 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -20,6 +20,8 @@ #include #include +#include + #include "common/common_types.h" #include "video_core/textures/astc.h" @@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) { class InputBitStream { public: - explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) - : m_CurByte(ptr), m_NextBit(start_offset % 8) {} + constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) + : cur_byte{ptr}, next_bit{start_offset % 8} {} - std::size_t GetBitsRead() const { - return m_BitsRead; + constexpr std::size_t GetBitsRead() const { + return bits_read; } - u32 ReadBit() { - u32 bit = *m_CurByte >> m_NextBit++; - while (m_NextBit >= 8) { - m_NextBit -= 8; - m_CurByte++; + constexpr bool ReadBit() { + const bool bit = (*cur_byte >> next_bit++) & 1; + while (next_bit >= 8) { + next_bit -= 8; + cur_byte++; } - m_BitsRead++; - return bit & 1; + bits_read++; + return bit; } - u32 ReadBits(std::size_t nBits) { + constexpr u32 ReadBits(std::size_t nBits) { u32 ret = 0; for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; @@ -66,7 +68,7 @@ public: } template - u32 ReadBits() { + constexpr u32 ReadBits() { u32 ret = 0; for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; @@ -75,64 +77,58 @@ public: } private: - const u8* m_CurByte; - std::size_t m_NextBit = 0; - std::size_t m_BitsRead = 0; + const u8* cur_byte; + std::size_t next_bit = 0; + std::size_t bits_read = 0; }; class OutputBitStream { public: - explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) - : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} + constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0) + : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {} - ~OutputBitStream() = default; - - s32 GetBitsWritten() const { - return m_BitsWritten; + constexpr std::size_t GetBitsWritten() const { + return bits_written; } - void WriteBitsR(u32 val, u32 nBits) { + constexpr void WriteBitsR(u32 val, u32 nBits) { for (u32 i = 0; i < nBits; i++) { WriteBit((val >> (nBits - i - 1)) & 1); } } - void WriteBits(u32 val, u32 nBits) { + constexpr void WriteBits(u32 val, u32 nBits) { for (u32 i = 0; i < nBits; i++) { WriteBit((val >> i) & 1); } } private: - void WriteBit(s32 b) { - - if (done) + constexpr void WriteBit(bool b) { + if (bits_written >= num_bits) { return; + } - const u32 mask = 1 << m_NextBit++; + const u32 mask = 1 << next_bit++; // clear the bit - *m_CurByte &= static_cast(~mask); + *cur_byte &= static_cast(~mask); // Write the bit, if necessary if (b) - *m_CurByte |= static_cast(mask); + *cur_byte |= static_cast(mask); // Next byte? - if (m_NextBit >= 8) { - m_CurByte += 1; - m_NextBit = 0; + if (next_bit >= 8) { + cur_byte += 1; + next_bit = 0; } - - done = done || ++m_BitsWritten >= m_NumBits; } - s32 m_BitsWritten = 0; - const s32 m_NumBits; - u8* m_CurByte; - s32 m_NextBit = 0; - - bool done = false; + u8* cur_byte; + std::size_t num_bits; + std::size_t bits_written = 0; + std::size_t next_bit = 0; }; template @@ -195,9 +191,13 @@ struct IntegerEncodedValue { u32 trit_value; }; }; +using IntegerEncodedVector = boost::container::static_vector< + IntegerEncodedValue, 64, + boost::container::static_vector_options< + boost::container::inplace_alignment, + boost::container::throw_on_overflow>::type>; -static void DecodeTritBlock(InputBitStream& bits, std::vector& result, - u32 nBitsPerValue) { +static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { // Implement the algorithm in section C.2.12 u32 m[5]; u32 t[5]; @@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector& result, +static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { // Implement the algorithm in section C.2.12 u32 m[3]; @@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues(); // Fills result with the values that are encoded in the given // bitstream. We must know beforehand what the maximum possible // value is, and how many values we're decoding. -static void DecodeIntegerSequence(std::vector& result, InputBitStream& bits, - u32 maxRange, u32 nValues) { +static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, + u32 nValues) { // Determine encoding parameters IntegerEncodedValue val = EncodingsValues[maxRange]; @@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] // is the same as [(numBits - 1):0] and repeats all the way down. template -static IntType Replicate(IntType val, u32 numBits, u32 toBit) { - if (numBits == 0) +static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { + if (numBits == 0) { return 0; - if (toBit == 0) + } + if (toBit == 0) { return 0; - IntType v = val & static_cast((1 << numBits) - 1); + } + const IntType v = val & static_cast((1 << numBits) - 1); IntType res = v; u32 reslen = numBits; while (reslen < toBit) { @@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) { return res; } +static constexpr std::size_t NumReplicateEntries(u32 num_bits) { + return std::size_t(1) << num_bits; +} + +template +static constexpr auto MakeReplicateTable() { + std::array table{}; + for (IntType value = 0; value < static_cast(std::size(table)); ++value) { + table[value] = Replicate(value, num_bits, to_bit); + } + return table; +} + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateByteTo16(std::size_t value) { + return REPLICATE_BYTE_TO_16_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo7(std::size_t value) { + return REPLICATE_BIT_TO_7_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo9(std::size_t value) { + return REPLICATE_BIT_TO_9_TABLE[value]; +} + +static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); +/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback +/// to the runtime implementation +static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + default: + return Replicate(value, num_bits, 8); + } +} + +static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + return Replicate(value, num_bits, 6); + } +} + class Pixel { protected: using ChannelType = s16; @@ -674,10 +759,10 @@ public: // significant bits when going from larger to smaller bit depth // or by repeating the most significant bits when going from // smaller to larger bit depths. - void ChangeBitDepth(const u8 (&depth)[4]) { + void ChangeBitDepth() { for (u32 i = 0; i < 4; i++) { - Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); - m_BitDepth[i] = depth[i]; + Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); + m_BitDepth[i] = 8; } } @@ -689,28 +774,23 @@ public: // Changes the bit depth of a single component. See the comment // above for how we do this. - static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { - assert(newDepth <= 8); + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { assert(oldDepth <= 8); - if (oldDepth == newDepth) { + if (oldDepth == 8) { // Do nothing return val; - } else if (oldDepth == 0 && newDepth != 0) { - return static_cast((1 << newDepth) - 1); - } else if (newDepth > oldDepth) { - return Replicate(val, oldDepth, newDepth); + } else if (oldDepth == 0) { + return static_cast((1 << 8) - 1); + } else if (8 > oldDepth) { + return static_cast(FastReplicateTo8(static_cast(val), oldDepth)); } else { // oldDepth > newDepth - if (newDepth == 0) { - return 0xFF; - } else { - u8 bitsWasted = static_cast(oldDepth - newDepth); - u16 v = static_cast(val); - v = static_cast((v + (1 << (bitsWasted - 1))) >> bitsWasted); - v = ::std::min(::std::max(0, v), static_cast((1 << newDepth) - 1)); - return static_cast(v); - } + const u8 bitsWasted = static_cast(oldDepth - 8); + u16 v = static_cast(val); + v = static_cast((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min(::std::max(0, v), static_cast((1 << 8) - 1)); + return static_cast(v); } assert(false && "We shouldn't get here."); @@ -760,8 +840,7 @@ public: // up in the most-significant byte. u32 Pack() const { Pixel eightBit(*this); - const u8 eightBitDepth[4] = {8, 8, 8, 8}; - eightBit.ChangeBitDepth(eightBitDepth); + eightBit.ChangeBitDepth(); u32 r = 0; r |= eightBit.A(); @@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP } // We now have enough to decode our integer sequence. - std::vector decodedColorValues; - decodedColorValues.reserve(32); + IntegerEncodedVector decodedColorValues; InputBitStream colorStream(data); DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); @@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP u32 A = 0, B = 0, C = 0, D = 0; // A is just the lsb replicated 9 times. - A = Replicate(bitval & 1, 1, 9); + A = ReplicateBitTo9(bitval & 1); switch (val.encoding) { // Replicate bits case IntegerEncoding::JustBits: - out[outIdx++] = Replicate(bitval, bitlen, 8); + out[outIdx++] = FastReplicateTo8(bitval, bitlen); break; // Use algorithm in C.2.13 @@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { u32 bitval = val.bit_value; u32 bitlen = val.num_bits; - u32 A = Replicate(bitval & 1, 1, 7); + u32 A = ReplicateBitTo7(bitval & 1); u32 B = 0, C = 0, D = 0; u32 result = 0; switch (val.encoding) { case IntegerEncoding::JustBits: - result = Replicate(bitval, bitlen, 6); + result = FastReplicateTo6(bitval, bitlen); break; case IntegerEncoding::Trit: { @@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { return result; } -static void UnquantizeTexelWeights(u32 out[2][144], const std::vector& weights, +static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, const TexelWeightParams& params, const u32 blockWidth, const u32 blockHeight) { u32 weightIdx = 0; @@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 static_cast((1 << (weightParams.GetPackedBitSize() % 8)) - 1); memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); - std::vector texelWeightValues; - texelWeightValues.reserve(64); + IntegerEncodedVector texelWeightValues; InputBitStream weightStream(texelWeightData); @@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 Pixel p; for (u32 c = 0; c < 4; c++) { u32 C0 = endpos32s[partition][0].Component(c); - C0 = Replicate(C0, 8, 16); + C0 = ReplicateByteTo16(C0); u32 C1 = endpos32s[partition][1].Component(c); - C1 = Replicate(C1, 8, 16); + C1 = ReplicateByteTo16(C1); u32 plane = 0; if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {