From 2985e5e94c82febcf215feb0023f4184b38bb24a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 13 Feb 2021 15:50:12 -0500 Subject: [PATCH 1/6] renderer_opengl: Accelerate ASTC texture decoding with a compute shader ASTC texture decoding is currently handled by a CPU decoder for GPU's without native ASTC decoding support (most desktop GPUs). This is the cause for noticeable performance degradation in titles which use the format extensively. This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support. --- src/video_core/host_shaders/astc_decoder.comp | 1288 +++++++++++++++++ .../renderer_opengl/gl_texture_cache.cpp | 10 +- .../renderer_opengl/gl_texture_cache.h | 2 + .../renderer_opengl/util_shaders.cpp | 103 +- src/video_core/renderer_opengl/util_shaders.h | 11 + src/video_core/textures/astc.h | 190 +++ 6 files changed, 1600 insertions(+), 4 deletions(-) create mode 100644 src/video_core/host_shaders/astc_decoder.comp diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp new file mode 100644 index 0000000000..070190a5ce --- /dev/null +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -0,0 +1,1288 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +#ifdef VULKAN + +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_ENC_BUFFER 2 +#define BINDING_6_TO_8_BUFFER 3 +#define BINDING_7_TO_8_BUFFER 4 +#define BINDING_8_TO_8_BUFFER 5 +#define BINDING_BYTE_TO_16_BUFFER 6 +#define BINDING_OUTPUT_IMAGE 3 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_ENC_BUFFER 2 +#define BINDING_6_TO_8_BUFFER 3 +#define BINDING_7_TO_8_BUFFER 4 +#define BINDING_8_TO_8_BUFFER 5 +#define BINDING_BYTE_TO_16_BUFFER 6 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec2 num_image_blocks; +UNIFORM(1) uvec2 block_dims; +UNIFORM(2) uint layer; + +UNIFORM(3) uvec3 origin; +UNIFORM(4) ivec3 destination; +UNIFORM(5) uint bytes_per_block_log2; +UNIFORM(6) uint layer_stride; +UNIFORM(7) uint block_size; +UNIFORM(8) uint x_shift; +UNIFORM(9) uint block_height; +UNIFORM(10) uint block_height_mask; + +END_PUSH_CONSTANTS + +uint current_index = 0; +int bitsread = 0; +uint total_bitsread = 0; +uint local_buff[16]; + +const int JustBits = 0; +const int Quint = 1; +const int Trit = 2; + +struct EncodingData { + uint encoding; + uint num_bits; + uint bit_value; + uint quint_trit_value; +}; + +struct TexelWeightParams { + uvec2 size; + bool dual_plane; + uint max_weight; + bool Error; + bool VoidExtentLDR; + bool VoidExtentHDR; +}; + +// Swizzle data +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { + uint astc_data[]; +}; +layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); + +uint SwizzleOffset(uvec2 pos) { + pos = pos & SWIZZLE_MASK; + return swizzle_table[pos.y * 64 + pos.x]; +} + +uint ReadTexel(uint offset) { + // extract the 8-bit value from the 32-bit packed data. + return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); +} + +// ASTC Encodings data +layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { + EncodingData encoding_values[256]; +}; +// ASTC Precompiled tables +layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { + uint REPLICATE_6_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { + uint REPLICATE_7_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { + uint REPLICATE_8_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { + uint REPLICATE_BYTE_TO_16_TABLE[]; +}; + +const int BLOCK_SIZE_IN_BYTES = 16; + +const int BLOCK_INFO_ERROR = 0; +const int BLOCK_INFO_VOID_EXTENT_HDR = 1; +const int BLOCK_INFO_VOID_EXTENT_LDR = 2; +const int BLOCK_INFO_NORMAL = 3; + +// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] +// is the same as [(numBits - 1):0] and repeats all the way down. +uint Replicate(uint val, uint num_bits, uint to_bit) { + if (num_bits == 0) { + return 0; + } + if (to_bit == 0) { + return 0; + } + const uint v = val & uint((1 << num_bits) - 1); + uint res = v; + uint reslen = num_bits; + while (reslen < to_bit) { + uint comp = 0; + if (num_bits > to_bit - reslen) { + uint newshift = to_bit - reslen; + comp = num_bits - newshift; + num_bits = newshift; + } + res = uint(res << num_bits); + res = uint(res | (v >> comp)); + reslen += num_bits; + } + return res; +} + +uvec4 ReplicateByteTo16(uvec4 value) { + return uvec4(REPLICATE_BYTE_TO_16_TABLE[value.x], REPLICATE_BYTE_TO_16_TABLE[value.y], + REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); +} + +const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); +uint ReplicateBitTo7(uint value) { + return REPLICATE_BIT_TO_7_TABLE[value]; + ; +} + +const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); +uint ReplicateBitTo9(uint value) { + return REPLICATE_1_BIT_TO_9_TABLE[value]; +} + +const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); +const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); +const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); +const uint REPLICATE_4_BIT_TO_8_TABLE[16] = + uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); +const uint REPLICATE_5_BIT_TO_8_TABLE[32] = + uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, + 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); + +uint FastReplicateTo8(uint value, uint num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + } + return Replicate(value, num_bits, 8); +} + +const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); +const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); +const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); +const uint REPLICATE_4_BIT_TO_6_TABLE[16] = + uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); +const uint REPLICATE_5_BIT_TO_6_TABLE[32] = + uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, + 47, 49, 51, 53, 55, 57, 59, 61, 63); + +uint FastReplicateTo6(uint value, uint num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + } + return Replicate(value, num_bits, 6); +} + +uint hash52(uint p) { + p ^= p >> 15; + p -= p << 17; + p += p << 7; + p += p << 4; + p ^= p >> 5; + p += p << 16; + p ^= p >> 7; + p ^= p >> 3; + p ^= p << 6; + p ^= p >> 17; + return p; +} + +uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { + if (1 == partition_count) + return 0; + + if (small_block) { + x <<= 1; + y <<= 1; + z <<= 1; + } + + seed += (partition_count - 1) * 1024; + + uint rnum = hash52(uint(seed)); + uint seed1 = uint(rnum & 0xF); + uint seed2 = uint((rnum >> 4) & 0xF); + uint seed3 = uint((rnum >> 8) & 0xF); + uint seed4 = uint((rnum >> 12) & 0xF); + uint seed5 = uint((rnum >> 16) & 0xF); + uint seed6 = uint((rnum >> 20) & 0xF); + uint seed7 = uint((rnum >> 24) & 0xF); + uint seed8 = uint((rnum >> 28) & 0xF); + uint seed9 = uint((rnum >> 18) & 0xF); + uint seed10 = uint((rnum >> 22) & 0xF); + uint seed11 = uint((rnum >> 26) & 0xF); + uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF); + + seed1 = (seed1 * seed1); + seed2 = (seed2 * seed2); + seed3 = (seed3 * seed3); + seed4 = (seed4 * seed4); + seed5 = (seed5 * seed5); + seed6 = (seed6 * seed6); + seed7 = (seed7 * seed7); + seed8 = (seed8 * seed8); + seed9 = (seed9 * seed9); + seed10 = (seed10 * seed10); + seed11 = (seed11 * seed11); + seed12 = (seed12 * seed12); + + int sh1, sh2, sh3; + if ((seed & 1) > 0) { + sh1 = (seed & 2) > 0 ? 4 : 5; + sh2 = (partition_count == 3) ? 6 : 5; + } else { + sh1 = (partition_count == 3) ? 6 : 5; + sh2 = (seed & 2) > 0 ? 4 : 5; + } + sh3 = (seed & 0x10) > 0 ? sh1 : sh2; + + seed1 = (seed1 >> sh1); + seed2 = (seed2 >> sh2); + seed3 = (seed3 >> sh1); + seed4 = (seed4 >> sh2); + seed5 = (seed5 >> sh1); + seed6 = (seed6 >> sh2); + seed7 = (seed7 >> sh1); + seed8 = (seed8 >> sh2); + seed9 = (seed9 >> sh3); + seed10 = (seed10 >> sh3); + seed11 = (seed11 >> sh3); + seed12 = (seed12 >> sh3); + + uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + a &= 0x3F; + b &= 0x3F; + c &= 0x3F; + d &= 0x3F; + + if (partition_count < 4) + d = 0; + if (partition_count < 3) + c = 0; + + if (a >= b && a >= c && a >= d) + return 0; + else if (b >= c && b >= d) + return 1; + else if (c >= d) + return 2; + return 3; +} + +uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { + return SelectPartition(seed, x, y, 0, partition_count, small_block); +} + +uint ReadBit() { + uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); + bitsread++; + total_bitsread++; + if (bitsread == 8) { + current_index++; + bitsread = 0; + } + return bit; +} + +uint StreamBits(uint num_bits) { + uint ret = 0; + for (uint i = 0; i < num_bits; i++) { + ret |= ((ReadBit() & 1) << i); + } + return ret; +} + +// Define color data. +uint color_endpoint_data[16]; +int color_bitsread = 0; +uint total_color_bitsread = 0; +int color_index = 0; + +// Define color data. +uint texel_weight_data[16]; +int texel_bitsread = 0; +uint total_texel_bitsread = 0; +int texel_index = 0; + +bool texel_flag = false; + +uint ReadColorBit() { + uint bit = 0; + if (texel_flag) { + bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); + texel_bitsread++; + total_texel_bitsread++; + if (texel_bitsread == 8) { + texel_index++; + texel_bitsread = 0; + } + } else { + bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); + color_bitsread++; + total_color_bitsread++; + if (color_bitsread == 8) { + color_index++; + color_bitsread = 0; + } + } + return bit; +} + +uint StreamColorBits(uint num_bits) { + uint ret = 0; + for (uint i = 0; i < num_bits; i++) { + ret |= ((ReadColorBit() & 1) << i); + } + return ret; +} + +EncodingData result_vector[100]; +int result_index = 0; + +EncodingData texel_vector[100]; +int texel_vector_index = 0; + +void ResultEmplaceBack(EncodingData val) { + if (texel_flag) { + texel_vector[texel_vector_index] = val; + texel_vector_index++; + } else { + result_vector[result_index] = val; + result_index++; + } +} + +// Returns the number of bits required to encode n_vals values. +uint GetBitLength(uint n_vals, uint encoding_index) { + uint totalBits = encoding_values[encoding_index].num_bits * n_vals; + if (encoding_values[encoding_index].encoding == Trit) { + totalBits += (n_vals * 8 + 4) / 5; + } else if (encoding_values[encoding_index].encoding == Quint) { + totalBits += (n_vals * 7 + 2) / 3; + } + return totalBits; +} + +uint GetNumWeightValues(uvec2 size, bool dual_plane) { + uint n_vals = size.x * size.y; + if (dual_plane) { + n_vals *= 2; + } + return n_vals; +} + +uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { + uint n_vals = GetNumWeightValues(size, dual_plane); + return GetBitLength(n_vals, max_weight); +} + +uint BitsBracket(uint bits, uint pos) { + return ((bits >> pos) & 1); +} + +uint BitsOp(uint bits, uint start, uint end) { + if (start == end) { + return BitsBracket(bits, start); + } else if (start > end) { + uint t = start; + start = end; + end = t; + } + + uint mask = (1 << (end - start + 1)) - 1; + return ((bits >> start) & mask); +} + +void DecodeQuintBlock(uint num_bits) { // Value number of bits + uint m[3]; + uint q[3]; + uint Q; + m[0] = StreamColorBits(num_bits); + Q = StreamColorBits(3); + m[1] = StreamColorBits(num_bits); + Q |= StreamColorBits(2) << 3; + m[2] = StreamColorBits(num_bits); + Q |= StreamColorBits(2) << 5; + if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { + q[0] = 4; + q[1] = 4; + q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | + (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); + } else { + uint C = 0; + if (BitsOp(Q, 1, 2) == 3) { + q[2] = 4; + C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); + } else { + q[2] = BitsOp(Q, 5, 6); + C = BitsOp(Q, 0, 4); + } + + if (BitsOp(C, 0, 2) == 5) { + q[1] = 4; + q[0] = BitsOp(C, 3, 4); + } else { + q[1] = BitsOp(C, 3, 4); + q[0] = BitsOp(C, 0, 2); + } + } + + for (uint i = 0; i < 3; i++) { + EncodingData val; + val.encoding = Quint; + val.num_bits = num_bits; + val.bit_value = m[i]; + val.quint_trit_value = q[i]; + ResultEmplaceBack(val); + } +} + +void DecodeTritBlock(uint num_bits) { + uint m[5]; + uint t[5]; + uint T; + m[0] = StreamColorBits(num_bits); + T = StreamColorBits(2); + m[1] = StreamColorBits(num_bits); + T |= StreamColorBits(2) << 2; + m[2] = StreamColorBits(num_bits); + T |= StreamColorBits(1) << 4; + m[3] = StreamColorBits(num_bits); + T |= StreamColorBits(2) << 5; + m[4] = StreamColorBits(num_bits); + T |= StreamColorBits(1) << 7; + uint C = 0; + if (BitsOp(T, 2, 4) == 7) { + C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1); + t[4] = 2; + t[3] = 2; + } else { + C = BitsOp(T, 0, 4); + if (BitsOp(T, 5, 6) == 3) { + t[4] = 2; + t[3] = BitsBracket(T, 7); + } else { + t[4] = BitsBracket(T, 7); + t[3] = BitsOp(T, 5, 6); + } + } + if (BitsOp(C, 0, 1) == 3) { + t[2] = 2; + t[1] = BitsBracket(C, 4); + t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3)); + } else if (BitsOp(C, 2, 3) == 3) { + t[2] = 2; + t[1] = 2; + t[0] = BitsOp(C, 0, 1); + } else { + t[2] = BitsBracket(C, 4); + t[1] = BitsOp(C, 2, 3); + t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1)); + } + for (uint i = 0; i < 5; i++) { + EncodingData val; + val.encoding = Trit; + val.num_bits = num_bits; + val.bit_value = m[i]; + val.quint_trit_value = t[i]; + ResultEmplaceBack(val); + } +} +void DecodeIntegerSequence(uint max_range, uint num_values) { + EncodingData val = encoding_values[max_range]; + uint vals_decoded = 0; + while (vals_decoded < num_values) { + switch (val.encoding) { + case Quint: + DecodeQuintBlock(val.num_bits); + vals_decoded += 3; + break; + + case Trit: + DecodeTritBlock(val.num_bits); + vals_decoded += 5; + break; + + case JustBits: + val.bit_value = StreamColorBits(val.num_bits); + ResultEmplaceBack(val); + vals_decoded++; + break; + } + } +} + +void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions, + uint color_data_bits) { + uint num_values = 0; + for (uint i = 0; i < num_partitions; i++) { + num_values += ((modes[i] >> 2) + 1) << 1; + } + int range = 256; + while (--range > 0) { + EncodingData val = encoding_values[range]; + uint bitLength = GetBitLength(num_values, range); + if (bitLength <= color_data_bits) { + while (--range > 0) { + EncodingData newval = encoding_values[range]; + if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { + break; + } + } + range++; + break; + } + } + DecodeIntegerSequence(range, num_values); + uint out_index = 0; + for (int itr = 0; itr < result_index; itr++) { + if (out_index >= num_values) { + break; + } + EncodingData val = result_vector[itr]; + uint bitlen = val.num_bits; + uint bitval = val.bit_value; + uint A = 0, B = 0, C = 0, D = 0; + A = ReplicateBitTo9((bitval & 1)); + switch (val.encoding) { + case JustBits: + color_values[out_index++] = FastReplicateTo8(bitval, bitlen); + break; + case Trit: { + D = val.quint_trit_value; + switch (bitlen) { + case 1: { + C = 204; + } break; + case 2: { + C = 93; + uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 4) | (b << 2) | (b << 1); + } break; + + case 3: { + C = 44; + uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 2) | cb; + } break; + + case 4: { + C = 22; + uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | dcb; + } break; + + case 5: { + C = 11; + uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 2); + } break; + + case 6: { + C = 5; + uint fedcb = (bitval >> 1) & 0x1F; + B = (fedcb << 4) | (fedcb >> 4); + } break; + } + } break; + case Quint: { + D = val.quint_trit_value; + switch (bitlen) { + case 1: { + C = 113; + } break; + case 2: { + C = 54; + uint b = (bitval >> 1) & 1; + B = (b << 8) | (b << 3) | (b << 2); + } break; + case 3: { + C = 26; + uint cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 1) | (cb >> 1); + } break; + case 4: { + C = 13; + uint dcb = (bitval >> 1) & 7; + B = (dcb << 6) | (dcb >> 1); + } break; + case 5: { + C = 6; + uint edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 3); + } break; + } + } break; + } + + if (val.encoding != JustBits) { + uint T = (D * C) + B; + T ^= A; + T = (A & 0x80) | (T >> 2); + color_values[out_index++] = T; + } + } +} +ivec2 BitTransferSigned(int a, int b) { + ivec2 transferred; + transferred[1] = b >> 1; + transferred[1] |= a & 0x80; + transferred[0] = a >> 1; + transferred[0] &= 0x3F; + if ((transferred[0] & 0x20) > 0) { + transferred[0] -= 0x40; + } + return transferred; +} + +uvec4 ClampByte(ivec4 color) { + for (uint i = 0; i < 4; i++) { + color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); + } + return uvec4(color); +} +ivec4 BlueContract(int a, int r, int g, int b) { + return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); +} +int colvals_index = 0; +void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], + uint color_endpoint_mode) { +#define READ_UINT_VALUES(N) \ + uint v[N]; \ + for (uint i = 0; i < N; i++) { \ + v[i] = color_values[colvals_index++]; \ + } + +#define READ_INT_VALUES(N) \ + int v[N]; \ + for (uint i = 0; i < N; i++) { \ + v[i] = int(color_values[colvals_index++]); \ + } + + switch (color_endpoint_mode) { + case 0: { + READ_UINT_VALUES(2) + ep1 = uvec4(0xFF, v[0], v[0], v[0]); + ep2 = uvec4(0xFF, v[1], v[1], v[1]); + } break; + + case 1: { + READ_UINT_VALUES(2) + uint L0 = (v[0] >> 2) | (v[1] & 0xC0); + uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); + ep1 = uvec4(0xFF, L0, L0, L0); + ep2 = uvec4(0xFF, L1, L1, L1); + } break; + + case 4: { + READ_UINT_VALUES(4) + ep1 = uvec4(v[2], v[0], v[0], v[0]); + ep2 = uvec4(v[3], v[1], v[1], v[1]); + } break; + + case 5: { + READ_INT_VALUES(4) + ivec2 transferred = BitTransferSigned(v[1], v[0]); + v[1] = transferred[0]; + v[0] = transferred[1]; + transferred = BitTransferSigned(v[3], v[2]); + v[3] = transferred[0]; + v[2] = transferred[1]; + ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); + ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1])); + } break; + + case 6: { + READ_UINT_VALUES(4) + ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = uvec4(0xFF, v[0], v[1], v[2]); + } break; + + case 8: { + READ_UINT_VALUES(6) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = uvec4(0xFF, v[0], v[2], v[4]); + ep2 = uvec4(0xFF, v[1], v[3], v[5]); + } else { + ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); + ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); + } + } break; + + case 9: { + READ_INT_VALUES(6) + ivec2 transferred = BitTransferSigned(v[1], v[0]); + v[1] = transferred[0]; + v[0] = transferred[1]; + transferred = BitTransferSigned(v[3], v[2]); + v[3] = transferred[0]; + v[2] = transferred[1]; + transferred = BitTransferSigned(v[5], v[4]); + v[5] = transferred[0]; + v[4] = transferred[1]; + if (v[1] + v[3] + v[5] >= 0) { + ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); + ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); + } else { + ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); + ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); + } + } break; + + case 10: { + READ_UINT_VALUES(6) + ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = uvec4(v[5], v[0], v[1], v[2]); + } break; + + case 12: { + READ_UINT_VALUES(8) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = uvec4(v[6], v[0], v[2], v[4]); + ep2 = uvec4(v[7], v[1], v[3], v[5]); + } else { + ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); + ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); + } + } break; + + case 13: { + READ_INT_VALUES(8) + ivec2 transferred = BitTransferSigned(v[1], v[0]); + v[1] = transferred[0]; + v[0] = transferred[1]; + transferred = BitTransferSigned(v[3], v[2]); + v[3] = transferred[0]; + v[2] = transferred[1]; + + transferred = BitTransferSigned(v[5], v[4]); + v[5] = transferred[0]; + v[4] = transferred[1]; + + transferred = BitTransferSigned(v[7], v[6]); + v[7] = transferred[0]; + v[6] = transferred[1]; + + if (v[1] + v[3] + v[5] >= 0) { + ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); + ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); + } else { + ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); + ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); + } + } break; + } +#undef READ_UINT_VALUES +#undef READ_INT_VALUES +} + +uint UnquantizeTexelWeight(EncodingData val) { + uint bitval = val.bit_value; + uint bitlen = val.num_bits; + uint A = ReplicateBitTo7((bitval & 1)); + uint B = 0, C = 0, D = 0; + uint result = 0; + switch (val.encoding) { + case JustBits: + result = FastReplicateTo6(bitval, bitlen); + break; + case Trit: { + D = val.quint_trit_value; + switch (bitlen) { + case 0: { + uint results[3] = {0, 32, 63}; + result = results[D]; + } break; + case 1: { + C = 50; + } break; + case 2: { + C = 23; + uint b = (bitval >> 1) & 1; + B = (b << 6) | (b << 2) | b; + } break; + case 3: { + C = 11; + uint cb = (bitval >> 1) & 3; + B = (cb << 5) | cb; + } break; + default: + break; + } + } break; + case Quint: { + D = val.quint_trit_value; + switch (bitlen) { + case 0: { + uint results[5] = {0, 16, 32, 47, 63}; + result = results[D]; + } break; + case 1: { + C = 28; + } break; + case 2: { + C = 13; + uint b = (bitval >> 1) & 1; + B = (b << 6) | (b << 1); + } break; + } + } break; + } + if (val.encoding != JustBits && bitlen > 0) { + result = D * C + B; + result ^= A; + result = (A & 0x20) | (result >> 2); + } + if (result > 32) { + result += 1; + } + return result; +} + +void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) { + uint weight_idx = 0; + uint unquantized[2][144]; + uint area = size.x * size.y; + for (uint itr = 0; itr < texel_vector_index; itr++) { + unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); + if (dual_plane) { + ++itr; + unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); + if (itr == texel_vector_index) { + break; + } + } + if (++weight_idx >= (area)) + break; + } + uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); + uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); + uint kPlaneScale = dual_plane ? 2 : 1; + for (uint plane = 0; plane < kPlaneScale; plane++) + for (uint t = 0; t < block_dims.y; t++) + for (uint s = 0; s < block_dims.x; s++) { + uint cs = Ds * s; + uint ct = Dt * t; + uint gs = (cs * (size.x - 1) + 32) >> 6; + uint gt = (ct * (size.y - 1) + 32) >> 6; + uint js = gs >> 4; + uint fs = gs & 0xF; + uint jt = gt >> 4; + uint ft = gt & 0x0F; + uint w11 = (fs * ft + 8) >> 4; + uint w10 = ft - w11; + uint w01 = fs - w11; + uint w00 = 16 - fs - ft + w11; + uvec4 w = uvec4(w00, w01, w10, w11); + uint v0 = jt * size.x + js; + + uvec4 p = uvec4(0); + if (v0 < area) { + p.x = unquantized[plane][v0]; + } + if ((v0 + 1) < (area)) { + p.y = unquantized[plane][v0 + 1]; + } + if ((v0 + size.x) < (area)) { + p.z = unquantized[plane][(v0 + size.x)]; + } + if ((v0 + size.x + 1) < (area)) { + p.w = unquantized[plane][(v0 + size.x + 1)]; + } + outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; + } +} + +int FindLayout(uint mode) { + if ((mode & 3) != 0) { + if ((mode & 8) != 0) { + if ((mode & 4) != 0) { + if ((mode & 0x100) != 0) { + return 4; + } + return 3; + } + return 2; + } + if ((mode & 4) != 0) { + return 1; + } + return 0; + } + if ((mode & 0x100) != 0) { + if ((mode & 0x80) != 0) { + if ((mode & 0x20) != 0) { + return 8; + } + return 7; + } + return 9; + } + if ((mode & 0x80) != 0) { + return 6; + } + return 5; +} + +TexelWeightParams DecodeBlockInfo(uint block_index) { + TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false); + uint mode = StreamBits(11); + if ((mode & 0x1ff) == 0x1fc) { + if ((mode & 0x200) != 0) { + params.VoidExtentHDR = true; + } else { + params.VoidExtentLDR = true; + } + if ((mode & 0x400) == 0 || StreamBits(1) == 0) { + params.Error = true; + } + return params; + } + if ((mode & 0xf) == 0) { + params.Error = true; + return params; + } + if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { + params.Error = true; + return params; + } + uint A, B; + uint mode_layout = FindLayout(mode); + switch (mode_layout) { + case 0: + A = (mode >> 5) & 0x3; + B = (mode >> 7) & 0x3; + params.size = uvec2(B + 4, A + 2); + break; + case 1: + A = (mode >> 5) & 0x3; + B = (mode >> 7) & 0x3; + params.size = uvec2(B + 8, A + 2); + break; + case 2: + A = (mode >> 5) & 0x3; + B = (mode >> 7) & 0x3; + params.size = uvec2(A + 2, B + 8); + break; + case 3: + A = (mode >> 5) & 0x3; + B = (mode >> 7) & 0x1; + params.size = uvec2(A + 2, B + 6); + break; + case 4: + A = (mode >> 5) & 0x3; + B = (mode >> 7) & 0x1; + params.size = uvec2(B + 2, A + 2); + break; + case 5: + A = (mode >> 5) & 0x3; + params.size = uvec2(12, A + 2); + break; + case 6: + A = (mode >> 5) & 0x3; + params.size = uvec2(A + 2, 12); + break; + case 7: + params.size = uvec2(6, 10); + break; + case 8: + params.size = uvec2(10, 6); + break; + case 9: + A = (mode >> 5) & 0x3; + B = (mode >> 9) & 0x3; + params.size = uvec2(A + 6, B + 6); + break; + default: + params.Error = true; + break; + } + params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); + uint weight_index = (mode & 0x10) != 0 ? 1 : 0; + if (mode_layout < 5) { + weight_index |= (mode & 0x3) << 1; + } else { + weight_index |= (mode & 0xc) >> 1; + } + weight_index -= 2; + if ((mode_layout != 9) && ((mode & 0x200) != 0)) { + const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31); + params.max_weight = max_weights[weight_index]; + } else { + const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7); + params.max_weight = max_weights[weight_index]; + } + return params; +} + +void FillError(ivec3 coord) { + for (uint j = 0; j < block_dims.y; j++) { + for (uint i = 0; i < block_dims.x; i++) { + imageStore(dest_image, coord.xy + ivec2(i, j), vec4(1.0, 1.0, 0.0, 1.0)); + } + } + return; +} + +void FillVoidExtentLDR(ivec3 coord, uint block_index) { + for (int i = 0; i < 4; i++) { + StreamBits(13); + } + + uint r_u = StreamBits(16); + uint g_u = StreamBits(16); + uint b_u = StreamBits(16); + uint a_u = StreamBits(16); + float a = float(a_u) / 65535.0f; + float r = float(r_u) / 65535.0f; + float g = float(g_u) / 65535.0f; + float b = float(b_u) / 65535.0f; + for (uint j = 0; j < block_dims.y; j++) { + for (uint i = 0; i < block_dims.x; i++) { + imageStore(dest_image, coord.xy + ivec2(i, j), vec4(r, g, b, a)); + } + } +} + +void DecompressBlock(ivec3 coord, uint block_index) { + TexelWeightParams params; + params = DecodeBlockInfo(block_index); + if (params.Error) { + FillError(coord); + return; + } + if (params.VoidExtentHDR) { + FillError(coord); + return; + } + if (params.VoidExtentLDR) { + FillVoidExtentLDR(coord, block_index); + return; + } + if (params.size.x > block_dims.x || params.size.y > block_dims.y) { + FillError(coord); + return; + } + uint num_partitions = StreamBits(2) + 1; + if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { + FillError(coord); + return; + } + int plane_index = -1; + uint partition_index = 1; + uvec4 color_endpoint_mode = uvec4(0); + uint ced_pointer = 0; + uint base_cem = 0; + if (num_partitions == 1) { + color_endpoint_mode[0] = StreamBits(4); + partition_index = 0; + } else { + partition_index = StreamBits(10); + base_cem = StreamBits(6); + } + uint base_mode = base_cem & 3; + uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); + uint remaining_bits = 128 - weight_bits - total_bitsread; + uint extra_cem_bits = 0; + if (base_mode > 0) { + switch (num_partitions) { + case 2: + extra_cem_bits += 2; + break; + case 3: + extra_cem_bits += 5; + break; + case 4: + extra_cem_bits += 8; + break; + default: + return; + } + } + remaining_bits -= extra_cem_bits; + uint plane_selector_bits = 0; + if (params.dual_plane) { + plane_selector_bits = 2; + } + remaining_bits -= plane_selector_bits; + // Read color data... + uint color_data_bits = remaining_bits; + while (remaining_bits > 0) { + uint nb = min(remaining_bits, 8); + uint b = StreamBits(nb); + color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8)); + ced_pointer++; + remaining_bits -= nb; + } + plane_index = int(StreamBits(plane_selector_bits)); + if (base_mode > 0) { + uint extra_cem = StreamBits(extra_cem_bits); + uint cem = (extra_cem << 6) | base_cem; + cem >>= 2; + uint C[4] = {0, 0, 0, 0}; + for (uint i = 0; i < num_partitions; i++) { + C[i] = cem & 1; + cem >>= 1; + } + uint M[4] = {0, 0, 0, 0}; + for (uint i = 0; i < num_partitions; i++) { + M[i] = cem & 3; + cem >>= 2; + } + for (uint i = 0; i < num_partitions; i++) { + color_endpoint_mode[i] = base_mode; + if ((C[i]) == 0) { + color_endpoint_mode[i] -= 1; + } + color_endpoint_mode[i] <<= 2; + color_endpoint_mode[i] |= M[i]; + } + } else if (num_partitions > 1) { + uint cem = base_cem >> 2; + for (uint i = 0; i < num_partitions; i++) { + color_endpoint_mode[i] = cem; + } + } + + uint color_values[32]; // Four values, two endpoints, four maximum paritions + DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits); + uvec4 endpoints[4][2]; + for (uint i = 0; i < num_partitions; i++) { + ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]); + } + for (uint i = 0; i < 16; i++) { + texel_weight_data[i] = local_buff[i]; + } + for (uint i = 0; i < 8; i++) { +#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16 + uint a = REVERSE_BYTE(texel_weight_data[i]); + uint b = REVERSE_BYTE(texel_weight_data[15 - i]); +#undef REVERSE_BYTE + texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8)); + texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8)); + } + uint clear_byte_start = + (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; + texel_weight_data[clear_byte_start - 1] = + texel_weight_data[clear_byte_start - 1] & + uint( + ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); + for (uint i = 0; i < 16 - clear_byte_start; i++) { + texel_weight_data[clear_byte_start + i] = uint(0U); + } + texel_flag = true; // use texel "vector" and bit stream in integer decoding + DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); + uint weights[2][144]; + UnquantizeTexelWeights(weights, params.dual_plane, params.size); + for (uint j = 0; j < block_dims.y; j++) { + for (uint i = 0; i < block_dims.x; i++) { + uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, + (block_dims.y * block_dims.x) < 32); + vec4 p; + uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); + uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); + uvec4 plane_vec = uvec4(0); + uvec4 weight_vec = uvec4(0); + for (uint c = 0; c < 4; c++) { + if (params.dual_plane && (((plane_index + 1) & 3) == c)) { + plane_vec[c] = 1; + } + weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i]; + } + vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6); + p = (Cf / 65535.0); + imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar); + } + } +} + +void main() { + uvec3 pos = gl_GlobalInvocationID + origin; + pos.x <<= bytes_per_block_log2; + + // Read as soon as possible due to its latency + const uint swizzle = SwizzleOffset(pos.xy); + + const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + + uint offset = 0; + offset += layer * layer_stride; + offset += (block_y >> block_height) * block_size; + offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; + offset += swizzle; + + const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination); + const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0)); + uint block_index = + layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; + current_index = 0; + bitsread = 0; + for (int i = 0; i < 16; i++) { + local_buff[i] = ReadTexel(offset + i); + } + DecompressBlock(coord, block_index); +} diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index e028677e98..29105ecad7 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -307,7 +307,8 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array swizzles) { + if (IsPixelFormatASTC(image.info.format)) { + return util_shaders.ASTCDecode(image, map, swizzles); + } switch (image.info.type) { case ImageType::e2D: return util_shaders.BlockLinearUpload2D(image, map, swizzles); @@ -599,6 +603,10 @@ FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal } } +bool TextureCacheRuntime::HasNativeASTC() const noexcept { + return device.HasASTC(); +} + TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) : storage_flags{storage_flags_}, map_flags{map_flags_} {} diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 3fbaa102fc..3c871541b9 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -95,6 +95,8 @@ public: return has_broken_texture_view_formats; } + bool HasNativeASTC() const noexcept; + private: struct StagingBuffers { explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 2fe4799bc0..2a42206619 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -3,7 +3,10 @@ // Refer to the license.txt file included. #include +#include #include +#include +#include #include #include @@ -24,11 +27,13 @@ #include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/texture_cache/types.h" #include "video_core/texture_cache/util.h" +#include "video_core/textures/astc.h" #include "video_core/textures/decoders.h" namespace OpenGL { using namespace HostShaders; +using namespace Tegra::Texture::ASTC; using VideoCommon::Extent3D; using VideoCommon::ImageCopy; @@ -63,13 +68,105 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { - const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); - swizzle_table_buffer.Create(); - glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); + // TODO: Load shader string as a header + std::string astc_path = "astc_decoder.comp"; + std::ifstream t(astc_path); + std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); + astc_decoder_program = MakeProgram(str); + MakeBuffers(); } UtilShaders::~UtilShaders() = default; +void UtilShaders::MakeBuffers() { + const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); + swizzle_table_buffer.Create(); + glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); + + astc_encodings_buffer.Create(); + glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues, + 0); + replicate_6_to_8_buffer.Create(); + glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE), + &REPLICATE_6_BIT_TO_8_TABLE, 0); + replicate_7_to_8_buffer.Create(); + glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE), + &REPLICATE_7_BIT_TO_8_TABLE, 0); + replicate_8_to_8_buffer.Create(); + glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE), + &REPLICATE_8_BIT_TO_8_TABLE, 0); + replicate_byte_to_16_buffer.Create(); + glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE), + &REPLICATE_BYTE_TO_16_TABLE, 0); +} + +void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, + std::span swizzles) { + static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; + static constexpr GLuint BINDING_INPUT_BUFFER = 1; + static constexpr GLuint BINDING_ENC_BUFFER = 2; + + static constexpr GLuint BINDING_6_TO_8_BUFFER = 3; + static constexpr GLuint BINDING_7_TO_8_BUFFER = 4; + static constexpr GLuint BINDING_8_TO_8_BUFFER = 5; + static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; + + static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; + static constexpr GLuint LOC_BLOCK_DIMS = 1; + static constexpr GLuint LOC_LAYER = 2; + + const Extent3D tile_size = { + VideoCore::Surface::DefaultBlockWidth(image.info.format), + VideoCore::Surface::DefaultBlockHeight(image.info.format), + }; + program_manager.BindHostCompute(astc_decoder_program.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, + replicate_6_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, + replicate_7_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, + replicate_8_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, + replicate_byte_to_16_buffer.handle); + + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); + glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + + for (u32 layer = 0; layer < image.info.resources.layers; layer++) { + for (const SwizzleParameters& swizzle : swizzles) { + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE, + layer, GL_WRITE_ONLY, GL_RGBA8); + const size_t input_offset = swizzle.buffer_offset + map.offset; + const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + + glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); + glUniform1ui(LOC_LAYER, layer); + + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + glUniform3uiv(3, 1, params.origin.data()); + glUniform3iv(4, 1, params.destination.data()); + glUniform1ui(5, params.bytes_per_block_log2); + glUniform1ui(6, params.layer_stride); + glUniform1ui(7, params.block_size); + glUniform1ui(8, params.x_shift); + glUniform1ui(9, params.block_height); + glUniform1ui(10, params.block_height_mask); + + // ASTC texture data + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, + input_offset, image.guest_size_bytes - swizzle.buffer_offset); + + glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); + } + } + program_manager.RestoreGuestCompute(); +} + void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 93b009743c..08a1cb9b29 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -40,6 +40,11 @@ public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); + void MakeBuffers(); + + void ASTCDecode(Image& image, const ImageBufferMap& map, + std::span swizzles); + void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span swizzles); @@ -59,7 +64,13 @@ private: ProgramManager& program_manager; OGLBuffer swizzle_table_buffer; + OGLBuffer astc_encodings_buffer; + OGLBuffer replicate_6_to_8_buffer; + OGLBuffer replicate_7_to_8_buffer; + OGLBuffer replicate_8_to_8_buffer; + OGLBuffer replicate_byte_to_16_buffer; + OGLProgram astc_decoder_program; OGLProgram block_linear_unswizzle_2d_program; OGLProgram block_linear_unswizzle_3d_program; OGLProgram pitch_unswizzle_program; diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 9105119bc4..bc8bddaec2 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -8,6 +8,196 @@ namespace Tegra::Texture::ASTC { +/// Count the number of bits set in a number. +constexpr u32 Popcnt(u32 n) { + u32 c = 0; + for (; n; c++) { + n &= n - 1; + } + return c; +} + +enum class IntegerEncoding { JustBits, Qus32, Trit }; + +struct IntegerEncodedValue { + constexpr IntegerEncodedValue() = default; + + constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) + : encoding{encoding_}, num_bits{num_bits_} {} + + constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { + return encoding == other.encoding && num_bits == other.num_bits; + } + + // Returns the number of bits required to encode nVals values. + u32 GetBitLength(u32 nVals) const { + u32 totalBits = num_bits * nVals; + if (encoding == IntegerEncoding::Trit) { + totalBits += (nVals * 8 + 4) / 5; + } else if (encoding == IntegerEncoding::Qus32) { + totalBits += (nVals * 7 + 2) / 3; + } + return totalBits; + } + + IntegerEncoding encoding{}; + u32 num_bits = 0; + u32 bit_value = 0; + union { + u32 qus32_value = 0; + u32 trit_value; + }; +}; + +// Returns a new instance of this struct that corresponds to the +// can take no more than maxval values +static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { + while (maxVal > 0) { + u32 check = maxVal + 1; + + // Is maxVal a power of two? + if (!(check & (check - 1))) { + return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); + } + + // Is maxVal of the type 3*2^n - 1? + if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); + } + + // Is maxVal of the type 5*2^n - 1? + if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); + } + + // Apparently it can't be represented with a bounded integer sequence... + // just iterate. + maxVal--; + } + return IntegerEncodedValue(IntegerEncoding::JustBits, 0); +} + +static constexpr std::array MakeEncodedValues() { + std::array encodings{}; + for (std::size_t i = 0; i < encodings.size(); ++i) { + encodings[i] = CreateEncoding(static_cast(i)); + } + return encodings; +} + +static constexpr std::array EncodingsValues = MakeEncodedValues(); + +// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] +// is the same as [(numBits - 1):0] and repeats all the way down. +template +static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { + if (numBits == 0) { + return 0; + } + if (toBit == 0) { + return 0; + } + const IntType v = val & static_cast((1 << numBits) - 1); + IntType res = v; + u32 reslen = numBits; + while (reslen < toBit) { + u32 comp = 0; + if (numBits > toBit - reslen) { + u32 newshift = toBit - reslen; + comp = numBits - newshift; + numBits = newshift; + } + res = static_cast(res << numBits); + res = static_cast(res | (v >> comp)); + reslen += numBits; + } + return res; +} + +static constexpr std::size_t NumReplicateEntries(u32 num_bits) { + return std::size_t(1) << num_bits; +} + +template +static constexpr auto MakeReplicateTable() { + std::array table{}; + for (IntType value = 0; value < static_cast(std::size(table)); ++value) { + table[value] = Replicate(value, num_bits, to_bit); + } + return table; +} + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateByteTo16(std::size_t value) { + return REPLICATE_BYTE_TO_16_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo7(std::size_t value) { + return REPLICATE_BIT_TO_7_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo9(std::size_t value) { + return REPLICATE_BIT_TO_9_TABLE[value]; +} + +static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); +/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback +/// to the runtime implementation +static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + default: + return Replicate(value, num_bits, 8); + } +} + +static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable(); + +static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + return Replicate(value, num_bits, 6); + } +} + void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height, std::span output); From f6566338ebd6559b0fbe61e1557ee735bf58dcdd Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 13 Feb 2021 15:52:21 -0500 Subject: [PATCH 2/6] host_shaders: Modify shader cmake integration to allow for larger shaders using a raw string to encapsulate the entire shader code limits us to shaders of size less than 2KB. This change overcomes this limitation. --- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/StringShaderHeader.cmake | 22 ++++++++++++++++++- .../host_shaders/source_shader.h.in | 4 +++- .../renderer_opengl/util_shaders.cpp | 8 ++----- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 3494318ca0..2208e19226 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,4 +1,5 @@ set(SHADER_FILES + astc_decoder.comp block_linear_unswizzle_2d.comp block_linear_unswizzle_3d.comp convert_depth_to_float.frag diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake index c0fc49768b..1b4bc6103c 100644 --- a/src/video_core/host_shaders/StringShaderHeader.cmake +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -6,7 +6,27 @@ get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME}) string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) -file(READ ${SOURCE_FILE} CONTENTS) +FILE(READ ${SOURCE_FILE} line_contents) + +# Replace double quotes with single quotes, +# as double quotes will be used to wrap the lines +STRING(REGEX REPLACE "\"" "'" line_contents "${line_contents}") + +# CMake separates list elements with semicolons, but semicolons +# are used extensively in the shader code. +# Replace with a temporary marker, to be reverted later. +STRING(REGEX REPLACE ";" "{{SEMICOLON}}" line_contents "${line_contents}") + +# Make every line an individual element in the CMake list. +STRING(REGEX REPLACE "\n" ";" line_contents "${line_contents}") + +# Build the shader string, wrapping each line in double quotes. +foreach(line IN LISTS line_contents) + string(CONCAT CONTENTS "${CONTENTS}" \"${line}\\n\"\n) +endforeach() + +# Revert the original semicolons in the source. +STRING(REGEX REPLACE "{{SEMICOLON}}" ";" CONTENTS "${CONTENTS}") get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) make_directory(${OUTPUT_DIR}) diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in index ccdb0d2a9c..929dec39bb 100644 --- a/src/video_core/host_shaders/source_shader.h.in +++ b/src/video_core/host_shaders/source_shader.h.in @@ -4,6 +4,8 @@ namespace HostShaders { -constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; +constexpr std::string_view @CONTENTS_NAME@ = { +@CONTENTS@ +}; } // namespace HostShaders diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 2a42206619..d0979dab1a 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -14,6 +14,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/div_ceil.h" +#include "video_core/host_shaders/astc_decoder_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h" #include "video_core/host_shaders/opengl_copy_bc4_comp.h" @@ -62,17 +63,12 @@ size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) { } // Anonymous namespace UtilShaders::UtilShaders(ProgramManager& program_manager_) - : program_manager{program_manager_}, + : program_manager{program_manager_}, astc_decoder_program(MakeProgram(ASTC_DECODER_COMP)), block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)), block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)), pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { - // TODO: Load shader string as a header - std::string astc_path = "astc_decoder.comp"; - std::ifstream t(astc_path); - std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); - astc_decoder_program = MakeProgram(str); MakeBuffers(); } From 20eb368e147e1c27f05d6923c51596f8dfe24e89 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 13 Feb 2021 16:49:24 -0500 Subject: [PATCH 3/6] renderer_vulkan: Accelerate ASTC decoding Co-Authored-By: Rodrigo Locatti --- src/video_core/host_shaders/astc_decoder.comp | 43 +-- .../renderer_vulkan/maxwell_to_vk.cpp | 2 +- .../renderer_vulkan/vk_compute_pass.cpp | 298 ++++++++++++++++++ .../renderer_vulkan/vk_compute_pass.h | 32 ++ .../renderer_vulkan/vk_rasterizer.cpp | 5 +- .../renderer_vulkan/vk_rasterizer.h | 1 + .../renderer_vulkan/vk_texture_cache.cpp | 45 ++- .../renderer_vulkan/vk_texture_cache.h | 12 +- .../texture_cache/accelerated_swizzle.h | 4 +- src/video_core/textures/decoders.cpp | 23 -- src/video_core/textures/decoders.h | 18 +- 11 files changed, 426 insertions(+), 57 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 070190a5ce..2ddac2e1d5 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -16,7 +16,7 @@ #define BINDING_7_TO_8_BUFFER 4 #define BINDING_8_TO_8_BUFFER 5 #define BINDING_BYTE_TO_16_BUFFER 6 -#define BINDING_OUTPUT_IMAGE 3 +#define BINDING_OUTPUT_IMAGE 7 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -85,7 +85,26 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint astc_data[]; }; -layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; + +// ASTC Encodings data +layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { + EncodingData encoding_values[]; +}; +// ASTC Precompiled tables +layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { + uint REPLICATE_6_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { + uint REPLICATE_7_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { + uint REPLICATE_8_BIT_TO_8_TABLE[]; +}; +layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { + uint REPLICATE_BYTE_TO_16_TABLE[]; +}; + +layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2D dest_image; const uint GOB_SIZE_X = 64; const uint GOB_SIZE_Y = 8; @@ -109,23 +128,6 @@ uint ReadTexel(uint offset) { return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); } -// ASTC Encodings data -layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { - EncodingData encoding_values[256]; -}; -// ASTC Precompiled tables -layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { - uint REPLICATE_6_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { - uint REPLICATE_7_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { - uint REPLICATE_8_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { - uint REPLICATE_BYTE_TO_16_TABLE[]; -}; const int BLOCK_SIZE_IN_BYTES = 16; @@ -1275,8 +1277,7 @@ void main() { offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; offset += swizzle; - const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination); - const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0)); + const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0)); uint block_index = layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; current_index = 0; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 19aaf034fa..f088447e94 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -166,7 +166,7 @@ struct FormatTuple { {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT - {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB + {VK_FORMAT_A8B8G8R8_SRGB_PACK32, Attachable}, // A8B8G8R8_SRGB {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM {VK_FORMAT_R8G8_SINT, Attachable | Storage}, // R8G8_SINT diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 2f9a7b0284..7587ab1e05 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -11,18 +11,38 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/div_ceil.h" +#include "video_core/host_shaders/astc_decoder_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" +#include "video_core/texture_cache/accelerated_swizzle.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/astc.h" +#include "video_core/textures/decoders.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { + +using Tegra::Texture::SWIZZLE_TABLE; +using Tegra::Texture::ASTC::EncodingsValues; + namespace { + +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 0; +constexpr u32 ASTC_BINDING_INPUT_BUFFER = 1; +constexpr u32 ASTC_BINDING_ENC_BUFFER = 2; +constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 3; +constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 4; +constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 5; +constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 6; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; + VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { return { .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, @@ -50,6 +70,67 @@ std::array BuildInputOutputDescriptorSetBinding }}; } +std::array BuildASTCDescriptorSetBindings() { + return {{ + { + .binding = ASTC_BINDING_SWIZZLE_BUFFER, // Swizzle buffer + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_INPUT_BUFFER, // ASTC Img data buffer + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_ENC_BUFFER, // Encodings buffer + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_6_TO_8_BUFFER, // BINDING_6_TO_8_BUFFER + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_7_TO_8_BUFFER, // BINDING_7_TO_8_BUFFER + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_8_TO_8_BUFFER, // BINDING_8_TO_8_BUFFER + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, // BINDING_BYTE_TO_16_BUFFER + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = ASTC_BINDING_OUTPUT_IMAGE, // Output image + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + }}; +} + VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { return { .dstBinding = 0, @@ -61,6 +142,90 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { }; } +std::array BuildASTCPassDescriptorUpdateTemplateEntry() { + return {{ + { + .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_INPUT_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 1 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_ENC_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 2 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_6_TO_8_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 3 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_7_TO_8_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 4 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_8_TO_8_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 5 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_BYTE_TO_16_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 6 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .offset = 7 * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + }}; +} + +struct AstcPushConstants { + std::array num_image_blocks; + std::array blocks_dims; + u32 layer; + VideoCommon::Accelerated::BlockLinearSwizzle2DParams params; +}; + +struct AstcBufferData { + decltype(SWIZZLE_TABLE) swizzle_table_buffer = SWIZZLE_TABLE; + decltype(EncodingsValues) encoding_values = EncodingsValues; + decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; + decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; + decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; + decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; +} constexpr ASTC_BUFFER_DATA; } // Anonymous namespace VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, @@ -238,4 +403,137 @@ std::pair QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } +using namespace Tegra::Texture::ASTC; +ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + MemoryAllocator& memory_allocator_) + : VKComputePass(device_, descriptor_pool_, BuildASTCDescriptorSetBindings(), + BuildASTCPassDescriptorUpdateTemplateEntry(), + BuildComputePushConstantRange(sizeof(AstcPushConstants)), + ASTC_DECODER_COMP_SPV), + device{device_}, scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_}, memory_allocator{memory_allocator_} {} + +ASTCDecoderPass::~ASTCDecoderPass() = default; + +void ASTCDecoderPass::MakeDataBuffer() { + data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = sizeof(ASTC_BUFFER_DATA), + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); + + const auto staging_ref = + staging_buffer_pool.Request(sizeof(ASTC_BUFFER_DATA), MemoryUsage::Upload); + std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); + scheduler.Record([src = staging_ref.buffer, dst = *data_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.CopyBuffer(src, dst, + VkBufferCopy{ + .srcOffset = 0, + .dstOffset = 0, + .size = sizeof(ASTC_BUFFER_DATA), + }); + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, + VkMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }, + {}, {}); + }); +} + +void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, + std::span swizzles) { + using namespace VideoCommon::Accelerated; + const VideoCommon::Extent2D tile_size{ + .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), + .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), + }; + scheduler.RequestOutsideRenderPassOperationContext(); + if (!data_buffer) { + MakeDataBuffer(); + } + const std::array block_dims{tile_size.width, tile_size.height}; + for (s32 layer = 0; layer < image.info.resources.layers; layer++) { + for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { + const size_t input_offset = swizzle.buffer_offset + map.offset; + const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height}; + const u32 layer_image_size = + image.guest_size_bytes - static_cast(swizzle.buffer_offset); + + update_descriptor_queue.Acquire(); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, swizzle_table_buffer), + sizeof(AstcBufferData::swizzle_table_buffer)); + update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, encoding_values), + sizeof(AstcBufferData::encoding_values)); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, replicate_6_to_8), + sizeof(AstcBufferData::replicate_6_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, replicate_7_to_8), + sizeof(AstcBufferData::replicate_7_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, replicate_8_to_8), + sizeof(AstcBufferData::replicate_8_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, replicate_byte_to_16), + sizeof(AstcBufferData::replicate_byte_to_16)); + update_descriptor_queue.AddImage(image.StorageImageView()); + + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = map.buffer, + num_dispatches_x, num_dispatches_y, layer_image_size, + num_image_blocks, block_dims, layer, params, set, + image = image.Handle(), input_offset, + aspect_mask = image.AspectMask()](vk::CommandBuffer cmdbuf) { + const AstcPushConstants uniforms{num_image_blocks, block_dims, layer, params}; + + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, 1); + + const VkImageMemoryBarrier image_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier); + }); + } + } +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 17d781d99c..5ea187c301 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -11,14 +11,21 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +namespace VideoCommon { +struct SwizzleParameters; +} + namespace Vulkan { class Device; class StagingBufferPool; class VKScheduler; class VKUpdateDescriptorQueue; +class Image; +struct StagingBufferRef; class VKComputePass { public: @@ -77,4 +84,29 @@ private: VKUpdateDescriptorQueue& update_descriptor_queue; }; +class ASTCDecoderPass final : public VKComputePass { +public: + explicit ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + MemoryAllocator& memory_allocator_); + ~ASTCDecoderPass(); + + void Assemble(Image& image, const StagingBufferRef& map, + std::span swizzles); + +private: + void MakeDataBuffer(); + + const Device& device; + VKScheduler& scheduler; + StagingBufferPool& staging_buffer_pool; + VKUpdateDescriptorQueue& update_descriptor_queue; + MemoryAllocator& memory_allocator; + + vk::Buffer data_buffer; + MemoryCommit data_buffer_commit; +}; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index dfd38f5759..df5b7b172f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -241,7 +241,10 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), update_descriptor_queue(device, scheduler), blit_image(device, scheduler, state_tracker, descriptor_pool), - texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, + astc_decoder_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue, + memory_allocator), + texture_cache_runtime{device, scheduler, memory_allocator, + staging_pool, blit_image, astc_decoder_pass}, texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, update_descriptor_queue, descriptor_pool), diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index acea1ba2dc..235afc6f3b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -173,6 +173,7 @@ private: VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; BlitImageHelper blit_image; + ASTCDecoderPass astc_decoder_pass; GraphicsPipelineCacheKey graphics_key; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 22a1014a99..f7f7445874 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -10,6 +10,7 @@ #include "video_core/engines/fermi_2d.h" #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -807,7 +808,7 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); } if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { - flags |= VideoCommon::ImageFlagBits::Converted; + flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; } if (runtime.device.HasDebuggingToolAttached()) { if (image) { @@ -816,6 +817,34 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ buffer.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } } + static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .pNext = nullptr, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { + storage_image_view = runtime.device.GetLogical().CreateImageView(VkImageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = &storage_image_view_usage_create_info, + .flags = 0, + .image = *image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, + .components{ + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }); + } } void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { @@ -918,7 +947,6 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI } } const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format); - const VkFormat vk_format = format_info.format; const VkImageViewUsageCreateInfo image_view_usage{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, .pNext = nullptr, @@ -930,7 +958,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI .flags = 0, .image = image.Handle(), .viewType = VkImageViewType{}, - .format = vk_format, + .format = format_info.format, .components{ .r = ComponentSwizzle(swizzle[0]), .g = ComponentSwizzle(swizzle[1]), @@ -982,7 +1010,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI .pNext = nullptr, .flags = 0, .buffer = image.Buffer(), - .format = vk_format, + .format = format_info.format, .offset = 0, // TODO: Redesign buffer cache to support this .range = image.guest_size_bytes, }); @@ -1167,4 +1195,13 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span swizzles) { + if (IsPixelFormatASTC(image.info.format)) { + return astc_decoder_pass.Assemble(image, map, swizzles); + } + UNREACHABLE(); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 3aee27ce08..51705eccbe 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -20,6 +20,7 @@ using VideoCommon::Offset2D; using VideoCommon::RenderTargets; using VideoCore::Surface::PixelFormat; +class ASTCDecoderPass; class BlitImageHelper; class Device; class Image; @@ -60,6 +61,7 @@ struct TextureCacheRuntime { MemoryAllocator& memory_allocator; StagingBufferPool& staging_buffer_pool; BlitImageHelper& blit_image_helper; + ASTCDecoderPass& astc_decoder_pass; std::unordered_map renderpass_cache{}; void Finish(); @@ -83,9 +85,7 @@ struct TextureCacheRuntime { } void AccelerateImageUpload(Image&, const StagingBufferRef&, - std::span) { - UNREACHABLE(); - } + std::span); void InsertUploadMemoryBarrier() {} @@ -125,11 +125,17 @@ public: return aspect_mask; } + [[nodiscard]] VkImageView StorageImageView() const noexcept { + return *storage_image_view; + } + private: VKScheduler* scheduler; vk::Image image; vk::Buffer buffer; MemoryCommit commit; + vk::ImageView image_view; + vk::ImageView storage_image_view; VkImageAspectFlags aspect_mask = 0; bool initialized = false; }; diff --git a/src/video_core/texture_cache/accelerated_swizzle.h b/src/video_core/texture_cache/accelerated_swizzle.h index 6ec5c78c46..a11c924e1c 100644 --- a/src/video_core/texture_cache/accelerated_swizzle.h +++ b/src/video_core/texture_cache/accelerated_swizzle.h @@ -13,8 +13,8 @@ namespace VideoCommon::Accelerated { struct BlockLinearSwizzle2DParams { - std::array origin; - std::array destination; + alignas(16) std::array origin; + alignas(16) std::array destination; u32 bytes_per_block_log2; u32 layer_stride; u32 block_size; diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 62685a1834..3a463d5db6 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -17,26 +17,7 @@ #include "video_core/textures/texture.h" namespace Tegra::Texture { - namespace { -/** - * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing. - * Calculates the offset of an (x, y) position within a swizzled texture. - * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188 - */ -constexpr SwizzleTable MakeSwizzleTableConst() { - SwizzleTable table{}; - for (u32 y = 0; y < table.size(); ++y) { - for (u32 x = 0; x < table[0].size(); ++x) { - table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + - (y % 2) * 16 + (x % 16); - } - } - return table; -} - -constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst(); - template void Swizzle(std::span output, std::span input, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { @@ -91,10 +72,6 @@ void Swizzle(std::span output, std::span input, u32 bytes_per_pixe } } // Anonymous namespace -SwizzleTable MakeSwizzleTable() { - return SWIZZLE_TABLE; -} - void UnswizzleTexture(std::span output, std::span input, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index d7cdc81e82..4c14cefbfa 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -23,8 +23,22 @@ constexpr u32 GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_ using SwizzleTable = std::array, GOB_SIZE_Y>; -/// Returns a z-order swizzle table -SwizzleTable MakeSwizzleTable(); +/** + * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing. + * Calculates the offset of an (x, y) position within a swizzled texture. + * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188 + */ +constexpr SwizzleTable MakeSwizzleTable() { + SwizzleTable table{}; + for (u32 y = 0; y < table.size(); ++y) { + for (u32 x = 0; x < table[0].size(); ++x) { + table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + + (y % 2) * 16 + (x % 16); + } + } + return table; +} +constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable(); /// Unswizzles a block linear texture into linear memory. void UnswizzleTexture(std::span output, std::span input, u32 bytes_per_pixel, From c7553abe894ddac84fe8417a12ec51d5ab60dc58 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Tue, 19 Jan 2021 19:54:28 -0500 Subject: [PATCH 4/6] astc_decoder: Fix out of bounds memory access resolves a crash with some anamolous textures found in Astral Chain. --- src/video_core/host_shaders/astc_decoder.comp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 2ddac2e1d5..5be716309a 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -339,6 +339,9 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool sma } uint ReadBit() { + if (current_index >= local_buff.length()) { + return 0; + } uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); bitsread++; total_bitsread++; @@ -1170,12 +1173,17 @@ void DecompressBlock(ivec3 coord, uint block_index) { plane_selector_bits = 2; } remaining_bits -= plane_selector_bits; + if (remaining_bits > 128) { + // Bad data, more remaining bits than 4 bytes + // return early + return; + } // Read color data... uint color_data_bits = remaining_bits; while (remaining_bits > 0) { - uint nb = min(remaining_bits, 8); + int nb = int(min(remaining_bits, 8U)); uint b = StreamBits(nb); - color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8)); + color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); ced_pointer++; remaining_bits -= nb; } From 2f30c105849c214345e2201f4bd6f9b4b76ab4a1 Mon Sep 17 00:00:00 2001 From: Rodrigo Locatti Date: Sat, 13 Feb 2021 16:08:50 -0500 Subject: [PATCH 5/6] astc_decoder: Reimplement Layers Reimplements the approach to decoding layers in the compute shader. Fixes multilayer astc decoding when using Vulkan. --- src/video_core/host_shaders/astc_decoder.comp | 33 ++-- .../renderer_opengl/util_shaders.cpp | 47 +++--- .../renderer_vulkan/vk_compute_pass.cpp | 154 ++++++++++-------- .../renderer_vulkan/vk_texture_cache.cpp | 46 +++--- .../renderer_vulkan/vk_texture_cache.h | 13 +- 5 files changed, 156 insertions(+), 137 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 5be716309a..b903a2d37b 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -39,17 +39,15 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; BEGIN_PUSH_CONSTANTS UNIFORM(0) uvec2 num_image_blocks; UNIFORM(1) uvec2 block_dims; -UNIFORM(2) uint layer; - -UNIFORM(3) uvec3 origin; -UNIFORM(4) ivec3 destination; -UNIFORM(5) uint bytes_per_block_log2; -UNIFORM(6) uint layer_stride; -UNIFORM(7) uint block_size; -UNIFORM(8) uint x_shift; -UNIFORM(9) uint block_height; -UNIFORM(10) uint block_height_mask; +UNIFORM(2) uvec3 origin; +UNIFORM(3) ivec3 destination; +UNIFORM(4) uint bytes_per_block_log2; +UNIFORM(5) uint layer_stride; +UNIFORM(6) uint block_size; +UNIFORM(7) uint x_shift; +UNIFORM(8) uint block_height; +UNIFORM(9) uint block_height_mask; END_PUSH_CONSTANTS uint current_index = 0; @@ -82,7 +80,7 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { uint swizzle_table[]; }; -layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint astc_data[]; }; @@ -104,7 +102,7 @@ layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BY uint REPLICATE_BYTE_TO_16_TABLE[]; }; -layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2D dest_image; +layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; const uint GOB_SIZE_X = 64; const uint GOB_SIZE_Y = 8; @@ -1086,10 +1084,9 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { void FillError(ivec3 coord) { for (uint j = 0; j < block_dims.y; j++) { for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord.xy + ivec2(i, j), vec4(1.0, 1.0, 0.0, 1.0)); + imageStore(dest_image, coord + ivec3(i, j, 0), vec4(1.0, 1.0, 0.0, 1.0)); } } - return; } void FillVoidExtentLDR(ivec3 coord, uint block_index) { @@ -1107,7 +1104,7 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) { float b = float(b_u) / 65535.0f; for (uint j = 0; j < block_dims.y; j++) { for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord.xy + ivec2(i, j), vec4(r, g, b, a)); + imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); } } } @@ -1264,7 +1261,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { } vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6); p = (Cf / 65535.0); - imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar); + imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); } } } @@ -1279,7 +1276,7 @@ void main() { const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; uint offset = 0; - offset += layer * layer_stride; + offset += pos.z * layer_stride; offset += (block_y >> block_height) * block_size; offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; @@ -1287,7 +1284,7 @@ void main() { const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0)); uint block_index = - layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; + pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; current_index = 0; bitsread = 0; for (int i = 0; i < 16; i++) { diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index d0979dab1a..85722c54ad 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -110,7 +110,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; static constexpr GLuint LOC_BLOCK_DIMS = 1; - static constexpr GLuint LOC_LAYER = 2; const Extent3D tile_size = { VideoCore::Surface::DefaultBlockWidth(image.info.format), @@ -130,35 +129,31 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + for (const SwizzleParameters& swizzle : swizzles) { + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, GL_RGBA8); + const size_t input_offset = swizzle.buffer_offset + map.offset; + const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - for (u32 layer = 0; layer < image.info.resources.layers; layer++) { - for (const SwizzleParameters& swizzle : swizzles) { - glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE, - layer, GL_WRITE_ONLY, GL_RGBA8); - const size_t input_offset = swizzle.buffer_offset + map.offset; - const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); - glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); - glUniform1ui(LOC_LAYER, layer); + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + glUniform3uiv(2, 1, params.origin.data()); + glUniform3iv(3, 1, params.destination.data()); + glUniform1ui(4, params.bytes_per_block_log2); + glUniform1ui(5, params.layer_stride); + glUniform1ui(6, params.block_size); + glUniform1ui(7, params.x_shift); + glUniform1ui(8, params.block_height); + glUniform1ui(9, params.block_height_mask); - // To unswizzle the ASTC data - const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - glUniform3uiv(3, 1, params.origin.data()); - glUniform3iv(4, 1, params.destination.data()); - glUniform1ui(5, params.bytes_per_block_log2); - glUniform1ui(6, params.layer_stride); - glUniform1ui(7, params.block_size); - glUniform1ui(8, params.x_shift); - glUniform1ui(9, params.block_height); - glUniform1ui(10, params.block_height_mask); + // ASTC texture data + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); - // ASTC texture data - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, - input_offset, image.guest_size_bytes - swizzle.buffer_offset); - - glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); - } + glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } program_manager.RestoreGuestCompute(); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 7587ab1e05..a0050b68f9 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -31,6 +31,7 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; using Tegra::Texture::ASTC::EncodingsValues; +using namespace Tegra::Texture::ASTC; namespace { @@ -214,7 +215,6 @@ std::array BuildASTCPassDescriptorUpdateT struct AstcPushConstants { std::array num_image_blocks; std::array blocks_dims; - u32 layer; VideoCommon::Accelerated::BlockLinearSwizzle2DParams params; }; @@ -226,6 +226,7 @@ struct AstcBufferData { decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; } constexpr ASTC_BUFFER_DATA; + } // Anonymous namespace VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, @@ -403,7 +404,6 @@ std::pair QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } -using namespace Tegra::Texture::ASTC; ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, @@ -464,76 +464,94 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, if (!data_buffer) { MakeDataBuffer(); } + const VkImageAspectFlags aspect_mask = image.AspectMask(); + const VkImage vk_image = image.Handle(); + const bool is_initialized = image.ExchangeInitialization(); + scheduler.Record([vk_image, aspect_mask, is_initialized](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier image_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = vk_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); + }); const std::array block_dims{tile_size.width, tile_size.height}; - for (s32 layer = 0; layer < image.info.resources.layers; layer++) { - for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { - const size_t input_offset = swizzle.buffer_offset + map.offset; - const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height}; - const u32 layer_image_size = - image.guest_size_bytes - static_cast(swizzle.buffer_offset); + for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { + const size_t input_offset = swizzle.buffer_offset + map.offset; + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + const u32 num_dispatches_z = image.info.resources.layers; + const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height}; + const u32 layer_image_size = + image.guest_size_bytes - static_cast(swizzle.buffer_offset); - update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, swizzle_table_buffer), - sizeof(AstcBufferData::swizzle_table_buffer)); - update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); - update_descriptor_queue.AddImage(image.StorageImageView()); + update_descriptor_queue.Acquire(); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, swizzle_table_buffer), + sizeof(AstcBufferData::swizzle_table_buffer)); + update_descriptor_queue.AddBuffer(map.buffer, input_offset, layer_image_size); + update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), + sizeof(AstcBufferData::encoding_values)); + update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), + sizeof(AstcBufferData::replicate_6_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_7_to_8), + sizeof(AstcBufferData::replicate_7_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_8_to_8), + sizeof(AstcBufferData::replicate_8_to_8)); + update_descriptor_queue.AddBuffer(*data_buffer, + offsetof(AstcBufferData, replicate_byte_to_16), + sizeof(AstcBufferData::replicate_byte_to_16)); + update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); - const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); - // To unswizzle the ASTC data - const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = map.buffer, - num_dispatches_x, num_dispatches_y, layer_image_size, - num_image_blocks, block_dims, layer, params, set, - image = image.Handle(), input_offset, - aspect_mask = image.AspectMask()](vk::CommandBuffer cmdbuf) { - const AstcPushConstants uniforms{num_image_blocks, block_dims, layer, params}; - - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); - cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); - cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, 1); - - const VkImageMemoryBarrier image_barrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, - .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier); - }); - } + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); + const VkPipelineLayout vk_layout = *layout; + const VkPipeline vk_pipeline = *pipeline; + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + scheduler.Record([vk_layout, vk_pipeline, buffer = map.buffer, num_dispatches_x, + num_dispatches_y, num_dispatches_z, num_image_blocks, block_dims, params, + set, input_offset](vk::CommandBuffer cmdbuf) { + const AstcPushConstants uniforms{num_image_blocks, block_dims, params}; + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, vk_layout, 0, set, {}); + cmdbuf.PushConstants(vk_layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, num_dispatches_z); + }); } + scheduler.Record([vk_image, aspect_mask](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier image_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = vk_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f7f7445874..18155e4497 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -823,27 +823,31 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ .usage = VK_IMAGE_USAGE_STORAGE_BIT, }; if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { - storage_image_view = runtime.device.GetLogical().CreateImageView(VkImageViewCreateInfo{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = &storage_image_view_usage_create_info, - .flags = 0, - .image = *image, - .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, - .components{ - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }); + const auto& device = runtime.device.GetLogical(); + storage_image_views.reserve(info.resources.levels); + for (s32 level = 0; level < info.resources.levels; ++level) { + storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = &storage_image_view_usage_create_info, + .flags = 0, + .image = *image, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, + .components{ + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = static_cast(level), + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + })); + } } } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 51705eccbe..628785d5ee 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -121,12 +121,17 @@ public: return *buffer; } - [[nodiscard]] VkImageCreateFlags AspectMask() const noexcept { + [[nodiscard]] VkImageAspectFlags AspectMask() const noexcept { return aspect_mask; } - [[nodiscard]] VkImageView StorageImageView() const noexcept { - return *storage_image_view; + [[nodiscard]] VkImageView StorageImageView(s32 level) const noexcept { + return *storage_image_views[level]; + } + + /// Returns true when the image is already initialized and mark it as initialized + [[nodiscard]] bool ExchangeInitialization() noexcept { + return std::exchange(initialized, true); } private: @@ -135,7 +140,7 @@ private: vk::Buffer buffer; MemoryCommit commit; vk::ImageView image_view; - vk::ImageView storage_image_view; + std::vector storage_image_views; VkImageAspectFlags aspect_mask = 0; bool initialized = false; }; From 2f83d9a61bca42d9ef24074beb2b11b19bd4cecd Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 25 Mar 2021 16:53:51 -0400 Subject: [PATCH 6/6] astc_decoder: Refactor for style and more efficient memory use --- src/video_core/CMakeLists.txt | 1 - src/video_core/host_shaders/astc_decoder.comp | 569 +++--- .../renderer_opengl/gl_texture_cache.cpp | 11 +- .../renderer_opengl/util_shaders.cpp | 102 +- src/video_core/renderer_opengl/util_shaders.h | 8 +- .../renderer_vulkan/vk_compute_pass.cpp | 175 +- src/video_core/texture_cache/util.cpp | 14 +- src/video_core/textures/astc.cpp | 1710 ----------------- src/video_core/textures/astc.h | 172 +- 9 files changed, 504 insertions(+), 2258 deletions(-) delete mode 100644 src/video_core/textures/astc.cpp diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 9b931976a1..47190c4643 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -236,7 +236,6 @@ add_library(video_core STATIC texture_cache/types.h texture_cache/util.cpp texture_cache/util.h - textures/astc.cpp textures/astc.h textures/decoders.cpp textures/decoders.h diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index b903a2d37b..703e345875 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -9,13 +9,13 @@ #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { #define END_PUSH_CONSTANTS }; #define UNIFORM(n) -#define BINDING_SWIZZLE_BUFFER 0 -#define BINDING_INPUT_BUFFER 1 -#define BINDING_ENC_BUFFER 2 -#define BINDING_6_TO_8_BUFFER 3 -#define BINDING_7_TO_8_BUFFER 4 -#define BINDING_8_TO_8_BUFFER 5 -#define BINDING_BYTE_TO_16_BUFFER 6 +#define BINDING_INPUT_BUFFER 0 +#define BINDING_ENC_BUFFER 1 +#define BINDING_6_TO_8_BUFFER 2 +#define BINDING_7_TO_8_BUFFER 3 +#define BINDING_8_TO_8_BUFFER 4 +#define BINDING_BYTE_TO_16_BUFFER 5 +#define BINDING_SWIZZLE_BUFFER 6 #define BINDING_OUTPUT_IMAGE 7 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -37,28 +37,16 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; BEGIN_PUSH_CONSTANTS -UNIFORM(0) uvec2 num_image_blocks; UNIFORM(1) uvec2 block_dims; -UNIFORM(2) uvec3 origin; -UNIFORM(3) ivec3 destination; -UNIFORM(4) uint bytes_per_block_log2; -UNIFORM(5) uint layer_stride; -UNIFORM(6) uint block_size; -UNIFORM(7) uint x_shift; -UNIFORM(8) uint block_height; -UNIFORM(9) uint block_height_mask; +UNIFORM(2) uint bytes_per_block_log2; +UNIFORM(3) uint layer_stride; +UNIFORM(4) uint block_size; +UNIFORM(5) uint x_shift; +UNIFORM(6) uint block_height; +UNIFORM(7) uint block_height_mask; END_PUSH_CONSTANTS -uint current_index = 0; -int bitsread = 0; -uint total_bitsread = 0; -uint local_buff[16]; - -const int JustBits = 0; -const int Quint = 1; -const int Trit = 2; - struct EncodingData { uint encoding; uint num_bits; @@ -68,11 +56,11 @@ struct EncodingData { struct TexelWeightParams { uvec2 size; - bool dual_plane; uint max_weight; - bool Error; - bool VoidExtentLDR; - bool VoidExtentHDR; + bool dual_plane; + bool error_state; + bool void_extent_ldr; + bool void_extent_hdr; }; // Swizzle data @@ -116,6 +104,75 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHI const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); +const int BLOCK_SIZE_IN_BYTES = 16; + +const int BLOCK_INFO_ERROR = 0; +const int BLOCK_INFO_VOID_EXTENT_HDR = 1; +const int BLOCK_INFO_VOID_EXTENT_LDR = 2; +const int BLOCK_INFO_NORMAL = 3; + +const int JUST_BITS = 0; +const int QUINT = 1; +const int TRIT = 2; + +// The following constants are expanded variants of the Replicate() +// function calls corresponding to the following arguments: +// value: index into the generated table +// num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4. +// to_bit: the integer after "TO_" +const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); +const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); + +const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); +const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); +const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); +const uint REPLICATE_4_BIT_TO_8_TABLE[16] = + uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); +const uint REPLICATE_5_BIT_TO_8_TABLE[32] = + uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, + 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); +const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); +const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); +const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); +const uint REPLICATE_4_BIT_TO_6_TABLE[16] = + uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); +const uint REPLICATE_5_BIT_TO_6_TABLE[32] = + uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, + 47, 49, 51, 53, 55, 57, 59, 61, 63); + +// Input ASTC texture globals +uint current_index = 0; +int bitsread = 0; +uint total_bitsread = 0; +uint local_buff[16]; + +// Color data globals +uint color_endpoint_data[16]; +int color_bitsread = 0; +uint total_color_bitsread = 0; +int color_index = 0; + +// Four values, two endpoints, four maximum paritions +uint color_values[32]; +int colvals_index = 0; + +// Weight data globals +uint texel_weight_data[16]; +int texel_bitsread = 0; +uint total_texel_bitsread = 0; +int texel_index = 0; + +bool texel_flag = false; + +// Global "vectors" to be pushed into when decoding +EncodingData result_vector[100]; +int result_index = 0; + +EncodingData texel_vector[100]; +int texel_vector_index = 0; + +uint unquantized_texel_weights[2][144]; + uint SwizzleOffset(uvec2 pos) { pos = pos & SWIZZLE_MASK; return swizzle_table[pos.y * 64 + pos.x]; @@ -126,21 +183,10 @@ uint ReadTexel(uint offset) { return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); } - -const int BLOCK_SIZE_IN_BYTES = 16; - -const int BLOCK_INFO_ERROR = 0; -const int BLOCK_INFO_VOID_EXTENT_HDR = 1; -const int BLOCK_INFO_VOID_EXTENT_LDR = 2; -const int BLOCK_INFO_NORMAL = 3; - -// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] -// is the same as [(numBits - 1):0] and repeats all the way down. +// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] +// is the same as [(num_bits - 1):0] and repeats all the way down. uint Replicate(uint val, uint num_bits, uint to_bit) { - if (num_bits == 0) { - return 0; - } - if (to_bit == 0) { + if (num_bits == 0 || to_bit == 0) { return 0; } const uint v = val & uint((1 << num_bits) - 1); @@ -165,26 +211,14 @@ uvec4 ReplicateByteTo16(uvec4 value) { REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); } -const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); uint ReplicateBitTo7(uint value) { return REPLICATE_BIT_TO_7_TABLE[value]; - ; } -const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); uint ReplicateBitTo9(uint value) { return REPLICATE_1_BIT_TO_9_TABLE[value]; } -const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); -const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); -const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); -const uint REPLICATE_4_BIT_TO_8_TABLE[16] = - uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); -const uint REPLICATE_5_BIT_TO_8_TABLE[32] = - uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, - 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); - uint FastReplicateTo8(uint value, uint num_bits) { switch (num_bits) { case 1: @@ -207,15 +241,6 @@ uint FastReplicateTo8(uint value, uint num_bits) { return Replicate(value, num_bits, 8); } -const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); -const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); -const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); -const uint REPLICATE_4_BIT_TO_6_TABLE[16] = - uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); -const uint REPLICATE_5_BIT_TO_6_TABLE[32] = - uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, - 47, 49, 51, 53, 55, 57, 59, 61, 63); - uint FastReplicateTo6(uint value, uint num_bits) { switch (num_bits) { case 1: @@ -232,7 +257,23 @@ uint FastReplicateTo6(uint value, uint num_bits) { return Replicate(value, num_bits, 6); } -uint hash52(uint p) { +uint Div3Floor(uint v) { + return (v * 0x5556) >> 16; +} + +uint Div3Ceil(uint v) { + return Div3Floor(v + 2); +} + +uint Div5Floor(uint v) { + return (v * 0x3334) >> 16; +} + +uint Div5Ceil(uint v) { + return Div5Floor(v + 4); +} + +uint Hash52(uint p) { p ^= p >> 15; p -= p << 17; p += p << 7; @@ -247,9 +288,9 @@ uint hash52(uint p) { } uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { - if (1 == partition_count) + if (partition_count == 1) { return 0; - + } if (small_block) { x <<= 1; y <<= 1; @@ -258,7 +299,7 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo seed += (partition_count - 1) * 1024; - uint rnum = hash52(uint(seed)); + uint rnum = Hash52(uint(seed)); uint seed1 = uint(rnum & 0xF); uint seed2 = uint((rnum >> 4) & 0xF); uint seed3 = uint((rnum >> 8) & 0xF); @@ -318,18 +359,22 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo c &= 0x3F; d &= 0x3F; - if (partition_count < 4) + if (partition_count < 4) { d = 0; - if (partition_count < 3) + } + if (partition_count < 3) { c = 0; + } - if (a >= b && a >= c && a >= d) + if (a >= b && a >= c && a >= d) { return 0; - else if (b >= c && b >= d) + } else if (b >= c && b >= d) { return 1; - else if (c >= d) + } else if (c >= d) { return 2; - return 3; + } else { + return 3; + } } uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { @@ -341,10 +386,10 @@ uint ReadBit() { return 0; } uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); - bitsread++; - total_bitsread++; + ++bitsread; + ++total_bitsread; if (bitsread == 8) { - current_index++; + ++current_index; bitsread = 0; } return bit; @@ -358,36 +403,22 @@ uint StreamBits(uint num_bits) { return ret; } -// Define color data. -uint color_endpoint_data[16]; -int color_bitsread = 0; -uint total_color_bitsread = 0; -int color_index = 0; - -// Define color data. -uint texel_weight_data[16]; -int texel_bitsread = 0; -uint total_texel_bitsread = 0; -int texel_index = 0; - -bool texel_flag = false; - uint ReadColorBit() { uint bit = 0; if (texel_flag) { bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); - texel_bitsread++; - total_texel_bitsread++; + ++texel_bitsread; + ++total_texel_bitsread; if (texel_bitsread == 8) { - texel_index++; + ++texel_index; texel_bitsread = 0; } } else { bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); - color_bitsread++; - total_color_bitsread++; + ++color_bitsread; + ++total_color_bitsread; if (color_bitsread == 8) { - color_index++; + ++color_index; color_bitsread = 0; } } @@ -402,31 +433,25 @@ uint StreamColorBits(uint num_bits) { return ret; } -EncodingData result_vector[100]; -int result_index = 0; - -EncodingData texel_vector[100]; -int texel_vector_index = 0; - void ResultEmplaceBack(EncodingData val) { if (texel_flag) { texel_vector[texel_vector_index] = val; - texel_vector_index++; + ++texel_vector_index; } else { result_vector[result_index] = val; - result_index++; + ++result_index; } } // Returns the number of bits required to encode n_vals values. uint GetBitLength(uint n_vals, uint encoding_index) { - uint totalBits = encoding_values[encoding_index].num_bits * n_vals; - if (encoding_values[encoding_index].encoding == Trit) { - totalBits += (n_vals * 8 + 4) / 5; - } else if (encoding_values[encoding_index].encoding == Quint) { - totalBits += (n_vals * 7 + 2) / 3; + uint total_bits = encoding_values[encoding_index].num_bits * n_vals; + if (encoding_values[encoding_index].encoding == TRIT) { + total_bits += Div5Ceil(n_vals * 8); + } else if (encoding_values[encoding_index].encoding == QUINT) { + total_bits += Div3Ceil(n_vals * 7); } - return totalBits; + return total_bits; } uint GetNumWeightValues(uvec2 size, bool dual_plane) { @@ -459,7 +484,7 @@ uint BitsOp(uint bits, uint start, uint end) { return ((bits >> start) & mask); } -void DecodeQuintBlock(uint num_bits) { // Value number of bits +void DecodeQuintBlock(uint num_bits) { uint m[3]; uint q[3]; uint Q; @@ -483,7 +508,6 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits q[2] = BitsOp(Q, 5, 6); C = BitsOp(Q, 0, 4); } - if (BitsOp(C, 0, 2) == 5) { q[1] = 4; q[0] = BitsOp(C, 3, 4); @@ -492,10 +516,9 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits q[0] = BitsOp(C, 0, 2); } } - for (uint i = 0; i < 3; i++) { EncodingData val; - val.encoding = Quint; + val.encoding = QUINT; val.num_bits = num_bits; val.bit_value = m[i]; val.quint_trit_value = q[i]; @@ -547,29 +570,28 @@ void DecodeTritBlock(uint num_bits) { } for (uint i = 0; i < 5; i++) { EncodingData val; - val.encoding = Trit; + val.encoding = TRIT; val.num_bits = num_bits; val.bit_value = m[i]; val.quint_trit_value = t[i]; ResultEmplaceBack(val); } } + void DecodeIntegerSequence(uint max_range, uint num_values) { EncodingData val = encoding_values[max_range]; uint vals_decoded = 0; while (vals_decoded < num_values) { switch (val.encoding) { - case Quint: + case QUINT: DecodeQuintBlock(val.num_bits); vals_decoded += 3; break; - - case Trit: + case TRIT: DecodeTritBlock(val.num_bits); vals_decoded += 5; break; - - case JustBits: + case JUST_BITS: val.bit_value = StreamColorBits(val.num_bits); ResultEmplaceBack(val); vals_decoded++; @@ -578,8 +600,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { } } -void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions, - uint color_data_bits) { +void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { uint num_values = 0; for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; @@ -587,21 +608,21 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio int range = 256; while (--range > 0) { EncodingData val = encoding_values[range]; - uint bitLength = GetBitLength(num_values, range); - if (bitLength <= color_data_bits) { + uint bit_length = GetBitLength(num_values, range); + if (bit_length <= color_data_bits) { while (--range > 0) { EncodingData newval = encoding_values[range]; if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { break; } } - range++; + ++range; break; } } DecodeIntegerSequence(range, num_values); uint out_index = 0; - for (int itr = 0; itr < result_index; itr++) { + for (int itr = 0; itr < result_index; ++itr) { if (out_index >= num_values) { break; } @@ -611,77 +632,83 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio uint A = 0, B = 0, C = 0, D = 0; A = ReplicateBitTo9((bitval & 1)); switch (val.encoding) { - case JustBits: + case JUST_BITS: color_values[out_index++] = FastReplicateTo8(bitval, bitlen); break; - case Trit: { + case TRIT: { D = val.quint_trit_value; switch (bitlen) { - case 1: { + case 1: C = 204; - } break; + break; case 2: { C = 93; uint b = (bitval >> 1) & 1; B = (b << 8) | (b << 4) | (b << 2) | (b << 1); - } break; - + break; + } case 3: { C = 44; uint cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 2) | cb; - } break; - + break; + } case 4: { C = 22; uint dcb = (bitval >> 1) & 7; B = (dcb << 6) | dcb; - } break; - + break; + } case 5: { C = 11; uint edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 2); - } break; - + break; + } case 6: { C = 5; uint fedcb = (bitval >> 1) & 0x1F; B = (fedcb << 4) | (fedcb >> 4); - } break; + break; } - } break; - case Quint: { + } + break; + } + case QUINT: { D = val.quint_trit_value; switch (bitlen) { - case 1: { + case 1: C = 113; - } break; + break; case 2: { C = 54; uint b = (bitval >> 1) & 1; B = (b << 8) | (b << 3) | (b << 2); - } break; + break; + } case 3: { C = 26; uint cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 1) | (cb >> 1); - } break; + break; + } case 4: { C = 13; uint dcb = (bitval >> 1) & 7; B = (dcb << 6) | (dcb >> 1); - } break; + break; + } case 5: { C = 6; uint edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 3); - } break; + break; } - } break; + } + break; } - - if (val.encoding != JustBits) { + } + if (val.encoding != JUST_BITS) { uint T = (D * C) + B; T ^= A; T = (A & 0x80) | (T >> 2); @@ -689,30 +716,31 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio } } } + ivec2 BitTransferSigned(int a, int b) { ivec2 transferred; - transferred[1] = b >> 1; - transferred[1] |= a & 0x80; - transferred[0] = a >> 1; - transferred[0] &= 0x3F; - if ((transferred[0] & 0x20) > 0) { - transferred[0] -= 0x40; + transferred.y = b >> 1; + transferred.y |= a & 0x80; + transferred.x = a >> 1; + transferred.x &= 0x3F; + if ((transferred.x & 0x20) > 0) { + transferred.x -= 0x40; } return transferred; } uvec4 ClampByte(ivec4 color) { - for (uint i = 0; i < 4; i++) { + for (uint i = 0; i < 4; ++i) { color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); } return uvec4(color); } + ivec4 BlueContract(int a, int r, int g, int b) { return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); } -int colvals_index = 0; -void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], - uint color_endpoint_mode) { + +void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { #define READ_UINT_VALUES(N) \ uint v[N]; \ for (uint i = 0; i < N; i++) { \ @@ -730,113 +758,120 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32], READ_UINT_VALUES(2) ep1 = uvec4(0xFF, v[0], v[0], v[0]); ep2 = uvec4(0xFF, v[1], v[1], v[1]); - } break; - + break; + } case 1: { READ_UINT_VALUES(2) uint L0 = (v[0] >> 2) | (v[1] & 0xC0); uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); ep1 = uvec4(0xFF, L0, L0, L0); ep2 = uvec4(0xFF, L1, L1, L1); - } break; - + break; + } case 4: { READ_UINT_VALUES(4) ep1 = uvec4(v[2], v[0], v[0], v[0]); ep2 = uvec4(v[3], v[1], v[1], v[1]); - } break; - + break; + } case 5: { READ_INT_VALUES(4) ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred[0]; - v[0] = transferred[1]; + v[1] = transferred.x; + v[0] = transferred.y; transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred[0]; - v[2] = transferred[1]; + v[3] = transferred.x; + v[2] = transferred.y; ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); - ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1])); - } break; - + ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1])); + break; + } case 6: { READ_UINT_VALUES(4) - ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); ep2 = uvec4(0xFF, v[0], v[1], v[2]); - } break; - + break; + } case 8: { READ_UINT_VALUES(6) - if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { ep1 = uvec4(0xFF, v[0], v[2], v[4]); ep2 = uvec4(0xFF, v[1], v[3], v[5]); } else { ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); } - } break; - + break; + } case 9: { READ_INT_VALUES(6) ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred[0]; - v[0] = transferred[1]; + v[1] = transferred.x; + v[0] = transferred.y; transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred[0]; - v[2] = transferred[1]; + v[3] = transferred.x; + v[2] = transferred.y; transferred = BitTransferSigned(v[5], v[4]); - v[5] = transferred[0]; - v[4] = transferred[1]; - if (v[1] + v[3] + v[5] >= 0) { + v[5] = transferred.x; + v[4] = transferred.y; + if ((v[1] + v[3] + v[5]) >= 0) { ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); } else { ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); } - } break; - + break; + } case 10: { READ_UINT_VALUES(6) - ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); ep2 = uvec4(v[5], v[0], v[1], v[2]); - } break; - + break; + } case 12: { READ_UINT_VALUES(8) - if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { ep1 = uvec4(v[6], v[0], v[2], v[4]); ep2 = uvec4(v[7], v[1], v[3], v[5]); } else { ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); } - } break; - + break; + } case 13: { READ_INT_VALUES(8) ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred[0]; - v[0] = transferred[1]; + v[1] = transferred.x; + v[0] = transferred.y; transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred[0]; - v[2] = transferred[1]; + v[3] = transferred.x; + v[2] = transferred.y; transferred = BitTransferSigned(v[5], v[4]); - v[5] = transferred[0]; - v[4] = transferred[1]; + v[5] = transferred.x; + v[4] = transferred.y; transferred = BitTransferSigned(v[7], v[6]); - v[7] = transferred[0]; - v[6] = transferred[1]; + v[7] = transferred.x; + v[6] = transferred.y; - if (v[1] + v[3] + v[5] >= 0) { + if ((v[1] + v[3] + v[5]) >= 0) { ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5])); } else { ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); } - } break; + break; + } + default: { + // HDR mode, or more likely a bug computing the color_endpoint_mode + ep1 = uvec4(0xFF, 0xFF, 0, 0); + ep2 = uvec4(0xFF, 0xFF, 0, 0); + break; + } } #undef READ_UINT_VALUES #undef READ_INT_VALUES @@ -849,52 +884,61 @@ uint UnquantizeTexelWeight(EncodingData val) { uint B = 0, C = 0, D = 0; uint result = 0; switch (val.encoding) { - case JustBits: + case JUST_BITS: result = FastReplicateTo6(bitval, bitlen); break; - case Trit: { + case TRIT: { D = val.quint_trit_value; switch (bitlen) { case 0: { uint results[3] = {0, 32, 63}; result = results[D]; - } break; + break; + } case 1: { C = 50; - } break; + break; + } case 2: { C = 23; uint b = (bitval >> 1) & 1; B = (b << 6) | (b << 2) | b; - } break; + break; + } case 3: { C = 11; uint cb = (bitval >> 1) & 3; B = (cb << 5) | cb; - } break; + break; + } default: break; } - } break; - case Quint: { + break; + } + case QUINT: { D = val.quint_trit_value; switch (bitlen) { case 0: { uint results[5] = {0, 16, 32, 47, 63}; result = results[D]; - } break; + break; + } case 1: { C = 28; - } break; + break; + } case 2: { C = 13; uint b = (bitval >> 1) & 1; B = (b << 6) | (b << 1); - } break; + break; } - } break; + } + break; } - if (val.encoding != JustBits && bitlen > 0) { + } + if (val.encoding != JUST_BITS && bitlen > 0) { result = D * C + B; result ^= A; result = (A & 0x20) | (result >> 2); @@ -905,7 +949,7 @@ uint UnquantizeTexelWeight(EncodingData val) { return result; } -void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) { +void UnquantizeTexelWeights(bool dual_plane, uvec2 size) { uint weight_idx = 0; uint unquantized[2][144]; uint area = size.x * size.y; @@ -921,11 +965,12 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s if (++weight_idx >= (area)) break; } - uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); - uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); - uint kPlaneScale = dual_plane ? 2 : 1; - for (uint plane = 0; plane < kPlaneScale; plane++) - for (uint t = 0; t < block_dims.y; t++) + + const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); + const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); + const uint k_plane_scale = dual_plane ? 2 : 1; + for (uint plane = 0; plane < k_plane_scale; plane++) { + for (uint t = 0; t < block_dims.y; t++) { for (uint s = 0; s < block_dims.x; s++) { uint cs = Ds * s; uint ct = Dt * t; @@ -955,8 +1000,10 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s if ((v0 + size.x + 1) < (area)) { p.w = unquantized[plane][(v0 + size.x + 1)]; } - outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; + unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; } + } + } } int FindLayout(uint mode) { @@ -991,25 +1038,25 @@ int FindLayout(uint mode) { } TexelWeightParams DecodeBlockInfo(uint block_index) { - TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false); + TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); uint mode = StreamBits(11); if ((mode & 0x1ff) == 0x1fc) { if ((mode & 0x200) != 0) { - params.VoidExtentHDR = true; + params.void_extent_hdr = true; } else { - params.VoidExtentLDR = true; + params.void_extent_ldr = true; } if ((mode & 0x400) == 0 || StreamBits(1) == 0) { - params.Error = true; + params.error_state = true; } return params; } if ((mode & 0xf) == 0) { - params.Error = true; + params.error_state = true; return params; } if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { - params.Error = true; + params.error_state = true; return params; } uint A, B; @@ -1060,7 +1107,7 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { params.size = uvec2(A + 6, B + 6); break; default: - params.Error = true; + params.error_state = true; break; } params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); @@ -1089,11 +1136,8 @@ void FillError(ivec3 coord) { } } -void FillVoidExtentLDR(ivec3 coord, uint block_index) { - for (int i = 0; i < 4; i++) { - StreamBits(13); - } - +void FillVoidExtentLDR(ivec3 coord) { + StreamBits(52); uint r_u = StreamBits(16); uint g_u = StreamBits(16); uint b_u = StreamBits(16); @@ -1110,21 +1154,20 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) { } void DecompressBlock(ivec3 coord, uint block_index) { - TexelWeightParams params; - params = DecodeBlockInfo(block_index); - if (params.Error) { + TexelWeightParams params = DecodeBlockInfo(block_index); + if (params.error_state) { FillError(coord); return; } - if (params.VoidExtentHDR) { + if (params.void_extent_hdr) { FillError(coord); return; } - if (params.VoidExtentLDR) { - FillVoidExtentLDR(coord, block_index); + if (params.void_extent_ldr) { + FillVoidExtentLDR(coord); return; } - if (params.size.x > block_dims.x || params.size.y > block_dims.y) { + if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) { FillError(coord); return; } @@ -1139,7 +1182,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { uint ced_pointer = 0; uint base_cem = 0; if (num_partitions == 1) { - color_endpoint_mode[0] = StreamBits(4); + color_endpoint_mode.x = StreamBits(4); partition_index = 0; } else { partition_index = StreamBits(10); @@ -1181,7 +1224,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { int nb = int(min(remaining_bits, 8U)); uint b = StreamBits(nb); color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); - ced_pointer++; + ++ced_pointer; remaining_bits -= nb; } plane_index = int(StreamBits(plane_selector_bits)); @@ -1189,20 +1232,20 @@ void DecompressBlock(ivec3 coord, uint block_index) { uint extra_cem = StreamBits(extra_cem_bits); uint cem = (extra_cem << 6) | base_cem; cem >>= 2; - uint C[4] = {0, 0, 0, 0}; + uvec4 C = uvec4(0); for (uint i = 0; i < num_partitions; i++) { - C[i] = cem & 1; + C[i] = (cem & 1); cem >>= 1; } - uint M[4] = {0, 0, 0, 0}; + uvec4 M = uvec4(0); for (uint i = 0; i < num_partitions; i++) { M[i] = cem & 3; cem >>= 2; } for (uint i = 0; i < num_partitions; i++) { color_endpoint_mode[i] = base_mode; - if ((C[i]) == 0) { - color_endpoint_mode[i] -= 1; + if (C[i] == 0) { + --color_endpoint_mode[i]; } color_endpoint_mode[i] <<= 2; color_endpoint_mode[i] |= M[i]; @@ -1213,13 +1256,13 @@ void DecompressBlock(ivec3 coord, uint block_index) { color_endpoint_mode[i] = cem; } } + DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); - uint color_values[32]; // Four values, two endpoints, four maximum paritions - DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits); uvec4 endpoints[4][2]; for (uint i = 0; i < num_partitions; i++) { - ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]); + ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); } + for (uint i = 0; i < 16; i++) { texel_weight_data[i] = local_buff[i]; } @@ -1238,12 +1281,13 @@ void DecompressBlock(ivec3 coord, uint block_index) { uint( ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); for (uint i = 0; i < 16 - clear_byte_start; i++) { - texel_weight_data[clear_byte_start + i] = uint(0U); + texel_weight_data[clear_byte_start + i] = 0U; } texel_flag = true; // use texel "vector" and bit stream in integer decoding DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); - uint weights[2][144]; - UnquantizeTexelWeights(weights, params.dual_plane, params.size); + + UnquantizeTexelWeights(params.dual_plane, params.size); + for (uint j = 0; j < block_dims.y; j++) { for (uint i = 0; i < block_dims.x; i++) { uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, @@ -1257,9 +1301,9 @@ void DecompressBlock(ivec3 coord, uint block_index) { if (params.dual_plane && (((plane_index + 1) & 3) == c)) { plane_vec[c] = 1; } - weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i]; + weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i]; } - vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6); + vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); p = (Cf / 65535.0); imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); } @@ -1267,7 +1311,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { } void main() { - uvec3 pos = gl_GlobalInvocationID + origin; + uvec3 pos = gl_GlobalInvocationID; pos.x <<= bytes_per_block_log2; // Read as soon as possible due to its latency @@ -1282,9 +1326,10 @@ void main() { offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; offset += swizzle; - const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0)); + const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); uint block_index = - pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x; + pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; + current_index = 0; bitsread = 0; for (int i = 0; i < 16; i++) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 29105ecad7..623b43d8a4 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -307,7 +307,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array swizzles) { - if (IsPixelFormatASTC(image.info.format)) { - return util_shaders.ASTCDecode(image, map, swizzles); - } switch (image.info.type) { case ImageType::e2D: - return util_shaders.BlockLinearUpload2D(image, map, swizzles); + if (IsPixelFormatASTC(image.info.format)) { + return util_shaders.ASTCDecode(image, map, swizzles); + } else { + return util_shaders.BlockLinearUpload2D(image, map, swizzles); + } case ImageType::e3D: return util_shaders.BlockLinearUpload3D(image, map, swizzles); case ImageType::Linear: diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 85722c54ad..47fddcb6e8 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -2,11 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include -#include #include -#include -#include #include #include @@ -24,7 +20,6 @@ #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/util_shaders.h" -#include "video_core/surface.h" #include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/texture_cache/types.h" #include "video_core/texture_cache/util.h" @@ -36,6 +31,7 @@ namespace OpenGL { using namespace HostShaders; using namespace Tegra::Texture::ASTC; +using VideoCommon::Extent2D; using VideoCommon::Extent3D; using VideoCommon::ImageCopy; using VideoCommon::ImageType; @@ -69,33 +65,15 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { - MakeBuffers(); + const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); + swizzle_table_buffer.Create(); + astc_buffer.Create(); + glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); + glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); } UtilShaders::~UtilShaders() = default; -void UtilShaders::MakeBuffers() { - const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); - swizzle_table_buffer.Create(); - glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - - astc_encodings_buffer.Create(); - glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues, - 0); - replicate_6_to_8_buffer.Create(); - glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE), - &REPLICATE_6_BIT_TO_8_TABLE, 0); - replicate_7_to_8_buffer.Create(); - glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE), - &REPLICATE_7_BIT_TO_8_TABLE, 0); - replicate_8_to_8_buffer.Create(); - glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE), - &REPLICATE_8_BIT_TO_8_TABLE, 0); - replicate_byte_to_16_buffer.Create(); - glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE), - &REPLICATE_BYTE_TO_16_TABLE, 0); -} - void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -108,47 +86,51 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; - static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; - static constexpr GLuint LOC_BLOCK_DIMS = 1; - const Extent3D tile_size = { - VideoCore::Surface::DefaultBlockWidth(image.info.format), - VideoCore::Surface::DefaultBlockHeight(image.info.format), + const Extent2D tile_size{ + .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), + .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), }; program_manager.BindHostCompute(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, - replicate_6_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, - replicate_7_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, - replicate_8_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, - replicate_byte_to_16_buffer.handle); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, encoding_values), + sizeof(AstcBufferData::encoding_values)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_6_to_8), + sizeof(AstcBufferData::replicate_6_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_7_to_8), + sizeof(AstcBufferData::replicate_7_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_8_to_8), + sizeof(AstcBufferData::replicate_8_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_byte_to_16), + sizeof(AstcBufferData::replicate_byte_to_16)); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); - glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + glUniform2ui(1, tile_size.width, tile_size.height); + // Ensure buffer data is valid before dispatching + glFlush(); for (const SwizzleParameters& swizzle : swizzles) { + const size_t input_offset = swizzle.buffer_offset + map.offset; + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + ASSERT(params.origin == (std::array{0, 0, 0})); + ASSERT(params.destination == (std::array{0, 0, 0})); + + glUniform1ui(2, params.bytes_per_block_log2); + glUniform1ui(3, params.layer_stride); + glUniform1ui(4, params.block_size); + glUniform1ui(5, params.x_shift); + glUniform1ui(6, params.block_height); + glUniform1ui(7, params.block_height_mask); + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, GL_WRITE_ONLY, GL_RGBA8); - const size_t input_offset = swizzle.buffer_offset + map.offset; - const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - - glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); - - // To unswizzle the ASTC data - const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - glUniform3uiv(2, 1, params.origin.data()); - glUniform3iv(3, 1, params.destination.data()); - glUniform1ui(4, params.bytes_per_block_log2); - glUniform1ui(5, params.layer_stride); - glUniform1ui(6, params.block_size); - glUniform1ui(7, params.x_shift); - glUniform1ui(8, params.block_height); - glUniform1ui(9, params.block_height_mask); - // ASTC texture data glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 08a1cb9b29..53d65f368e 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -40,8 +40,6 @@ public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); - void MakeBuffers(); - void ASTCDecode(Image& image, const ImageBufferMap& map, std::span swizzles); @@ -64,11 +62,7 @@ private: ProgramManager& program_manager; OGLBuffer swizzle_table_buffer; - OGLBuffer astc_encodings_buffer; - OGLBuffer replicate_6_to_8_buffer; - OGLBuffer replicate_7_to_8_buffer; - OGLBuffer replicate_8_to_8_buffer; - OGLBuffer replicate_byte_to_16_buffer; + OGLBuffer astc_buffer; OGLProgram astc_decoder_program; OGLProgram block_linear_unswizzle_2d_program; diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index a0050b68f9..e11406e58f 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -35,13 +35,13 @@ using namespace Tegra::Texture::ASTC; namespace { -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 0; -constexpr u32 ASTC_BINDING_INPUT_BUFFER = 1; -constexpr u32 ASTC_BINDING_ENC_BUFFER = 2; -constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 3; -constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 4; -constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 5; -constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 6; +constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; +constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; +constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 2; +constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 3; +constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 4; +constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 5; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 6; constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { @@ -74,56 +74,56 @@ std::array BuildInputOutputDescriptorSetBinding std::array BuildASTCDescriptorSetBindings() { return {{ { - .binding = ASTC_BINDING_SWIZZLE_BUFFER, // Swizzle buffer + .binding = ASTC_BINDING_INPUT_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_INPUT_BUFFER, // ASTC Img data buffer + .binding = ASTC_BINDING_ENC_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_ENC_BUFFER, // Encodings buffer + .binding = ASTC_BINDING_6_TO_8_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_6_TO_8_BUFFER, // BINDING_6_TO_8_BUFFER + .binding = ASTC_BINDING_7_TO_8_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_7_TO_8_BUFFER, // BINDING_7_TO_8_BUFFER + .binding = ASTC_BINDING_8_TO_8_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_8_TO_8_BUFFER, // BINDING_8_TO_8_BUFFER + .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, // BINDING_BYTE_TO_16_BUFFER + .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_OUTPUT_IMAGE, // Output image + .binding = ASTC_BINDING_OUTPUT_IMAGE, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, @@ -145,20 +145,12 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { std::array BuildASTCPassDescriptorUpdateTemplateEntry() { return {{ - { - .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 0 * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, { .dstBinding = ASTC_BINDING_INPUT_BUFFER, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 1 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -166,7 +158,7 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 2 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -174,7 +166,7 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 3 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_6_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -182,7 +174,7 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 4 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_7_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -190,7 +182,7 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 5 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_8_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -198,7 +190,15 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 6 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_BYTE_TO_16_BUFFER * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, { @@ -206,16 +206,20 @@ std::array BuildASTCPassDescriptorUpdateT .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .offset = 7 * sizeof(DescriptorUpdateEntry), + .offset = ASTC_BINDING_OUTPUT_IMAGE * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, }}; } struct AstcPushConstants { - std::array num_image_blocks; std::array blocks_dims; - VideoCommon::Accelerated::BlockLinearSwizzle2DParams params; + u32 bytes_per_block_log2; + u32 layer_stride; + u32 block_size; + u32 x_shift; + u32 block_height; + u32 block_height_mask; }; struct AstcBufferData { @@ -419,11 +423,12 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_BUFFER_DATA) + sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = sizeof(ASTC_BUFFER_DATA), + .size = TOTAL_BUFFER_SIZE, .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, @@ -431,15 +436,19 @@ void ASTCDecoderPass::MakeDataBuffer() { }); data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); - const auto staging_ref = - staging_buffer_pool.Request(sizeof(ASTC_BUFFER_DATA), MemoryUsage::Upload); + const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); - scheduler.Record([src = staging_ref.buffer, dst = *data_buffer](vk::CommandBuffer cmdbuf) { + // Tack on the swizzle table at the end of the buffer + std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_BUFFER_DATA), &SWIZZLE_TABLE, + sizeof(SWIZZLE_TABLE)); + + scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, + TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src, dst, VkBufferCopy{ - .srcOffset = 0, + .srcOffset = offset, .dstOffset = 0, - .size = sizeof(ASTC_BUFFER_DATA), + .size = TOTAL_BUFFER_SIZE, }); cmdbuf.PipelineBarrier( VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, @@ -448,61 +457,58 @@ void ASTCDecoderPass::MakeDataBuffer() { .pNext = nullptr, .srcAccessMask = 0, .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, - }, - {}, {}); + }); }); } void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, std::span swizzles) { using namespace VideoCommon::Accelerated; - const VideoCommon::Extent2D tile_size{ - .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), - .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), + const std::array block_dims{ + VideoCore::Surface::DefaultBlockWidth(image.info.format), + VideoCore::Surface::DefaultBlockHeight(image.info.format), }; scheduler.RequestOutsideRenderPassOperationContext(); if (!data_buffer) { MakeDataBuffer(); } + const VkPipeline vk_pipeline = *pipeline; const VkImageAspectFlags aspect_mask = image.AspectMask(); const VkImage vk_image = image.Handle(); const bool is_initialized = image.ExchangeInitialization(); - scheduler.Record([vk_image, aspect_mask, is_initialized](vk::CommandBuffer cmdbuf) { - const VkImageMemoryBarrier image_barrier{ - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = vk_image, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - }; - cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); - }); - const std::array block_dims{tile_size.width, tile_size.height}; + scheduler.Record( + [vk_pipeline, vk_image, aspect_mask, is_initialized](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier image_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = vk_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(is_initialized ? VK_PIPELINE_STAGE_ALL_COMMANDS_BIT : 0, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline); + }); for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); const u32 num_dispatches_z = image.info.resources.layers; - const std::array num_image_blocks{swizzle.num_tiles.width, swizzle.num_tiles.height}; - const u32 layer_image_size = - image.guest_size_bytes - static_cast(swizzle.buffer_offset); update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, swizzle_table_buffer), - sizeof(AstcBufferData::swizzle_table_buffer)); - update_descriptor_queue.AddBuffer(map.buffer, input_offset, layer_image_size); + update_descriptor_queue.AddBuffer(map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), sizeof(AstcBufferData::encoding_values)); update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), @@ -514,18 +520,28 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_byte_to_16), sizeof(AstcBufferData::replicate_byte_to_16)); + update_descriptor_queue.AddBuffer(*data_buffer, sizeof(AstcBufferData), + sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); const VkPipelineLayout vk_layout = *layout; - const VkPipeline vk_pipeline = *pipeline; + // To unswizzle the ASTC data const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - scheduler.Record([vk_layout, vk_pipeline, buffer = map.buffer, num_dispatches_x, - num_dispatches_y, num_dispatches_z, num_image_blocks, block_dims, params, - set, input_offset](vk::CommandBuffer cmdbuf) { - const AstcPushConstants uniforms{num_image_blocks, block_dims, params}; - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline); + ASSERT(params.origin == (std::array{0, 0, 0})); + ASSERT(params.destination == (std::array{0, 0, 0})); + scheduler.Record([vk_layout, num_dispatches_x, num_dispatches_y, num_dispatches_z, + block_dims, params, set](vk::CommandBuffer cmdbuf) { + const AstcPushConstants uniforms{ + .blocks_dims = block_dims, + .bytes_per_block_log2 = params.bytes_per_block_log2, + .layer_stride = params.layer_stride, + .block_size = params.block_size, + .x_shift = params.x_shift, + .block_height = params.block_height, + .block_height_mask = params.block_height_mask, + }; cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, vk_layout, 0, set, {}); cmdbuf.PushConstants(vk_layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); cmdbuf.Dispatch(num_dispatches_x, num_dispatches_y, num_dispatches_z); @@ -550,7 +566,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; - cmdbuf.PipelineBarrier(0, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); }); } diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 2c42d1449e..c22dd01489 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -47,7 +47,6 @@ #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/util.h" -#include "video_core/textures/astc.h" #include "video_core/textures/decoders.h" namespace VideoCommon { @@ -879,17 +878,8 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span. -// - -#include -#include -#include -#include -#include - -#include - -#include "common/common_types.h" - -#include "video_core/textures/astc.h" - -namespace { - -/// Count the number of bits set in a number. -constexpr u32 Popcnt(u32 n) { - u32 c = 0; - for (; n; c++) { - n &= n - 1; - } - return c; -} - -} // Anonymous namespace - -class InputBitStream { -public: - constexpr explicit InputBitStream(std::span data, size_t start_offset = 0) - : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {} - - constexpr size_t GetBitsRead() const { - return bits_read; - } - - constexpr bool ReadBit() { - if (bits_read >= total_bits * 8) { - return 0; - } - const bool bit = ((*cur_byte >> next_bit) & 1) != 0; - ++next_bit; - while (next_bit >= 8) { - next_bit -= 8; - ++cur_byte; - } - ++bits_read; - return bit; - } - - constexpr u32 ReadBits(std::size_t nBits) { - u32 ret = 0; - for (std::size_t i = 0; i < nBits; ++i) { - ret |= (ReadBit() & 1) << i; - } - return ret; - } - - template - constexpr u32 ReadBits() { - u32 ret = 0; - for (std::size_t i = 0; i < nBits; ++i) { - ret |= (ReadBit() & 1) << i; - } - return ret; - } - -private: - const u8* cur_byte; - size_t total_bits = 0; - size_t next_bit = 0; - size_t bits_read = 0; -}; - -class OutputBitStream { -public: - constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0) - : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {} - - constexpr std::size_t GetBitsWritten() const { - return bits_written; - } - - constexpr void WriteBitsR(u32 val, u32 nBits) { - for (u32 i = 0; i < nBits; i++) { - WriteBit((val >> (nBits - i - 1)) & 1); - } - } - - constexpr void WriteBits(u32 val, u32 nBits) { - for (u32 i = 0; i < nBits; i++) { - WriteBit((val >> i) & 1); - } - } - -private: - constexpr void WriteBit(bool b) { - if (bits_written >= num_bits) { - return; - } - - const u32 mask = 1 << next_bit++; - - // clear the bit - *cur_byte &= static_cast(~mask); - - // Write the bit, if necessary - if (b) - *cur_byte |= static_cast(mask); - - // Next byte? - if (next_bit >= 8) { - cur_byte += 1; - next_bit = 0; - } - } - - u8* cur_byte; - std::size_t num_bits; - std::size_t bits_written = 0; - std::size_t next_bit = 0; -}; - -template -class Bits { -public: - explicit Bits(const IntType& v) : m_Bits(v) {} - - Bits(const Bits&) = delete; - Bits& operator=(const Bits&) = delete; - - u8 operator[](u32 bitPos) const { - return static_cast((m_Bits >> bitPos) & 1); - } - - IntType operator()(u32 start, u32 end) const { - if (start == end) { - return (*this)[start]; - } else if (start > end) { - u32 t = start; - start = end; - end = t; - } - - u64 mask = (1 << (end - start + 1)) - 1; - return (m_Bits >> start) & static_cast(mask); - } - -private: - const IntType& m_Bits; -}; - -enum class IntegerEncoding { JustBits, Qus32, Trit }; - -struct IntegerEncodedValue { - constexpr IntegerEncodedValue() = default; - - constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) - : encoding{encoding_}, num_bits{num_bits_} {} - - constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { - return encoding == other.encoding && num_bits == other.num_bits; - } - - // Returns the number of bits required to encode nVals values. - u32 GetBitLength(u32 nVals) const { - u32 totalBits = num_bits * nVals; - if (encoding == IntegerEncoding::Trit) { - totalBits += (nVals * 8 + 4) / 5; - } else if (encoding == IntegerEncoding::Qus32) { - totalBits += (nVals * 7 + 2) / 3; - } - return totalBits; - } - - IntegerEncoding encoding{}; - u32 num_bits = 0; - u32 bit_value = 0; - union { - u32 qus32_value = 0; - u32 trit_value; - }; -}; -using IntegerEncodedVector = boost::container::static_vector< - IntegerEncodedValue, 256, - boost::container::static_vector_options< - boost::container::inplace_alignment, - boost::container::throw_on_overflow>::type>; - -static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { - // Implement the algorithm in section C.2.12 - std::array m; - std::array t; - u32 T; - - // Read the trit encoded block according to - // table C.2.14 - m[0] = bits.ReadBits(nBitsPerValue); - T = bits.ReadBits<2>(); - m[1] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits<2>() << 2; - m[2] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 4; - m[3] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits<2>() << 5; - m[4] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 7; - - u32 C = 0; - - Bits Tb(T); - if (Tb(2, 4) == 7) { - C = (Tb(5, 7) << 2) | Tb(0, 1); - t[4] = t[3] = 2; - } else { - C = Tb(0, 4); - if (Tb(5, 6) == 3) { - t[4] = 2; - t[3] = Tb[7]; - } else { - t[4] = Tb[7]; - t[3] = Tb(5, 6); - } - } - - Bits Cb(C); - if (Cb(0, 1) == 3) { - t[2] = 2; - t[1] = Cb[4]; - t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); - } else if (Cb(2, 3) == 3) { - t[2] = 2; - t[1] = 2; - t[0] = Cb(0, 1); - } else { - t[2] = Cb[4]; - t[1] = Cb(2, 3); - t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); - } - - for (std::size_t i = 0; i < 5; ++i) { - IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue); - val.bit_value = m[i]; - val.trit_value = t[i]; - } -} - -static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result, - u32 nBitsPerValue) { - // Implement the algorithm in section C.2.12 - u32 m[3]; - u32 q[3]; - u32 Q; - - // Read the trit encoded block according to - // table C.2.15 - m[0] = bits.ReadBits(nBitsPerValue); - Q = bits.ReadBits<3>(); - m[1] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits<2>() << 3; - m[2] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits<2>() << 5; - - Bits Qb(Q); - if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { - q[0] = q[1] = 4; - q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); - } else { - u32 C = 0; - if (Qb(1, 2) == 3) { - q[2] = 4; - C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; - } else { - q[2] = Qb(5, 6); - C = Qb(0, 4); - } - - Bits Cb(C); - if (Cb(0, 2) == 5) { - q[1] = 4; - q[0] = Cb(3, 4); - } else { - q[1] = Cb(3, 4); - q[0] = Cb(0, 2); - } - } - - for (std::size_t i = 0; i < 3; ++i) { - IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue); - val.bit_value = m[i]; - val.qus32_value = q[i]; - } -} - -// Returns a new instance of this struct that corresponds to the -// can take no more than maxval values -static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { - while (maxVal > 0) { - u32 check = maxVal + 1; - - // Is maxVal a power of two? - if (!(check & (check - 1))) { - return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); - } - - // Is maxVal of the type 3*2^n - 1? - if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); - } - - // Is maxVal of the type 5*2^n - 1? - if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); - } - - // Apparently it can't be represented with a bounded integer sequence... - // just iterate. - maxVal--; - } - return IntegerEncodedValue(IntegerEncoding::JustBits, 0); -} - -static constexpr std::array MakeEncodedValues() { - std::array encodings{}; - for (std::size_t i = 0; i < encodings.size(); ++i) { - encodings[i] = CreateEncoding(static_cast(i)); - } - return encodings; -} - -static constexpr std::array EncodingsValues = MakeEncodedValues(); - -// Fills result with the values that are encoded in the given -// bitstream. We must know beforehand what the maximum possible -// value is, and how many values we're decoding. -static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, - u32 nValues) { - // Determine encoding parameters - IntegerEncodedValue val = EncodingsValues[maxRange]; - - // Start decoding - u32 nValsDecoded = 0; - while (nValsDecoded < nValues) { - switch (val.encoding) { - case IntegerEncoding::Qus32: - DecodeQus32Block(bits, result, val.num_bits); - nValsDecoded += 3; - break; - - case IntegerEncoding::Trit: - DecodeTritBlock(bits, result, val.num_bits); - nValsDecoded += 5; - break; - - case IntegerEncoding::JustBits: - val.bit_value = bits.ReadBits(val.num_bits); - result.push_back(val); - nValsDecoded++; - break; - } - } -} - -namespace ASTCC { - -struct TexelWeightParams { - u32 m_Width = 0; - u32 m_Height = 0; - bool m_bDualPlane = false; - u32 m_MaxWeight = 0; - bool m_bError = false; - bool m_bVoidExtentLDR = false; - bool m_bVoidExtentHDR = false; - - u32 GetPackedBitSize() const { - // How many indices do we have? - u32 nIdxs = m_Height * m_Width; - if (m_bDualPlane) { - nIdxs *= 2; - } - - return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs); - } - - u32 GetNumWeightValues() const { - u32 ret = m_Width * m_Height; - if (m_bDualPlane) { - ret *= 2; - } - return ret; - } -}; - -static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { - TexelWeightParams params; - - // Read the entire block mode all at once - u16 modeBits = static_cast(strm.ReadBits<11>()); - - // Does this match the void extent block mode? - if ((modeBits & 0x01FF) == 0x1FC) { - if (modeBits & 0x200) { - params.m_bVoidExtentHDR = true; - } else { - params.m_bVoidExtentLDR = true; - } - - // Next two bits must be one. - if (!(modeBits & 0x400) || !strm.ReadBit()) { - params.m_bError = true; - } - - return params; - } - - // First check if the last four bits are zero - if ((modeBits & 0xF) == 0) { - params.m_bError = true; - return params; - } - - // If the last two bits are zero, then if bits - // [6-8] are all ones, this is also reserved. - if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) { - params.m_bError = true; - return params; - } - - // Otherwise, there is no error... Figure out the layout - // of the block mode. Layout is determined by a number - // between 0 and 9 corresponding to table C.2.8 of the - // ASTC spec. - u32 layout = 0; - - if ((modeBits & 0x1) || (modeBits & 0x2)) { - // layout is in [0-4] - if (modeBits & 0x8) { - // layout is in [2-4] - if (modeBits & 0x4) { - // layout is in [3-4] - if (modeBits & 0x100) { - layout = 4; - } else { - layout = 3; - } - } else { - layout = 2; - } - } else { - // layout is in [0-1] - if (modeBits & 0x4) { - layout = 1; - } else { - layout = 0; - } - } - } else { - // layout is in [5-9] - if (modeBits & 0x100) { - // layout is in [7-9] - if (modeBits & 0x80) { - // layout is in [7-8] - assert((modeBits & 0x40) == 0U); - if (modeBits & 0x20) { - layout = 8; - } else { - layout = 7; - } - } else { - layout = 9; - } - } else { - // layout is in [5-6] - if (modeBits & 0x80) { - layout = 6; - } else { - layout = 5; - } - } - } - - assert(layout < 10); - - // Determine R - u32 R = !!(modeBits & 0x10); - if (layout < 5) { - R |= (modeBits & 0x3) << 1; - } else { - R |= (modeBits & 0xC) >> 1; - } - assert(2 <= R && R <= 7); - - // Determine width & height - switch (layout) { - case 0: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 7) & 0x3; - params.m_Width = B + 4; - params.m_Height = A + 2; - break; - } - - case 1: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 7) & 0x3; - params.m_Width = B + 8; - params.m_Height = A + 2; - break; - } - - case 2: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 7) & 0x3; - params.m_Width = A + 2; - params.m_Height = B + 8; - break; - } - - case 3: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 7) & 0x1; - params.m_Width = A + 2; - params.m_Height = B + 6; - break; - } - - case 4: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 7) & 0x1; - params.m_Width = B + 2; - params.m_Height = A + 2; - break; - } - - case 5: { - u32 A = (modeBits >> 5) & 0x3; - params.m_Width = 12; - params.m_Height = A + 2; - break; - } - - case 6: { - u32 A = (modeBits >> 5) & 0x3; - params.m_Width = A + 2; - params.m_Height = 12; - break; - } - - case 7: { - params.m_Width = 6; - params.m_Height = 10; - break; - } - - case 8: { - params.m_Width = 10; - params.m_Height = 6; - break; - } - - case 9: { - u32 A = (modeBits >> 5) & 0x3; - u32 B = (modeBits >> 9) & 0x3; - params.m_Width = A + 6; - params.m_Height = B + 6; - break; - } - - default: - assert(false && "Don't know this layout..."); - params.m_bError = true; - break; - } - - // Determine whether or not we're using dual planes - // and/or high precision layouts. - bool D = (layout != 9) && (modeBits & 0x400); - bool H = (layout != 9) && (modeBits & 0x200); - - if (H) { - const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; - params.m_MaxWeight = maxWeights[R - 2]; - } else { - const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; - params.m_MaxWeight = maxWeights[R - 2]; - } - - params.m_bDualPlane = D; - - return params; -} - -static void FillVoidExtentLDR(InputBitStream& strm, std::span outBuf, u32 blockWidth, - u32 blockHeight) { - // Don't actually care about the void extent, just read the bits... - for (s32 i = 0; i < 4; ++i) { - strm.ReadBits<13>(); - } - - // Decode the RGBA components and renormalize them to the range [0, 255] - u16 r = static_cast(strm.ReadBits<16>()); - u16 g = static_cast(strm.ReadBits<16>()); - u16 b = static_cast(strm.ReadBits<16>()); - u16 a = static_cast(strm.ReadBits<16>()); - - u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast(b) & 0xFF00) << 8 | - (static_cast(a) & 0xFF00) << 16; - - for (u32 j = 0; j < blockHeight; j++) { - for (u32 i = 0; i < blockWidth; i++) { - outBuf[j * blockWidth + i] = rgba; - } - } -} - -static void FillError(std::span outBuf, u32 blockWidth, u32 blockHeight) { - for (u32 j = 0; j < blockHeight; j++) { - for (u32 i = 0; i < blockWidth; i++) { - outBuf[j * blockWidth + i] = 0xFFFF00FF; - } - } -} - -// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] -// is the same as [(numBits - 1):0] and repeats all the way down. -template -static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { - if (numBits == 0) { - return 0; - } - if (toBit == 0) { - return 0; - } - const IntType v = val & static_cast((1 << numBits) - 1); - IntType res = v; - u32 reslen = numBits; - while (reslen < toBit) { - u32 comp = 0; - if (numBits > toBit - reslen) { - u32 newshift = toBit - reslen; - comp = numBits - newshift; - numBits = newshift; - } - res = static_cast(res << numBits); - res = static_cast(res | (v >> comp)); - reslen += numBits; - } - return res; -} - -static constexpr std::size_t NumReplicateEntries(u32 num_bits) { - return std::size_t(1) << num_bits; -} - -template -static constexpr auto MakeReplicateTable() { - std::array table{}; - for (IntType value = 0; value < static_cast(std::size(table)); ++value) { - table[value] = Replicate(value, num_bits, to_bit); - } - return table; -} - -static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateByteTo16(std::size_t value) { - return REPLICATE_BYTE_TO_16_TABLE[value]; -} - -static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateBitTo7(std::size_t value) { - return REPLICATE_BIT_TO_7_TABLE[value]; -} - -static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateBitTo9(std::size_t value) { - return REPLICATE_BIT_TO_9_TABLE[value]; -} - -static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); -/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback -/// to the runtime implementation -static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_8_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_8_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_8_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_8_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_8_TABLE[value]; - case 6: - return REPLICATE_6_BIT_TO_8_TABLE[value]; - case 7: - return REPLICATE_7_BIT_TO_8_TABLE[value]; - case 8: - return REPLICATE_8_BIT_TO_8_TABLE[value]; - default: - return Replicate(value, num_bits, 8); - } -} - -static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_6_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_6_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_6_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_6_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_6_TABLE[value]; - default: - return Replicate(value, num_bits, 6); - } -} - -class Pixel { -protected: - using ChannelType = s16; - u8 m_BitDepth[4] = {8, 8, 8, 8}; - s16 color[4] = {}; - -public: - Pixel() = default; - Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8) - : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, - color{static_cast(a), static_cast(r), - static_cast(g), static_cast(b)} {} - - // Changes the depth of each pixel. This scales the values to - // the appropriate bit depth by either truncating the least - // significant bits when going from larger to smaller bit depth - // or by repeating the most significant bits when going from - // smaller to larger bit depths. - void ChangeBitDepth() { - for (u32 i = 0; i < 4; i++) { - Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); - m_BitDepth[i] = 8; - } - } - - template - static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { - float denominator = static_cast((1 << bitDepth) - 1); - return static_cast(channel) / denominator; - } - - // Changes the bit depth of a single component. See the comment - // above for how we do this. - static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { - assert(oldDepth <= 8); - - if (oldDepth == 8) { - // Do nothing - return val; - } else if (oldDepth == 0) { - return static_cast((1 << 8) - 1); - } else if (8 > oldDepth) { - return static_cast(FastReplicateTo8(static_cast(val), oldDepth)); - } else { - // oldDepth > newDepth - const u8 bitsWasted = static_cast(oldDepth - 8); - u16 v = static_cast(val); - v = static_cast((v + (1 << (bitsWasted - 1))) >> bitsWasted); - v = ::std::min(::std::max(0, v), static_cast((1 << 8) - 1)); - return static_cast(v); - } - - assert(false && "We shouldn't get here."); - return 0; - } - - const ChannelType& A() const { - return color[0]; - } - ChannelType& A() { - return color[0]; - } - const ChannelType& R() const { - return color[1]; - } - ChannelType& R() { - return color[1]; - } - const ChannelType& G() const { - return color[2]; - } - ChannelType& G() { - return color[2]; - } - const ChannelType& B() const { - return color[3]; - } - ChannelType& B() { - return color[3]; - } - const ChannelType& Component(u32 idx) const { - return color[idx]; - } - ChannelType& Component(u32 idx) { - return color[idx]; - } - - void GetBitDepth(u8 (&outDepth)[4]) const { - for (s32 i = 0; i < 4; i++) { - outDepth[i] = m_BitDepth[i]; - } - } - - // Take all of the components, transform them to their 8-bit variants, - // and then pack each channel into an R8G8B8A8 32-bit integer. We assume - // that the architecture is little-endian, so the alpha channel will end - // up in the most-significant byte. - u32 Pack() const { - Pixel eightBit(*this); - eightBit.ChangeBitDepth(); - - u32 r = 0; - r |= eightBit.A(); - r <<= 8; - r |= eightBit.B(); - r <<= 8; - r |= eightBit.G(); - r <<= 8; - r |= eightBit.R(); - return r; - } - - // Clamps the pixel to the range [0,255] - void ClampByte() { - for (u32 i = 0; i < 4; i++) { - color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); - } - } - - void MakeOpaque() { - A() = 255; - } -}; - -static void DecodeColorValues(u32* out, std::span data, const u32* modes, const u32 nPartitions, - const u32 nBitsForColorData) { - // First figure out how many color values we have - u32 nValues = 0; - for (u32 i = 0; i < nPartitions; i++) { - nValues += ((modes[i] >> 2) + 1) << 1; - } - - // Then based on the number of values and the remaining number of bits, - // figure out the max value for each of them... - u32 range = 256; - while (--range > 0) { - IntegerEncodedValue val = EncodingsValues[range]; - u32 bitLength = val.GetBitLength(nValues); - if (bitLength <= nBitsForColorData) { - // Find the smallest possible range that matches the given encoding - while (--range > 0) { - IntegerEncodedValue newval = EncodingsValues[range]; - if (!newval.MatchesEncoding(val)) { - break; - } - } - - // Return to last matching range. - range++; - break; - } - } - - // We now have enough to decode our integer sequence. - IntegerEncodedVector decodedColorValues; - - InputBitStream colorStream(data, 0); - DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); - - // Once we have the decoded values, we need to dequantize them to the 0-255 range - // This procedure is outlined in ASTC spec C.2.13 - u32 outIdx = 0; - for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { - // Have we already decoded all that we need? - if (outIdx >= nValues) { - break; - } - - const IntegerEncodedValue& val = *itr; - u32 bitlen = val.num_bits; - u32 bitval = val.bit_value; - - assert(bitlen >= 1); - - u32 A = 0, B = 0, C = 0, D = 0; - // A is just the lsb replicated 9 times. - A = ReplicateBitTo9(bitval & 1); - - switch (val.encoding) { - // Replicate bits - case IntegerEncoding::JustBits: - out[outIdx++] = FastReplicateTo8(bitval, bitlen); - break; - - // Use algorithm in C.2.13 - case IntegerEncoding::Trit: { - - D = val.trit_value; - - switch (bitlen) { - case 1: { - C = 204; - } break; - - case 2: { - C = 93; - // B = b000b0bb0 - u32 b = (bitval >> 1) & 1; - B = (b << 8) | (b << 4) | (b << 2) | (b << 1); - } break; - - case 3: { - C = 44; - // B = cb000cbcb - u32 cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 2) | cb; - } break; - - case 4: { - C = 22; - // B = dcb000dcb - u32 dcb = (bitval >> 1) & 7; - B = (dcb << 6) | dcb; - } break; - - case 5: { - C = 11; - // B = edcb000ed - u32 edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 2); - } break; - - case 6: { - C = 5; - // B = fedcb000f - u32 fedcb = (bitval >> 1) & 0x1F; - B = (fedcb << 4) | (fedcb >> 4); - } break; - - default: - assert(false && "Unsupported trit encoding for color values!"); - break; - } // switch(bitlen) - } // case IntegerEncoding::Trit - break; - - case IntegerEncoding::Qus32: { - - D = val.qus32_value; - - switch (bitlen) { - case 1: { - C = 113; - } break; - - case 2: { - C = 54; - // B = b0000bb00 - u32 b = (bitval >> 1) & 1; - B = (b << 8) | (b << 3) | (b << 2); - } break; - - case 3: { - C = 26; - // B = cb0000cbc - u32 cb = (bitval >> 1) & 3; - B = (cb << 7) | (cb << 1) | (cb >> 1); - } break; - - case 4: { - C = 13; - // B = dcb0000dc - u32 dcb = (bitval >> 1) & 7; - B = (dcb << 6) | (dcb >> 1); - } break; - - case 5: { - C = 6; - // B = edcb0000e - u32 edcb = (bitval >> 1) & 0xF; - B = (edcb << 5) | (edcb >> 3); - } break; - - default: - assert(false && "Unsupported quint encoding for color values!"); - break; - } // switch(bitlen) - } // case IntegerEncoding::Qus32 - break; - } // switch(val.encoding) - - if (val.encoding != IntegerEncoding::JustBits) { - u32 T = D * C + B; - T ^= A; - T = (A & 0x80) | (T >> 2); - out[outIdx++] = T; - } - } - - // Make sure that each of our values is in the proper range... - for (u32 i = 0; i < nValues; i++) { - assert(out[i] <= 255); - } -} - -static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { - u32 bitval = val.bit_value; - u32 bitlen = val.num_bits; - - u32 A = ReplicateBitTo7(bitval & 1); - u32 B = 0, C = 0, D = 0; - - u32 result = 0; - switch (val.encoding) { - case IntegerEncoding::JustBits: - result = FastReplicateTo6(bitval, bitlen); - break; - - case IntegerEncoding::Trit: { - D = val.trit_value; - assert(D < 3); - - switch (bitlen) { - case 0: { - u32 results[3] = {0, 32, 63}; - result = results[D]; - } break; - - case 1: { - C = 50; - } break; - - case 2: { - C = 23; - u32 b = (bitval >> 1) & 1; - B = (b << 6) | (b << 2) | b; - } break; - - case 3: { - C = 11; - u32 cb = (bitval >> 1) & 3; - B = (cb << 5) | cb; - } break; - - default: - assert(false && "Invalid trit encoding for texel weight"); - break; - } - } break; - - case IntegerEncoding::Qus32: { - D = val.qus32_value; - assert(D < 5); - - switch (bitlen) { - case 0: { - u32 results[5] = {0, 16, 32, 47, 63}; - result = results[D]; - } break; - - case 1: { - C = 28; - } break; - - case 2: { - C = 13; - u32 b = (bitval >> 1) & 1; - B = (b << 6) | (b << 1); - } break; - - default: - assert(false && "Invalid quint encoding for texel weight"); - break; - } - } break; - } - - if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { - // Decode the value... - result = D * C + B; - result ^= A; - result = (A & 0x20) | (result >> 2); - } - - assert(result < 64); - - // Change from [0,63] to [0,64] - if (result > 32) { - result += 1; - } - - return result; -} - -static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, - const TexelWeightParams& params, const u32 blockWidth, - const u32 blockHeight) { - u32 weightIdx = 0; - u32 unquantized[2][144]; - - for (auto itr = weights.begin(); itr != weights.end(); ++itr) { - unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); - - if (params.m_bDualPlane) { - ++itr; - unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr); - if (itr == weights.end()) { - break; - } - } - - if (++weightIdx >= (params.m_Width * params.m_Height)) - break; - } - - // Do infill if necessary (Section C.2.18) ... - u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); - u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); - - const u32 kPlaneScale = params.m_bDualPlane ? 2U : 1U; - for (u32 plane = 0; plane < kPlaneScale; plane++) - for (u32 t = 0; t < blockHeight; t++) - for (u32 s = 0; s < blockWidth; s++) { - u32 cs = Ds * s; - u32 ct = Dt * t; - - u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; - u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; - - u32 js = gs >> 4; - u32 fs = gs & 0xF; - - u32 jt = gt >> 4; - u32 ft = gt & 0x0F; - - u32 w11 = (fs * ft + 8) >> 4; - u32 w10 = ft - w11; - u32 w01 = fs - w11; - u32 w00 = 16 - fs - ft + w11; - - u32 v0 = js + jt * params.m_Width; - -#define FIND_TEXEL(tidx, bidx) \ - u32 p##bidx = 0; \ - do { \ - if ((tidx) < (params.m_Width * params.m_Height)) { \ - p##bidx = unquantized[plane][(tidx)]; \ - } \ - } while (0) - - FIND_TEXEL(v0, 00); - FIND_TEXEL(v0 + 1, 01); - FIND_TEXEL(v0 + params.m_Width, 10); - FIND_TEXEL(v0 + params.m_Width + 1, 11); - -#undef FIND_TEXEL - - out[plane][t * blockWidth + s] = - (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4; - } -} - -// Transfers a bit as described in C.2.14 -static inline void BitTransferSigned(s32& a, s32& b) { - b >>= 1; - b |= a & 0x80; - a >>= 1; - a &= 0x3F; - if (a & 0x20) - a -= 0x40; -} - -// Adds more precision to the blue channel as described -// in C.2.14 -static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { - return Pixel(static_cast(a), static_cast((r + b) >> 1), - static_cast((g + b) >> 1), static_cast(b)); -} - -// Partition selection functions as specified in -// C.2.21 -static inline u32 hash52(u32 p) { - p ^= p >> 15; - p -= p << 17; - p += p << 7; - p += p << 4; - p ^= p >> 5; - p += p << 16; - p ^= p >> 7; - p ^= p >> 3; - p ^= p << 6; - p ^= p >> 17; - return p; -} - -static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { - if (1 == partitionCount) - return 0; - - if (smallBlock) { - x <<= 1; - y <<= 1; - z <<= 1; - } - - seed += (partitionCount - 1) * 1024; - - u32 rnum = hash52(static_cast(seed)); - u8 seed1 = static_cast(rnum & 0xF); - u8 seed2 = static_cast((rnum >> 4) & 0xF); - u8 seed3 = static_cast((rnum >> 8) & 0xF); - u8 seed4 = static_cast((rnum >> 12) & 0xF); - u8 seed5 = static_cast((rnum >> 16) & 0xF); - u8 seed6 = static_cast((rnum >> 20) & 0xF); - u8 seed7 = static_cast((rnum >> 24) & 0xF); - u8 seed8 = static_cast((rnum >> 28) & 0xF); - u8 seed9 = static_cast((rnum >> 18) & 0xF); - u8 seed10 = static_cast((rnum >> 22) & 0xF); - u8 seed11 = static_cast((rnum >> 26) & 0xF); - u8 seed12 = static_cast(((rnum >> 30) | (rnum << 2)) & 0xF); - - seed1 = static_cast(seed1 * seed1); - seed2 = static_cast(seed2 * seed2); - seed3 = static_cast(seed3 * seed3); - seed4 = static_cast(seed4 * seed4); - seed5 = static_cast(seed5 * seed5); - seed6 = static_cast(seed6 * seed6); - seed7 = static_cast(seed7 * seed7); - seed8 = static_cast(seed8 * seed8); - seed9 = static_cast(seed9 * seed9); - seed10 = static_cast(seed10 * seed10); - seed11 = static_cast(seed11 * seed11); - seed12 = static_cast(seed12 * seed12); - - s32 sh1, sh2, sh3; - if (seed & 1) { - sh1 = (seed & 2) ? 4 : 5; - sh2 = (partitionCount == 3) ? 6 : 5; - } else { - sh1 = (partitionCount == 3) ? 6 : 5; - sh2 = (seed & 2) ? 4 : 5; - } - sh3 = (seed & 0x10) ? sh1 : sh2; - - seed1 = static_cast(seed1 >> sh1); - seed2 = static_cast(seed2 >> sh2); - seed3 = static_cast(seed3 >> sh1); - seed4 = static_cast(seed4 >> sh2); - seed5 = static_cast(seed5 >> sh1); - seed6 = static_cast(seed6 >> sh2); - seed7 = static_cast(seed7 >> sh1); - seed8 = static_cast(seed8 >> sh2); - seed9 = static_cast(seed9 >> sh3); - seed10 = static_cast(seed10 >> sh3); - seed11 = static_cast(seed11 >> sh3); - seed12 = static_cast(seed12 >> sh3); - - s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); - s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); - s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); - s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); - - a &= 0x3F; - b &= 0x3F; - c &= 0x3F; - d &= 0x3F; - - if (partitionCount < 4) - d = 0; - if (partitionCount < 3) - c = 0; - - if (a >= b && a >= c && a >= d) - return 0; - else if (b >= c && b >= d) - return 1; - else if (c >= d) - return 2; - return 3; -} - -static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { - return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); -} - -// Section C.2.14 -static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, - u32 colorEndpos32Mode) { -#define READ_UINT_VALUES(N) \ - u32 v[N]; \ - for (u32 i = 0; i < N; i++) { \ - v[i] = *(colorValues++); \ - } - -#define READ_INT_VALUES(N) \ - s32 v[N]; \ - for (u32 i = 0; i < N; i++) { \ - v[i] = static_cast(*(colorValues++)); \ - } - - switch (colorEndpos32Mode) { - case 0: { - READ_UINT_VALUES(2) - ep1 = Pixel(0xFF, v[0], v[0], v[0]); - ep2 = Pixel(0xFF, v[1], v[1], v[1]); - } break; - - case 1: { - READ_UINT_VALUES(2) - u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); - u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); - ep1 = Pixel(0xFF, L0, L0, L0); - ep2 = Pixel(0xFF, L1, L1, L1); - } break; - - case 4: { - READ_UINT_VALUES(4) - ep1 = Pixel(v[2], v[0], v[0], v[0]); - ep2 = Pixel(v[3], v[1], v[1], v[1]); - } break; - - case 5: { - READ_INT_VALUES(4) - BitTransferSigned(v[1], v[0]); - BitTransferSigned(v[3], v[2]); - ep1 = Pixel(v[2], v[0], v[0], v[0]); - ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]); - ep1.ClampByte(); - ep2.ClampByte(); - } break; - - case 6: { - READ_UINT_VALUES(4) - ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); - ep2 = Pixel(0xFF, v[0], v[1], v[2]); - } break; - - case 8: { - READ_UINT_VALUES(6) - if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { - ep1 = Pixel(0xFF, v[0], v[2], v[4]); - ep2 = Pixel(0xFF, v[1], v[3], v[5]); - } else { - ep1 = BlueContract(0xFF, v[1], v[3], v[5]); - ep2 = BlueContract(0xFF, v[0], v[2], v[4]); - } - } break; - - case 9: { - READ_INT_VALUES(6) - BitTransferSigned(v[1], v[0]); - BitTransferSigned(v[3], v[2]); - BitTransferSigned(v[5], v[4]); - if (v[1] + v[3] + v[5] >= 0) { - ep1 = Pixel(0xFF, v[0], v[2], v[4]); - ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); - } else { - ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); - ep2 = BlueContract(0xFF, v[0], v[2], v[4]); - } - ep1.ClampByte(); - ep2.ClampByte(); - } break; - - case 10: { - READ_UINT_VALUES(6) - ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); - ep2 = Pixel(v[5], v[0], v[1], v[2]); - } break; - - case 12: { - READ_UINT_VALUES(8) - if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { - ep1 = Pixel(v[6], v[0], v[2], v[4]); - ep2 = Pixel(v[7], v[1], v[3], v[5]); - } else { - ep1 = BlueContract(v[7], v[1], v[3], v[5]); - ep2 = BlueContract(v[6], v[0], v[2], v[4]); - } - } break; - - case 13: { - READ_INT_VALUES(8) - BitTransferSigned(v[1], v[0]); - BitTransferSigned(v[3], v[2]); - BitTransferSigned(v[5], v[4]); - BitTransferSigned(v[7], v[6]); - if (v[1] + v[3] + v[5] >= 0) { - ep1 = Pixel(v[6], v[0], v[2], v[4]); - ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]); - } else { - ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]); - ep2 = BlueContract(v[6], v[0], v[2], v[4]); - } - ep1.ClampByte(); - ep2.ClampByte(); - } break; - - default: - assert(false && "Unsupported color endpoint mode (is it HDR?)"); - break; - } - -#undef READ_UINT_VALUES -#undef READ_INT_VALUES -} - -static void DecompressBlock(std::span inBuf, const u32 blockWidth, - const u32 blockHeight, std::span outBuf) { - InputBitStream strm(inBuf); - TexelWeightParams weightParams = DecodeBlockInfo(strm); - - // Was there an error? - if (weightParams.m_bError) { - assert(false && "Invalid block mode"); - FillError(outBuf, blockWidth, blockHeight); - return; - } - - if (weightParams.m_bVoidExtentLDR) { - FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight); - return; - } - - if (weightParams.m_bVoidExtentHDR) { - assert(false && "HDR void extent blocks are unsupported!"); - FillError(outBuf, blockWidth, blockHeight); - return; - } - - if (weightParams.m_Width > blockWidth) { - assert(false && "Texel weight grid width should be smaller than block width"); - FillError(outBuf, blockWidth, blockHeight); - return; - } - - if (weightParams.m_Height > blockHeight) { - assert(false && "Texel weight grid height should be smaller than block height"); - FillError(outBuf, blockWidth, blockHeight); - return; - } - - // Read num partitions - u32 nPartitions = strm.ReadBits<2>() + 1; - assert(nPartitions <= 4); - - if (nPartitions == 4 && weightParams.m_bDualPlane) { - assert(false && "Dual plane mode is incompatible with four partition blocks"); - FillError(outBuf, blockWidth, blockHeight); - return; - } - - // Based on the number of partitions, read the color endpos32 mode for - // each partition. - - // Determine partitions, partition index, and color endpos32 modes - s32 planeIdx = -1; - u32 partitionIndex; - u32 colorEndpos32Mode[4] = {0, 0, 0, 0}; - - // Define color data. - u8 colorEndpos32Data[16]; - memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data)); - OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0); - - // Read extra config data... - u32 baseCEM = 0; - if (nPartitions == 1) { - colorEndpos32Mode[0] = strm.ReadBits<4>(); - partitionIndex = 0; - } else { - partitionIndex = strm.ReadBits<10>(); - baseCEM = strm.ReadBits<6>(); - } - u32 baseMode = (baseCEM & 3); - - // Remaining bits are color endpos32 data... - u32 nWeightBits = weightParams.GetPackedBitSize(); - s32 remainingBits = 128 - nWeightBits - static_cast(strm.GetBitsRead()); - - // Consider extra bits prior to texel data... - u32 extraCEMbits = 0; - if (baseMode) { - switch (nPartitions) { - case 2: - extraCEMbits += 2; - break; - case 3: - extraCEMbits += 5; - break; - case 4: - extraCEMbits += 8; - break; - default: - assert(false); - break; - } - } - remainingBits -= extraCEMbits; - - // Do we have a dual plane situation? - u32 planeSelectorBits = 0; - if (weightParams.m_bDualPlane) { - planeSelectorBits = 2; - } - remainingBits -= planeSelectorBits; - - // Read color data... - u32 colorDataBits = remainingBits; - while (remainingBits > 0) { - u32 nb = std::min(remainingBits, 8); - u32 b = strm.ReadBits(nb); - colorEndpos32Stream.WriteBits(b, nb); - remainingBits -= 8; - } - - // Read the plane selection bits - planeIdx = strm.ReadBits(planeSelectorBits); - - // Read the rest of the CEM - if (baseMode) { - u32 extraCEM = strm.ReadBits(extraCEMbits); - u32 CEM = (extraCEM << 6) | baseCEM; - CEM >>= 2; - - bool C[4] = {0}; - for (u32 i = 0; i < nPartitions; i++) { - C[i] = CEM & 1; - CEM >>= 1; - } - - u8 M[4] = {0}; - for (u32 i = 0; i < nPartitions; i++) { - M[i] = CEM & 3; - CEM >>= 2; - assert(M[i] <= 3); - } - - for (u32 i = 0; i < nPartitions; i++) { - colorEndpos32Mode[i] = baseMode; - if (!(C[i])) - colorEndpos32Mode[i] -= 1; - colorEndpos32Mode[i] <<= 2; - colorEndpos32Mode[i] |= M[i]; - } - } else if (nPartitions > 1) { - u32 CEM = baseCEM >> 2; - for (u32 i = 0; i < nPartitions; i++) { - colorEndpos32Mode[i] = CEM; - } - } - - // Make sure everything up till here is sane. - for (u32 i = 0; i < nPartitions; i++) { - assert(colorEndpos32Mode[i] < 16); - } - assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128); - - // Decode both color data and texel weight data - u32 colorValues[32]; // Four values, two endpos32s, four maximum paritions - DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions, - colorDataBits); - - Pixel endpos32s[4][2]; - const u32* colorValuesPtr = colorValues; - for (u32 i = 0; i < nPartitions; i++) { - ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]); - } - - // Read the texel weight data.. - std::array texelWeightData; - std::ranges::copy(inBuf, texelWeightData.begin()); - - // Reverse everything - for (u32 i = 0; i < 8; i++) { -// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits -#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32 - u8 a = static_cast(REVERSE_BYTE(texelWeightData[i])); - u8 b = static_cast(REVERSE_BYTE(texelWeightData[15 - i])); -#undef REVERSE_BYTE - - texelWeightData[i] = b; - texelWeightData[15 - i] = a; - } - - // Make sure that higher non-texel bits are set to zero - const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; - if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) { - texelWeightData[clearByteStart - 1] &= - static_cast((1 << (weightParams.GetPackedBitSize() % 8)) - 1); - std::memset(texelWeightData.data() + clearByteStart, 0, - std::min(16U - clearByteStart, 16U)); - } - - IntegerEncodedVector texelWeightValues; - - InputBitStream weightStream(texelWeightData); - - DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight, - weightParams.GetNumWeightValues()); - - // Blocks can be at most 12x12, so we can have as many as 144 weights - u32 weights[2][144]; - UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight); - - // Now that we have endpos32s and weights, we can s32erpolate and generate - // the proper decoding... - for (u32 j = 0; j < blockHeight; j++) - for (u32 i = 0; i < blockWidth; i++) { - u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, - (blockHeight * blockWidth) < 32); - assert(partition < nPartitions); - - Pixel p; - for (u32 c = 0; c < 4; c++) { - u32 C0 = endpos32s[partition][0].Component(c); - C0 = ReplicateByteTo16(C0); - u32 C1 = endpos32s[partition][1].Component(c); - C1 = ReplicateByteTo16(C1); - - u32 plane = 0; - if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { - plane = 1; - } - - u32 weight = weights[plane][j * blockWidth + i]; - u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; - if (C == 65535) { - p.Component(c) = 255; - } else { - double Cf = static_cast(C); - p.Component(c) = static_cast(255.0 * (Cf / 65536.0) + 0.5); - } - } - - outBuf[j * blockWidth + i] = p.Pack(); - } -} - -} // namespace ASTCC - -namespace Tegra::Texture::ASTC { - -void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, - uint32_t block_width, uint32_t block_height, std::span output) { - u32 block_index = 0; - std::size_t depth_offset = 0; - for (u32 z = 0; z < depth; z++) { - for (u32 y = 0; y < height; y += block_height) { - for (u32 x = 0; x < width; x += block_width) { - const std::span blockPtr{data.subspan(block_index * 16, 16)}; - - // Blocks can be at most 12x12 - std::array uncompData; - ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - - u32 decompWidth = std::min(block_width, width - x); - u32 decompHeight = std::min(block_height, height - y); - - const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); - for (u32 jj = 0; jj < decompHeight; jj++) { - std::memcpy(outRow.data() + jj * width * 4, - uncompData.data() + jj * block_width, decompWidth * 4); - } - ++block_index; - } - } - depth_offset += height * width * 4; - } -} - -} // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index bc8bddaec2..c1c73fda57 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -4,20 +4,12 @@ #pragma once -#include +#include +#include "common/common_types.h" namespace Tegra::Texture::ASTC { -/// Count the number of bits set in a number. -constexpr u32 Popcnt(u32 n) { - u32 c = 0; - for (; n; c++) { - n &= n - 1; - } - return c; -} - -enum class IntegerEncoding { JustBits, Qus32, Trit }; +enum class IntegerEncoding { JustBits, Quint, Trit }; struct IntegerEncodedValue { constexpr IntegerEncodedValue() = default; @@ -29,55 +21,55 @@ struct IntegerEncodedValue { return encoding == other.encoding && num_bits == other.num_bits; } - // Returns the number of bits required to encode nVals values. - u32 GetBitLength(u32 nVals) const { - u32 totalBits = num_bits * nVals; + // Returns the number of bits required to encode num_vals values. + u32 GetBitLength(u32 num_vals) const { + u32 total_bits = num_bits * num_vals; if (encoding == IntegerEncoding::Trit) { - totalBits += (nVals * 8 + 4) / 5; - } else if (encoding == IntegerEncoding::Qus32) { - totalBits += (nVals * 7 + 2) / 3; + total_bits += (num_vals * 8 + 4) / 5; + } else if (encoding == IntegerEncoding::Quint) { + total_bits += (num_vals * 7 + 2) / 3; } - return totalBits; + return total_bits; } IntegerEncoding encoding{}; u32 num_bits = 0; u32 bit_value = 0; union { - u32 qus32_value = 0; + u32 quint_value = 0; u32 trit_value; }; }; // Returns a new instance of this struct that corresponds to the -// can take no more than maxval values -static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { - while (maxVal > 0) { - u32 check = maxVal + 1; +// can take no more than mav_value values +constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { + while (mav_value > 0) { + u32 check = mav_value + 1; - // Is maxVal a power of two? + // Is mav_value a power of two? if (!(check & (check - 1))) { - return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); + return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); } - // Is maxVal of the type 3*2^n - 1? + // Is mav_value of the type 3*2^n - 1? if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); + return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); } - // Is maxVal of the type 5*2^n - 1? + // Is mav_value of the type 5*2^n - 1? if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); + return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); } // Apparently it can't be represented with a bounded integer sequence... // just iterate. - maxVal--; + mav_value--; } return IntegerEncodedValue(IntegerEncoding::JustBits, 0); } -static constexpr std::array MakeEncodedValues() { +constexpr std::array MakeEncodedValues() { std::array encodings{}; for (std::size_t i = 0; i < encodings.size(); ++i) { encodings[i] = CreateEncoding(static_cast(i)); @@ -85,41 +77,38 @@ static constexpr std::array MakeEncodedValues() { return encodings; } -static constexpr std::array EncodingsValues = MakeEncodedValues(); +constexpr std::array EncodingsValues = MakeEncodedValues(); -// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] -// is the same as [(numBits - 1):0] and repeats all the way down. +// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] +// is the same as [(num_bits - 1):0] and repeats all the way down. template -static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { - if (numBits == 0) { +constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { + if (num_bits == 0 || to_bit == 0) { return 0; } - if (toBit == 0) { - return 0; - } - const IntType v = val & static_cast((1 << numBits) - 1); + const IntType v = val & static_cast((1 << num_bits) - 1); IntType res = v; - u32 reslen = numBits; - while (reslen < toBit) { + u32 reslen = num_bits; + while (reslen < to_bit) { u32 comp = 0; - if (numBits > toBit - reslen) { - u32 newshift = toBit - reslen; - comp = numBits - newshift; - numBits = newshift; + if (num_bits > to_bit - reslen) { + u32 newshift = to_bit - reslen; + comp = num_bits - newshift; + num_bits = newshift; } - res = static_cast(res << numBits); + res = static_cast(res << num_bits); res = static_cast(res | (v >> comp)); - reslen += numBits; + reslen += num_bits; } return res; } -static constexpr std::size_t NumReplicateEntries(u32 num_bits) { +constexpr std::size_t NumReplicateEntries(u32 num_bits) { return std::size_t(1) << num_bits; } template -static constexpr auto MakeReplicateTable() { +constexpr auto MakeReplicateTable() { std::array table{}; for (IntType value = 0; value < static_cast(std::size(table)); ++value) { table[value] = Replicate(value, num_bits, to_bit); @@ -127,78 +116,17 @@ static constexpr auto MakeReplicateTable() { return table; } -static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateByteTo16(std::size_t value) { - return REPLICATE_BYTE_TO_16_TABLE[value]; -} +constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); +constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); +constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); +constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateBitTo7(std::size_t value) { - return REPLICATE_BIT_TO_7_TABLE[value]; -} - -static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable(); -static constexpr u32 ReplicateBitTo9(std::size_t value) { - return REPLICATE_BIT_TO_9_TABLE[value]; -} - -static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); -/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback -/// to the runtime implementation -static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_8_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_8_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_8_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_8_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_8_TABLE[value]; - case 6: - return REPLICATE_6_BIT_TO_8_TABLE[value]; - case 7: - return REPLICATE_7_BIT_TO_8_TABLE[value]; - case 8: - return REPLICATE_8_BIT_TO_8_TABLE[value]; - default: - return Replicate(value, num_bits, 8); - } -} - -static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable(); -static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable(); - -static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_6_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_6_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_6_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_6_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_6_TABLE[value]; - default: - return Replicate(value, num_bits, 6); - } -} - -void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, - uint32_t block_width, uint32_t block_height, std::span output); +struct AstcBufferData { + decltype(EncodingsValues) encoding_values = EncodingsValues; + decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; + decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; + decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; + decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; +} constexpr ASTC_BUFFER_DATA; } // namespace Tegra::Texture::ASTC