// Source path: yuzu-android/src/video_core/host_shaders/astc_decoder.comp
//
// File-viewer metadata captured with this listing:
// 1188 lines, 36 KiB
// Plaintext

// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
// The macros below hide how shader parameters and resource bindings are declared
// depending on whether VULKAN is defined when this shader is compiled.
#ifdef VULKAN
// With VULKAN defined, all parameters live in one push-constant block, so UNIFORM(n)
// expands to nothing and BEGIN/END wrap the members in the block.
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
// Otherwise every parameter is a standalone uniform with an explicit location and the
// BEGIN/END markers expand to nothing. Buffer and image both use binding index 0.
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
// 8x8x1 workgroup; main() decodes one compressed block per invocation.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
// Parameters supplied by the host (push constants or plain uniforms, see the macros above).
BEGIN_PUSH_CONSTANTS
// Texel footprint (width, height) of one compressed block in the destination image.
UNIFORM(1) uvec2 block_dims;
// Multiplied by the Z invocation index in main() to reach a layer's data.
UNIFORM(2) uint layer_stride;
// The remaining values feed the input offset computation in main().
// NOTE(review): they look like block-linear (GOB) layout parameters - confirm with the host code.
UNIFORM(3) uint block_size;
UNIFORM(4) uint x_shift;
UNIFORM(5) uint block_height;
UNIFORM(6) uint block_height_mask;
END_PUSH_CONSTANTS
// One decoded integer-sequence element packed into a single 32-bit word.
// The accessor functions in this file use this layout:
//   bits  0..7  : encoding kind (JUST_BITS, QUINT or TRIT)
//   bits  8..15 : number of plain bits
//   bits 16..23 : value of the plain bits
//   bits 24..31 : quint or trit value
struct EncodingData {
uint data;
};
// Compressed input: main() reads one uvec4 (16 bytes) per compressed block.
layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
uvec4 astc_data[];
};
// Decoded output: an RGBA8 2D array image written with imageStore.
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image;
// Shift amounts used by main() when computing the input offset of a block.
// NOTE(review): presumably log2 of a 64-byte x 8-row GOB - confirm against the host layout code.
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
// main() shifts the X invocation index left by this amount (16 bytes per compressed block).
const uint BYTES_PER_BLOCK_LOG2 = 4;
// Encoding kinds stored in the low byte of EncodingData.
const uint JUST_BITS = 0u;
const uint QUINT = 1u;
const uint TRIT = 2u;
// ASTC Encodings data, sorted in ascending order based on their BitLength value
// (see GetBitLength() function)
// Each entry is an EncodingData word: the encoding kind in the low byte and the number
// of plain bits shifted into bits 8..15.
const uint encoding_values[22] = uint[](
(JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u)),
(QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u)),
(TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u)),
(JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)),
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
// Input ASTC texture globals
// total_bitsread is the read cursor (in bits) used by StreamBits()/SkipBits() over local_buff,
// which holds the 128-bit compressed block loaded in main().
int total_bitsread = 0;
uvec4 local_buff;
// Color data globals
// color_endpoint_data is the payload consumed by StreamColorBits(); color_bitsread is its
// read cursor in bits. DecompressBlock() reuses both for the weight stream.
uvec4 color_endpoint_data;
int color_bitsread = 0;
// Global "vector" to be pushed into when decoding
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
// Ceiling division. Arguments and the whole expansion are parenthesized so the macro
// composes correctly with compound arguments and inside larger expressions.
#define DIVCEIL(number, divisor) (((number) + (divisor) - 1) / (divisor))
// Maximum number of values in one plane (12 x 12 block footprint).
#define ARRAY_NUM_ELEMENTS 144
// Number of 4-component vectors needed to hold two planes of values.
#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
// Storage for decoded EncodingData words (and later the unquantized weights).
uint result_vector[ARRAY_NUM_ELEMENTS * 2];
// Number of entries currently stored in result_vector.
int result_index = 0;
// Per-phase cap on result_index; set by DecompressBlock() before each decode phase.
uint result_vector_max_index;
// Set by ResultEmplaceBack() once the cap is hit so decoding loops can stop early.
bool result_limit_reached = false;
// EncodingData helpers
// Reads the encoding kind stored in bits 0..7 of the packed word.
uint Encoding(EncodingData val) {
    return val.data & 0xFFu;
}
// Reads the plain-bit count stored in bits 8..15 of the packed word.
uint NumBits(EncodingData val) {
    return (val.data >> 8u) & 0xFFu;
}
// Reads the plain-bit value stored in bits 16..23 of the packed word.
uint BitValue(EncodingData val) {
    return (val.data >> 16u) & 0xFFu;
}
// Reads the quint/trit value stored in bits 24..31 of the packed word.
uint QuintTritValue(EncodingData val) {
    return (val.data >> 24u) & 0xFFu;
}
// Overwrites bits 0..7 of the packed word with the low byte of v.
void Encoding(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu;
    val.data = (val.data & ~field_mask) | (v & field_mask);
}
// Overwrites bits 8..15 of the packed word with the low byte of v.
void NumBits(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 8u;
    val.data = (val.data & ~field_mask) | ((v << 8u) & field_mask);
}
// Overwrites bits 16..23 of the packed word with the low byte of v.
void BitValue(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 16u;
    val.data = (val.data & ~field_mask) | ((v << 16u) & field_mask);
}
// Overwrites bits 24..31 of the packed word with the low byte of v.
void QuintTritValue(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 24u;
    val.data = (val.data & ~field_mask) | ((v << 24u) & field_mask);
}
// Packs the four fields into one EncodingData word, one byte slot each
// (encoding, bit count, bit value, quint/trit value from low to high).
EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) {
    uint word = encoding << 0u;
    word |= num_bits << 8u;
    word |= bit_val << 16u;
    word |= quint_trit_val << 24u;
    return EncodingData(word);
}
// Appends one decoded value to result_vector, or raises result_limit_reached when the
// current phase's cap (result_vector_max_index) has already been reached.
void ResultEmplaceBack(EncodingData val) {
    const bool has_room = result_index < result_vector_max_index;
    if (has_room) {
        result_vector[result_index++] = val.data;
    } else {
        // Alert callers to avoid decoding more than needed by this phase
        result_limit_reached = true;
    }
}
// Multiplies every component by 0x101, which copies a byte value into both bytes of a
// 16-bit value.
uvec4 ReplicateByteTo16(uvec4 value) {
    return (value << 8u) + value;
}
// Multiplies value by 127; for a single bit this yields 0 or seven set bits.
uint ReplicateBitTo7(uint value) {
    return (value << 7u) - value;
}
// Multiplies value by 511; for a single bit this yields 0 or nine set bits.
uint ReplicateBitTo9(uint value) {
    return (value << 9u) - value;
}
// Expands the low num_bits bits of value to to_bit bits by repeating the bit pattern
// from the most significant end. Returns 0 for a zero value or zero width, and returns
// value unchanged when it is already at least to_bit bits wide.
uint ReplicateBits(uint value, uint num_bits, uint to_bit) {
    if (value == 0u || num_bits == 0u) {
        return 0u;
    }
    if (num_bits >= to_bit) {
        return value;
    }
    const uint pattern = value & ((1u << num_bits) - 1u);
    uint replicated = pattern;
    for (uint filled = num_bits; filled < to_bit; filled += num_bits) {
        // The last repetition may only fit partially; keep its top bits.
        const uint space_left = to_bit - filled;
        const uint take = (space_left < num_bits) ? space_left : num_bits;
        replicated = (replicated << take) | (pattern >> (num_bits - take));
    }
    return replicated;
}
// Expands a num_bits-wide value to 8 bits by bit replication.
uint FastReplicateTo8(uint value, uint num_bits) {
    const uint target_width = 8u;
    return ReplicateBits(value, num_bits, target_width);
}
// Expands a num_bits-wide value to 6 bits by bit replication.
uint FastReplicateTo6(uint value, uint num_bits) {
    const uint target_width = 6u;
    return ReplicateBits(value, num_bits, target_width);
}
// Approximates v / 3 (rounded down) with a 16.16 fixed-point reciprocal multiply.
uint Div3Floor(uint v) {
    const uint reciprocal = 0x5556u;
    return (v * reciprocal) >> 16u;
}
// v / 3 rounded up, expressed through Div3Floor.
uint Div3Ceil(uint v) {
    const uint biased = v + 2u;
    return Div3Floor(biased);
}
// Approximates v / 5 (rounded down) with a 16.16 fixed-point reciprocal multiply.
uint Div5Floor(uint v) {
    const uint reciprocal = 0x3334u;
    return (v * reciprocal) >> 16u;
}
// v / 5 rounded up, expressed through Div5Floor.
uint Div5Ceil(uint v) {
    const uint biased = v + 4u;
    return Div5Floor(biased);
}
// Integer mixing function used to derive the partition seeds in Select2DPartition().
// The sequence of shifts and operations must stay exactly in this order.
uint Hash52(uint p) {
    p = p ^ (p >> 15u);
    p = p - (p << 17u);
    p = p + (p << 7u);
    p = p + (p << 4u);
    p = p ^ (p >> 5u);
    p = p + (p << 16u);
    p = p ^ (p >> 7u);
    p = p ^ (p >> 3u);
    p = p ^ (p << 6u);
    p = p ^ (p >> 17u);
    return p;
}
// Returns the partition (0..3) that texel (x, y) belongs to for the given partition seed
// and partition count. Eight 4-bit seeds are derived from Hash52(), squared, scaled down
// and combined with the texel position; the partition with the largest score wins, with
// ties going to the lowest index.
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) {
    // Blocks with fewer than 32 texels use doubled coordinates.
    const bool small_block = (block_dims.y * block_dims.x) < 32;
    if (small_block) {
        x <<= 1;
        y <<= 1;
    }
    seed += (partition_count - 1) * 1024;
    const uint rnum = Hash52(uint(seed));

    // Eight nibbles of the hash, low to high, each squared.
    uvec4 seeds_lo = (uvec4(rnum) >> uvec4(0u, 4u, 8u, 12u)) & 0xFu;
    uvec4 seeds_hi = (uvec4(rnum) >> uvec4(16u, 20u, 24u, 28u)) & 0xFu;
    seeds_lo *= seeds_lo;
    seeds_hi *= seeds_hi;

    // The two shift amounts swap roles depending on the low seed bit.
    const uint count_shift = (partition_count == 3) ? 6u : 5u;
    const uint seed_shift = ((seed & 2) > 0) ? 4u : 5u;
    const bool odd_seed = (seed & 1) > 0;
    const uint sh1 = odd_seed ? seed_shift : count_shift;
    const uint sh2 = odd_seed ? count_shift : seed_shift;
    const uvec4 shifts = uvec4(sh1, sh2, sh1, sh2);
    seeds_lo >>= shifts;
    seeds_hi >>= shifts;

    uvec4 score = uvec4(seeds_lo.x * x + seeds_lo.y * y + (rnum >> 14),
                        seeds_lo.z * x + seeds_lo.w * y + (rnum >> 10),
                        seeds_hi.x * x + seeds_hi.y * y + (rnum >> 6),
                        seeds_hi.z * x + seeds_hi.w * y + (rnum >> 2));
    score &= 0x3Fu;

    // Partitions that do not exist can never win.
    if (partition_count < 4) {
        score.w = 0u;
    }
    if (partition_count < 3) {
        score.z = 0u;
    }

    if (score.x >= score.y && score.x >= score.z && score.x >= score.w) {
        return 0u;
    }
    if (score.y >= score.z && score.y >= score.w) {
        return 1u;
    }
    return (score.z >= score.w) ? 2u : 3u;
}
// Extracts `bits` bits (1..32) starting at bit `offset` from a 128-bit payload stored as
// four little-endian 32-bit words. Returns 0 for a non-positive or over-wide bit count.
//
// Reads outside the 128-bit payload yield zero bits instead of indexing past the end of
// the vector: decoders always consume whole trit/quint groups, so a malformed block can
// request bits beyond bit 127, and out-of-range component indexing is undefined.
uint ExtractBits(uvec4 payload, int offset, int bits) {
    if (bits <= 0 || bits > 32) {
        return 0u;
    }
    if (offset < 0 || offset >= 128) {
        return 0u;
    }
    const int word = offset >> 5;
    const int bit = offset & 31;
    // Number of bits available in the current word starting at `bit`.
    const int first_bits = 32 - bit;
    if (bits <= first_bits) {
        return bitfieldExtract(payload[word], bit, bits);
    }
    // The request straddles two words: low part from this word, high part from the next.
    const uint low_part = bitfieldExtract(payload[word], bit, first_bits);
    if (word >= 3) {
        // No next word; the missing high bits read as zero.
        return low_part;
    }
    const uint high_part = bitfieldExtract(payload[word + 1], 0, bits - first_bits);
    return low_part | (high_part << first_bits);
}
// Reads num_bits bits from local_buff at the current cursor and advances the cursor.
uint StreamBits(uint num_bits) {
    const int width = int(num_bits);
    const int start = total_bitsread;
    total_bitsread = start + width;
    return ExtractBits(local_buff, start, width);
}
// Advances the local_buff read cursor by num_bits without reading anything.
void SkipBits(uint num_bits) {
    total_bitsread += int(num_bits);
}
// Reads num_bits bits from color_endpoint_data at its cursor and advances that cursor.
uint StreamColorBits(uint num_bits) {
    const int width = int(num_bits);
    const int start = color_bitsread;
    color_bitsread = start + width;
    return ExtractBits(color_endpoint_data, start, width);
}
// Wraps the stored word at result_vector[index] back into an EncodingData.
EncodingData GetEncodingFromVector(uint index) {
    return EncodingData(result_vector[index]);
}
// Returns the number of bits required to encode n_vals values with the encoding at
// encoding_values[encoding_index]: the plain bits of every value, plus 8 bits per 5 values
// for trits or 7 bits per 3 values for quints (both rounded up).
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData entry = EncodingData(encoding_values[encoding_index]);
    const uint plain_bits = NumBits(entry) * n_vals;
    const uint kind = Encoding(entry);
    if (kind == TRIT) {
        return plain_bits + Div5Ceil(n_vals * 8);
    }
    if (kind == QUINT) {
        return plain_bits + Div3Ceil(n_vals * 7);
    }
    return plain_bits;
}
// Number of stored weights for a weight grid of the given size; doubled for dual plane.
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    const uint per_plane = size.x * size.y;
    return dual_plane ? per_plane * 2u : per_plane;
}
// Number of bits the weight grid occupies when encoded with encoding index max_weight.
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
}
// Returns bit `pos` of `bits` as 0 or 1.
uint BitsBracket(uint bits, uint pos) {
    return ((bits & (1u << pos)) != 0u) ? 1u : 0u;
}
// Returns the inclusive bit range [start, end] of `bits`, shifted down to bit 0.
uint BitsOp(uint bits, uint start, uint end) {
    const uint width = end - start + 1;
    const uint mask = (1 << width) - 1;
    const uint shifted = bits >> start;
    return shifted & mask;
}
// Decodes one quint group: three values sharing a 7-bit packed quint field Q, with each
// value's num_bits plain bits interleaved between the pieces of Q in the stream. Pushes
// the three results with ResultEmplaceBack().
void DecodeQuintBlock(uint num_bits) {
    // Stream order: m0, Q[2:0], m1, Q[4:3], m2, Q[6:5].
    uvec3 plain;
    plain.x = StreamColorBits(num_bits);
    uint Q = StreamColorBits(3);
    plain.y = StreamColorBits(num_bits);
    Q |= StreamColorBits(2) << 3;
    plain.z = StreamColorBits(num_bits);
    Q |= StreamColorBits(2) << 5;

    const uint q_bit0 = BitsBracket(Q, 0);
    const uint q_21 = BitsOp(Q, 1, 2);
    const uint q_65 = BitsOp(Q, 5, 6);

    uvec3 quints;
    if (q_21 == 3 && q_65 == 0) {
        const uint not_bit0 = ~q_bit0;
        quints.x = 4;
        quints.y = 4;
        quints.z = (q_bit0 << 2) | ((BitsBracket(Q, 4) & not_bit0) << 1) |
                   (BitsBracket(Q, 3) & not_bit0);
    } else {
        uint C = 0;
        if (q_21 == 3) {
            quints.z = 4;
            C = (BitsOp(Q, 3, 4) << 3) | ((~q_65 & 3) << 1) | q_bit0;
        } else {
            quints.z = q_65;
            C = BitsOp(Q, 0, 4);
        }
        const uint c_low = BitsOp(C, 0, 2);
        const uint c_high = BitsOp(C, 3, 4);
        if (c_low == 5) {
            quints.y = 4;
            quints.x = c_high;
        } else {
            quints.y = c_high;
            quints.x = c_low;
        }
    }

    for (uint i = 0; i < 3; ++i) {
        ResultEmplaceBack(CreateEncodingData(QUINT, num_bits, plain[i], quints[i]));
    }
}
// Decodes one trit group: five values sharing an 8-bit packed trit field T, with each
// value's num_bits plain bits interleaved between the pieces of T in the stream. Pushes
// the five results with ResultEmplaceBack().
void DecodeTritBlock(uint num_bits) {
    // Stream order: m0, T[1:0], m1, T[3:2], m2, T[4], m3, T[6:5], m4, T[7].
    uint plain[5];
    plain[0] = StreamColorBits(num_bits);
    uint T = StreamColorBits(2);
    plain[1] = StreamColorBits(num_bits);
    T |= StreamColorBits(2) << 2;
    plain[2] = StreamColorBits(num_bits);
    T |= StreamColorBits(1) << 4;
    plain[3] = StreamColorBits(num_bits);
    T |= StreamColorBits(2) << 5;
    plain[4] = StreamColorBits(num_bits);
    T |= StreamColorBits(1) << 7;

    uint trits[5];
    uint C = 0;
    if (BitsOp(T, 2, 4) == 7) {
        C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1);
        trits[4] = 2u;
        trits[3] = 2u;
    } else {
        C = BitsOp(T, 0, 4);
        const uint t_65 = BitsOp(T, 5, 6);
        const uint t_bit7 = BitsBracket(T, 7);
        if (t_65 == 3) {
            trits[4] = 2u;
            trits[3] = t_bit7;
        } else {
            trits[4] = t_bit7;
            trits[3] = t_65;
        }
    }

    const uint c_10 = BitsOp(C, 0, 1);
    const uint c_32 = BitsOp(C, 2, 3);
    if (c_10 == 3) {
        trits[2] = 2u;
        trits[1] = BitsBracket(C, 4);
        trits[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
    } else if (c_32 == 3) {
        trits[2] = 2u;
        trits[1] = 2u;
        trits[0] = c_10;
    } else {
        trits[2] = BitsBracket(C, 4);
        trits[1] = c_32;
        trits[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
    }

    for (uint i = 0; i < 5; ++i) {
        ResultEmplaceBack(CreateEncodingData(TRIT, num_bits, plain[i], trits[i]));
    }
}
// Decodes at least num_values values from the color bit stream using the encoding at
// encoding_values[max_range], pushing them into result_vector. Quints and trits are
// decoded in whole groups of 3 and 5. Stops early once ResultEmplaceBack() reports that
// the current phase's limit was reached.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = EncodingData(encoding_values[max_range]);
    const uint encoding = Encoding(val);
    const uint num_bits = NumBits(val);
    for (uint decoded = 0; decoded < num_values && !result_limit_reached;) {
        if (encoding == QUINT) {
            DecodeQuintBlock(num_bits);
            decoded += 3;
        } else if (encoding == TRIT) {
            DecodeTritBlock(num_bits);
            decoded += 5;
        } else if (encoding == JUST_BITS) {
            BitValue(val, StreamColorBits(num_bits));
            ResultEmplaceBack(val);
            ++decoded;
        }
    }
}
// Decodes and unquantizes the color endpoint values of all partitions into color_values.
//
// modes           - color endpoint mode per partition
// num_partitions  - number of valid entries in modes
// color_data_bits - number of bits available for color data
// color_values    - output; entries are written starting at index 1 (pre-increment),
//                   the same convention ComputeEndpoints() uses when reading them
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) {
    // Every partition contributes 2 * ((mode >> 2) + 1) values.
    uint num_values = 0;
    for (uint part = 0; part < num_partitions; ++part) {
        num_values += 2 * ((modes[part] >> 2) + 1);
    }

    // Find the largest encoding that's within color_data_bits
    // TODO(ameerj): profile with binary search
    int range = 1;
    for (; range < encoding_values.length(); ++range) {
        if (GetBitLength(num_values, range) > color_data_bits) {
            break;
        }
    }
    DecodeIntegerSequence(range - 1, num_values);

    uint out_index = 0;
    for (int itr = 0; itr < result_index && out_index < num_values; ++itr) {
        const EncodingData val = GetEncodingFromVector(itr);
        const uint encoding = Encoding(val);
        const uint bitlen = NumBits(val);
        const uint bitval = BitValue(val);

        if (encoding == JUST_BITS) {
            color_values[++out_index] = FastReplicateTo8(bitval, bitlen);
            continue;
        }

        // Trit/quint unquantization: T = D * C + B, then fold in the replicated low bit A.
        const uint A = ReplicateBitTo9(bitval & 1);
        const uint upper = bitval >> 1; // plain bits above the lowest one
        uint B = 0, C = 0, D = 0;
        if (encoding == TRIT) {
            D = QuintTritValue(val);
            switch (bitlen) {
            case 1:
                C = 204;
                break;
            case 2: {
                const uint b = upper & 1;
                C = 93;
                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
                break;
            }
            case 3: {
                const uint cb = upper & 3;
                C = 44;
                B = (cb << 7) | (cb << 2) | cb;
                break;
            }
            case 4: {
                const uint dcb = upper & 7;
                C = 22;
                B = (dcb << 6) | dcb;
                break;
            }
            case 5: {
                const uint edcb = upper & 0xF;
                C = 11;
                B = (edcb << 5) | (edcb >> 2);
                break;
            }
            case 6: {
                const uint fedcb = upper & 0x1F;
                C = 5;
                B = (fedcb << 4) | (fedcb >> 4);
                break;
            }
            }
        } else if (encoding == QUINT) {
            D = QuintTritValue(val);
            switch (bitlen) {
            case 1:
                C = 113;
                break;
            case 2: {
                const uint b = upper & 1;
                C = 54;
                B = (b << 8) | (b << 3) | (b << 2);
                break;
            }
            case 3: {
                const uint cb = upper & 3;
                C = 26;
                B = (cb << 7) | (cb << 1) | (cb >> 1);
                break;
            }
            case 4: {
                const uint dcb = upper & 7;
                C = 13;
                B = (dcb << 6) | (dcb >> 1);
                break;
            }
            case 5: {
                const uint edcb = upper & 0xF;
                C = 6;
                B = (edcb << 5) | (edcb >> 3);
                break;
            }
            }
        }

        uint T = (D * C) + B;
        T ^= A;
        T = (A & 0x80) | (T >> 2);
        color_values[++out_index] = T;
    }
}
// Moves bit 7 of a into the top of b (after shifting b right by one) and turns the
// remaining bits of a into a signed 6-bit value. Returns (new a, new b) in (x, y).
ivec2 BitTransferSigned(int a, int b) {
    const int merged_b = (b >> 1) | (a & 0x80);
    int signed_a = (a >> 1) & 0x3F;
    if ((signed_a & 0x20) != 0) {
        // Sign-extend from 6 bits.
        signed_a -= 0x40;
    }
    return ivec2(signed_a, merged_b);
}
// Clamps every component to the 0..255 range and converts to unsigned.
uvec4 ClampByte(ivec4 color) {
    const ivec4 clamped = clamp(color, ivec4(0), ivec4(255));
    return uvec4(clamped);
}
// Averages red and green with blue; alpha and blue pass through. Component order of the
// result is (a, r, g, b).
ivec4 BlueContract(int a, int r, int g, int b) {
    const ivec2 averaged = (ivec2(r, g) + b) >> 1;
    return ivec4(a, averaged, b);
}
// Builds the two color endpoints of one partition from the unquantized color values.
//
// ep1, ep2            - output endpoints; components are ordered (alpha, red, green, blue)
// color_endpoint_mode - endpoint mode of this partition; modes without a case below fall
//                       into the default branch
// color_values        - values produced by DecodeColorValues()
// colvals_index       - running read cursor into color_values; it is pre-incremented before
//                       every read, so the first value consumed is color_values[1]
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, uint color_values[32],
inout uint colvals_index) {
// Reads the next N values into V[0].xyzw followed by V[1].xyzw as unsigned integers.
#define READ_UINT_VALUES(N) \
uvec4 V[2]; \
for (uint i = 0; i < N; i++) { \
V[i / 4][i % 4] = color_values[++colvals_index]; \
}
// Same as READ_UINT_VALUES but stores signed integers, for the modes using BitTransferSigned.
#define READ_INT_VALUES(N) \
ivec4 V[2]; \
for (uint i = 0; i < N; i++) { \
V[i / 4][i % 4] = int(color_values[++colvals_index]); \
}
switch (color_endpoint_mode) {
// Mode 0: two luminance values, opaque alpha.
case 0: {
READ_UINT_VALUES(2)
ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x);
ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y);
break;
}
// Mode 1: luminance as base plus a 6-bit offset, saturated at 0xFF.
case 1: {
READ_UINT_VALUES(2)
const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0);
const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU);
ep1 = uvec4(0xFF, L0, L0, L0);
ep2 = uvec4(0xFF, L1, L1, L1);
break;
}
// Mode 4: two luminance values plus two alpha values.
case 4: {
READ_UINT_VALUES(4)
ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x);
ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y);
break;
}
// Mode 5: luminance + alpha as base plus signed offset.
case 5: {
READ_INT_VALUES(4)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x));
ep2 = ClampByte(ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y));
break;
}
// Mode 6: RGB plus a scale factor (V[0].w) applied to the first endpoint, opaque alpha.
case 6: {
READ_UINT_VALUES(4)
ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z);
break;
}
// Mode 8: two RGB colors; when the second sum is smaller the endpoints are swapped and
// passed through BlueContract.
case 8: {
READ_UINT_VALUES(6)
if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x);
ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y);
} else {
ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y)));
ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x)));
}
break;
}
// Mode 9: RGB as base plus signed offset; a negative offset sum selects the swapped,
// blue-contracted form.
case 9: {
READ_INT_VALUES(6)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
transferred = BitTransferSigned(V[1].y, V[1].x);
V[1].y = transferred.x;
V[1].x = transferred.y;
if ((V[0].y + V[0].w + V[1].y) >= 0) {
ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x));
ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
} else {
ep1 = ClampByte(BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x));
}
break;
}
// Mode 10: RGB plus scale factor as in mode 6, with two explicit alpha values.
case 10: {
READ_UINT_VALUES(6)
ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z);
break;
}
// Mode 12: two RGBA colors, with the same swap/blue-contract rule as mode 8.
case 12: {
READ_UINT_VALUES(8)
if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x);
ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y);
} else {
ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y)));
ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x)));
}
break;
}
// Mode 13: RGBA as base plus signed offset, with the same rule as mode 9.
case 13: {
READ_INT_VALUES(8)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
transferred = BitTransferSigned(V[1].y, V[1].x);
V[1].y = transferred.x;
V[1].x = transferred.y;
transferred = BitTransferSigned(V[1].w, V[1].z);
V[1].w = transferred.x;
V[1].z = transferred.y;
if ((V[0].y + V[0].w + V[1].y) >= 0) {
ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x));
ep2 = ClampByte(ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
} else {
ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x));
}
break;
}
// Unsupported modes produce a fixed placeholder color and consume no color values.
default: {
// HDR mode, or more likely a bug computing the color_endpoint_mode
ep1 = uvec4(0xFF, 0xFF, 0, 0);
ep2 = uvec4(0xFF, 0xFF, 0, 0);
break;
}
}
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}
// Converts one decoded weight into the 0..64 interpolation range.
//
// Bit-only weights are expanded to 6 bits by bit replication; trit and quint weights use
// the D * C + B formula with per-bit-count constants (or a direct table when they carry no
// plain bits). Every path that yields a 0..63 value then goes through the final expansion
// to 0..64 (values above 32 are incremented). Previously the bit-only path returned before
// that step, so a fully set weight produced 63 instead of 64.
uint UnquantizeTexelWeight(EncodingData val) {
    const uint encoding = Encoding(val);
    const uint bitlen = NumBits(val);
    const uint bitval = BitValue(val);
    const uint A = ReplicateBitTo7((bitval & 1));
    uint B = 0, C = 0, D = 0;
    uint result = 0;
    // Final (already expanded) weights for trits/quints without plain bits.
    const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
    switch (encoding) {
    case JUST_BITS:
        // Fall through to the shared 0..63 -> 0..64 expansion below.
        result = FastReplicateTo6(bitval, bitlen);
        break;
    case TRIT: {
        D = QuintTritValue(val);
        switch (bitlen) {
        case 0:
            // Trit values 0, 1, 2 map to 0, 32, 64.
            return bitlen_0_results[D * 2];
        case 1: {
            C = 50;
            break;
        }
        case 2: {
            C = 23;
            const uint b = (bitval >> 1) & 1;
            B = (b << 6) | (b << 2) | b;
            break;
        }
        case 3: {
            C = 11;
            const uint cb = (bitval >> 1) & 3;
            B = (cb << 5) | cb;
            break;
        }
        default:
            break;
        }
        break;
    }
    case QUINT: {
        D = QuintTritValue(val);
        switch (bitlen) {
        case 0:
            return bitlen_0_results[D];
        case 1: {
            C = 28;
            break;
        }
        case 2: {
            C = 13;
            const uint b = (bitval >> 1) & 1;
            B = (b << 6) | (b << 1);
            break;
        }
        }
        break;
    }
    }
    if (encoding != JUST_BITS && bitlen > 0) {
        result = D * C + B;
        result ^= A;
        result = (A & 0x20) | (result >> 2);
    }
    // Expand the range from 0..63 to 0..64.
    if (result > 32) {
        result += 1;
    }
    return result;
}
// Replaces the decoded weight entries in result_vector with their unquantized values,
// in place. Processes at most one entry per weight of the grid (two planes if dual plane).
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
    uint weight_count = size.x * size.y;
    if (is_dual_plane) {
        weight_count *= 2;
    }
    const uint loop_count = min(result_index, weight_count);
    for (uint itr = 0; itr < loop_count; ++itr) {
        const EncodingData decoded = GetEncodingFromVector(itr);
        result_vector[itr] = UnquantizeTexelWeight(decoded);
    }
}
// Fetches an unquantized weight. In dual-plane mode the two planes are interleaved, so the
// entry lives at 2 * offset_base + plane; otherwise plane is ignored.
uint GetUnquantizedTexelWeight(uint offset_base, uint plane, bool is_dual_plane) {
    uint offset = offset_base;
    if (is_dual_plane) {
        offset = (offset_base << 1) + plane;
    }
    return result_vector[offset];
}
// Computes the per-channel interpolation weights for texel (s, t) of the block by
// bilinearly sampling the weight grid of the given size.
//
// The result holds the primary-plane weight in every component; in dual-plane mode the
// component at index (plane_index + 1) & 3 holds the secondary-plane weight instead.
uvec4 GetUnquantizedWeightVector(uint t, uint s, uvec2 size, uint plane_index, bool is_dual_plane) {
    // Scale factors mapping block texel coordinates onto the weight grid.
    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));

    const uvec2 scaled = uvec2(Ds * s, Dt * t);
    const uvec2 grid_pos = (scaled * (size - 1u) + 32u) >> 6u;
    const uvec2 cell = grid_pos >> 4u; // integer grid coordinate
    const uvec2 frac = grid_pos & 0xFu; // 4-bit fractional part

    // Bilinear factors; together they sum to 16.
    const uint w11 = (frac.x * frac.y + 8) >> 4;
    const uvec4 w = uvec4(16 - frac.x - frac.y + w11, frac.x - w11, frac.y - w11, w11);

    // Indices of the four surrounding grid weights.
    const uint area = size.x * size.y;
    const uint v0 = cell.y * size.x + cell.x;
    const uvec4 neighbors = uvec4(v0, v0 + 1, v0 + size.x, v0 + size.x + 1);

    uvec4 p0 = uvec4(0);
    uvec4 p1 = uvec4(0);
    for (uint n = 0; n < 4; ++n) {
        const uint offset_base = neighbors[n];
        // Neighbors outside the grid contribute zero.
        if (offset_base < area) {
            p0[n] = GetUnquantizedTexelWeight(offset_base, 0, is_dual_plane);
            p1[n] = GetUnquantizedTexelWeight(offset_base, 1, is_dual_plane);
        }
    }

    const uint primary_weight = (uint(dot(p0, w)) + 8) >> 4;
    uvec4 weight_vec = uvec4(primary_weight);
    if (is_dual_plane) {
        const uint secondary_weight = (uint(dot(p1, w)) + 8) >> 4;
        const uint secondary_channel = (plane_index + 1u) & 3u;
        weight_vec[secondary_channel] = secondary_weight;
    }
    return weight_vec;
}
// Classifies a block mode into one of ten layouts (0..9) by inspecting bits 0-3, 5, 7
// and 8. Layouts 0..4 have a non-zero value in bits 0..1; layouts 5..9 have both clear.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 0x4) != 0;
    const bool bit3 = (mode & 0x8) != 0;
    const bool bit5 = (mode & 0x20) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        if (!bit3) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return bit5 ? 8 : 7;
}
// Fills the whole block footprint starting at coord with the error color (all zeros).
void FillError(ivec3 coord) {
    const vec4 error_color = vec4(0.0, 0.0, 0.0, 0.0);
    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), error_color);
        }
    }
}
// Handles a constant-color block: skips 52 bits, reads four 16-bit channels in R, G, B, A
// order, and fills the whole block footprint with that color normalized by 65535.
void FillVoidExtentLDR(ivec3 coord) {
    SkipBits(52);
    uvec4 raw;
    raw.r = StreamBits(16);
    raw.g = StreamBits(16);
    raw.b = StreamBits(16);
    raw.a = StreamBits(16);
    const vec4 color = vec4(raw) / 65535.0f;
    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), color);
        }
    }
}
// Returns true when the block mode cannot be decoded by this shader.
// For constant-color (void-extent) blocks this consumes one extra bit from the stream,
// but only when bit 9 is clear and bit 10 is set.
bool IsError(uint mode) {
    const bool is_void_extent = (mode & 0x1ff) == 0x1fc;
    if (is_void_extent) {
        if ((mode & 0x200) != 0) {
            // params.void_extent_hdr = true;
            return true;
        }
        if ((mode & 0x400) == 0) {
            return true;
        }
        // Bit 10 and the following stream bit must both be set.
        return StreamBits(1) == 0;
    }
    const bool low_bits_clear = (mode & 0xf) == 0;
    const bool high_bits_pattern = (mode & 3) == 0 && (mode & 0x1c0) == 0x1c0;
    return low_bits_clear || high_bits_pattern;
}
// Decodes the weight grid size (width, height) from the block mode according to its
// layout. Returns (0, 0) for an unknown layout.
uvec2 DecodeBlockSize(uint mode) {
    const uint A = (mode >> 5) & 0x3;
    const uint B_wide = (mode >> 7) & 0x3;   // two-bit B field
    const uint B_narrow = (mode >> 7) & 0x1; // one-bit B field used by layouts 3 and 4
    switch (FindLayout(mode)) {
    case 0:
        return uvec2(B_wide + 4, A + 2);
    case 1:
        return uvec2(B_wide + 8, A + 2);
    case 2:
        return uvec2(A + 2, B_wide + 8);
    case 3:
        return uvec2(A + 2, B_narrow + 6);
    case 4:
        return uvec2(B_narrow + 2, A + 2);
    case 5:
        return uvec2(12, A + 2);
    case 6:
        return uvec2(A + 2, 12);
    case 7:
        return uvec2(6, 10);
    case 8:
        return uvec2(10, 6);
    case 9:
        // This layout stores B in bits 9..10 instead.
        return uvec2(A + 6, ((mode >> 9) & 0x3) + 6);
    default:
        return uvec2(0);
    }
}
// Decodes the index into encoding_values that describes how the weights are encoded.
// The 3-bit range field is assembled from bit 4 plus bits 0..1 (layouts 0..4) or bits 2..3
// (layouts 5..9); bit 9 selects the higher-precision half of the table except in layout 9.
uint DecodeMaxWeight(uint mode) {
    const uint mode_layout = FindLayout(mode);
    const uint low_bit = ((mode & 0x10) != 0) ? 1u : 0u;
    const uint high_bits = (mode_layout < 5) ? ((mode & 0x3) << 1) : ((mode & 0xc) >> 1);
    uint weight_index = (low_bit | high_bits) - 2;
    const bool high_precision = (mode_layout != 9) && ((mode & 0x200) != 0);
    if (high_precision) {
        weight_index += 6;
    }
    return weight_index + 1;
}
// Decodes the 128-bit compressed block held in local_buff and writes its texels to
// dest_image, starting at coord and covering block_dims texels.
//
// Steps: parse the block mode, partition info and color endpoint modes from the low end of
// the block; decode the color endpoints; decode the weights from the bit-reversed high end;
// then interpolate every texel between its partition's endpoints.
//
// Every failure path stores the error color through FillError() so that no texel of the
// destination block is left unwritten.
void DecompressBlock(ivec3 coord) {
    uint mode = StreamBits(11);
    if (IsError(mode)) {
        FillError(coord);
        return;
    }
    if ((mode & 0x1ff) == 0x1fc) {
        // params.void_extent_ldr = true;
        FillVoidExtentLDR(coord);
        return;
    }
    const uvec2 size_params = DecodeBlockSize(mode);
    if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
        FillError(coord);
        return;
    }
    const uint num_partitions = StreamBits(2) + 1;
    const uint mode_layout = FindLayout(mode);
    const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
    if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
        FillError(coord);
        return;
    }
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    const uint base_mode = base_cem & 3;
    const uint max_weight = DecodeMaxWeight(mode);
    const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
    // Whatever is not header, weights or extra configuration is color data.
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // Inconsistent header; report it like every other malformed block.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    const uint plane_selector_bits = dual_plane ? 2 : 0;
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // Bad data: the unsigned subtraction wrapped around, i.e. the header and weights
        // claim more bits than the 16-byte block holds.
        FillError(coord);
        return;
    }
    // Read color data...
    const uint color_data_bits = remaining_bits;
    // Start from a known state; only the words covering color_data_bits are written below.
    color_endpoint_data = uvec4(0);
    while (remaining_bits > 0) {
        const int nb = int(min(remaining_bits, 32U));
        const uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }
    const uint plane_index = uint(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        // Per-partition modes: one class-select bit per partition, then two mode bits each.
        const uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same mode.
        const uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }
    uvec4 endpoints0[4];
    uvec4 endpoints1[4];
    {
        // This decode phase should at most push 32 elements into the vector
        result_vector_max_index = 32;
        uint color_values[32];
        uint colvals_index = 0;
        DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values);
        for (uint i = 0; i < num_partitions; i++) {
            ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values,
                             colvals_index);
        }
    }
    // Weights are stored from the top of the block downwards: reverse the whole block so
    // they can be streamed from bit 0, then clear everything past the weight bits.
    color_endpoint_data = local_buff;
    color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
    const uint clear_byte_start = (weight_bits >> 3) + 1;
    // Keep only the weight bits of the last, partially used byte.
    const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
                             uint(((1 << (weight_bits % 8)) - 1));
    const uint vec_index = (clear_byte_start - 1) >> 2;
    color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
                                                    int((clear_byte_start - 1) % 4) * 8, 8);
    for (uint i = clear_byte_start; i < 16; ++i) {
        const uint idx = i >> 2;
        color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
    }
    // Re-init vector variables for next decode phase
    result_index = 0;
    color_bitsread = 0;
    result_limit_reached = false;
    // The limit for the Unquantize phase, avoids decoding more data than needed.
    result_vector_max_index = size_params.x * size_params.y;
    if (dual_plane) {
        result_vector_max_index *= 2;
    }
    DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));
    UnquantizeTexelWeights(size_params, dual_plane);
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = 0;
            if (num_partitions > 1) {
                local_partition = Select2DPartition(partition_index, i, j, num_partitions);
            }
            const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
            const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
            const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane);
            // Interpolate in 16-bit precision with weights in 0..64.
            const vec4 Cf =
                vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            const vec4 p = (Cf / 65535.0f);
            // Endpoints are stored as (a, r, g, b); swizzle to RGBA for the store.
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}
// Computes the byte offset of position pos inside a 64 x 8 tile by interleaving bits of
// x and y: x bit 5, y bits 1..2, x bit 4, y bit 0, then x bits 0..3 (high to low).
uint SwizzleOffset(uvec2 pos) {
    const uint x_bit5 = (pos.x >> 5u) & 1u;
    const uint y_bits12 = (pos.y >> 1u) & 3u;
    const uint x_bit4 = (pos.x >> 4u) & 1u;
    const uint y_bit0 = pos.y & 1u;
    const uint x_low = pos.x & 15u;
    return (x_bit5 << 8u) + (y_bits12 << 6u) + (x_bit4 << 5u) + (y_bit0 << 4u) + x_low;
}
// Entry point: each invocation locates its compressed block in the input buffer, loads
// its 16 bytes and decodes it into the destination image. Invocations whose block starts
// outside the image do nothing.
void main() {
    const uvec3 invocation = gl_GlobalInvocationID;
    // Position in bytes along X (16 bytes per block) and in rows along Y.
    const uvec2 byte_pos = uvec2(invocation.x << BYTES_PER_BLOCK_LOG2, invocation.y);
    const uint block_y = byte_pos.y >> GOB_SIZE_Y_SHIFT;

    uint offset = invocation.z * layer_stride;
    offset += (block_y >> block_height) * block_size;
    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    offset += (byte_pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    offset += SwizzleOffset(byte_pos);

    const ivec3 coord = ivec3(invocation * uvec3(block_dims, 1));
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
    // astc_data is indexed in 16-byte units.
    local_buff = astc_data[offset / 16];
    DecompressBlock(coord);
}