yuzu-mainline/src/video_core/host_shaders/queries_prefix_scan_sum.comp

// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later

#version 460 core

// Subgroup functionality this shader relies on:
//  - basic:            gl_SubgroupSize, gl_SubgroupID, gl_SubgroupInvocationID, subgroupBarrier()
//  - shuffle:          subgroupShuffle() (read from an arbitrary lane)
//  - shuffle_relative: subgroupShuffleUp() (read from a lower lane)
//  - arithmetic:       subgroupMax()
// All four are marked `require`, so compilation fails when one of them is unavailable.
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require
#extension GL_KHR_shader_subgroup_arithmetic : require

// Backend abstraction for the dispatch parameters. BEGIN_PUSH_CONSTANTS / END_PUSH_CONSTANTS and
// UNIFORM(n) let the same declarations expand either to a Vulkan push-constant block or to
// individual OpenGL uniforms with explicit locations.
// Note: HAS_EXTENDED_TYPES, BINDING_INPUT_BUFFER and BINDING_OUTPUT_IMAGE are defined here but are
// not referenced by the code in this shader; the buffer bindings further down are literals.
#ifdef VULKAN

// Vulkan: every parameter is a member of one `layout(push_constant)` block, so UNIFORM(n) is empty.
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

// OpenGL: GL_NV_gpu_shader5 is only requested (`enable`), and HAS_EXTENDED_TYPES records whether
// the implementation actually defines it.
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
// OpenGL: there is no enclosing block; each parameter becomes `layout(location = n) uniform`.
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

// Per-dispatch parameters (push constants on Vulkan, located uniforms on OpenGL).
BEGIN_PUSH_CONSTANTS
// WriteResults compares every local result index against this value to decide whether the
// previous accumulated_data applies to it. When it is greater than accumulation_limit,
// WriteResults takes its simple path and derives the new accumulated_data from the result at
// accumulation_limit alone.
UNIFORM(0) uint min_accumulation_base;
// On the other path, output_data[max_accumulation_base - 1] is subtracted from the result at
// accumulation_limit to form the new accumulated_data.
UNIFORM(1) uint max_accumulation_base;
// Local index of the result that feeds accumulated_data. main() also compares it against
// gl_SubgroupSize * LOCAL_RESULTS to decide whether the cross-subgroup pass is needed.
UNIFORM(2) uint accumulation_limit;
// Element offset added to the local index when reading input_data and writing output_data.
UNIFORM(3) uint buffer_offset;
END_PUSH_CONSTANTS

// Number of consecutive elements each invocation loads, scans and writes.
#define LOCAL_RESULTS 8
// Number of elements covered by one workgroup (despite the name, this is divided by
// LOCAL_RESULTS below to obtain the invocation count).
#define QUERIES_PER_INVOC 2048

// 2048 / 8 = 256 invocations per workgroup.
layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;

// Values to be summed. Each 64-bit value is stored as a uvec2; AddUint64 treats .x as the low
// 32 bits and .y as the high 32 bits.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

// Destination of the prefix sums. WriteResults also reads one element back from it, which is
// why the block is declared coherent.
layout(std430, binding = 1) coherent buffer block2 {
    uvec2 output_data[];
};

// Single 64-bit running total that WriteResults reads at entry and overwrites before returning.
layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

// One slot per subgroup, holding that subgroup's total for the cross-subgroup pass in main().
// main() indexes it with gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups, so accesses stay in
// bounds only while that value is below 128.
shared uvec2 shared_data[128];

// 64-bit unsigned addition emulated with two 32-bit words, for GPUs without native uint64.
// Each operand is (low word, high word) in (.x, .y). The carry out of the low-word addition is
// folded into the high word; a carry out of the high word is dropped, so the sum wraps.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry;
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}

// Inclusive prefix sum of 64-bit values across the subgroup (Hillis-Steele scan).
// After the call, each lane holds the sum of `value` over itself and every lower lane.
// The shuffle is issued by every lane on every step; only the addition is conditional, so lanes
// without a partner `stride` positions below simply keep their running value.
uvec2 subgroupInclusiveAddUint64(uvec2 value) {
    const uint lane = gl_SubgroupInvocationID;
    uvec2 running = value;
    uint stride = 1;
    while (stride < gl_SubgroupSize) {
        // Value currently held by lane (lane - stride).
        const uvec2 from_below = subgroupShuffleUp(running, stride);
        if (lane >= stride) {
            running = AddUint64(running, from_below);
        }
        stride *= 2;
    }
    return running;
}

// Writes the scanned results to the output buffer and refreshes the accumulation buffer.
//
// results: this invocation's LOCAL_RESULTS inclusive prefix sums, relative to the start of the
//          dispatch (i.e. not yet including the value carried in accumulated_data).
//
// Every result whose local index is below min_accumulation_base is stored with the previous
// accumulated_data added to it; the remaining results are stored unchanged. The invocation that
// owns the element at accumulation_limit then stores the new accumulated_data.
//
// This function executes barrier(), so it must be called from workgroup-uniform control flow.
void WriteResults(uvec2 results[LOCAL_RESULTS]) {
    const uint current_id = gl_LocalInvocationID.x;
    const uint first_local_index = current_id * LOCAL_RESULTS;
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        const uint local_index = first_local_index + i;
        const uvec2 base_data = local_index < min_accumulation_base ? accum : uvec2(0, 0);
        // GLSL passes arguments by value, so the sum has to be taken from the return value.
        // Calling AddUint64 as a bare statement would silently drop the carried-over total.
        output_data[buffer_offset + local_index] = AddUint64(results[i], base_data);
    }

    // Two things must hold before accumulated_data is overwritten below:
    //  - every invocation has finished reading the previous accumulated_data into `accum`;
    //  - every output_data write is visible, because the reset path reads one element back.
    barrier();
    groupMemoryBarrier();

    // Only the invocation that owns the element at accumulation_limit updates the accumulator.
    const uint index = accumulation_limit % LOCAL_RESULTS;
    const uint base_id = accumulation_limit / LOCAL_RESULTS;
    if (current_id != base_id) {
        return;
    }

    if (min_accumulation_base >= accumulation_limit + 1) {
        // The element at accumulation_limit is below min_accumulation_base, so the running total
        // simply continues from the previous accumulated value.
        accumulated_data = AddUint64(results[index], accum);
        return;
    }

    // We have that ugly case in which the accumulation data is reset in the middle somewhere.
    // The new total is the sum at accumulation_limit minus the raw prefix sum of the element just
    // before max_accumulation_base. With max_accumulation_base == 0 nothing precedes the reset
    // point, so nothing is subtracted (and output_data[-1] is never touched).
    uvec2 new_accum = results[index];
    if (max_accumulation_base != 0) {
        const uint reset_index = max_accumulation_base - 1;
        const uint reset_local_index = reset_index - buffer_offset;
        // Calculate two complement / negate manually
        const uvec2 negated = AddUint64(uvec2(1, 0), ~output_data[reset_index]);
        new_accum = AddUint64(new_accum, negated);
        // If this dispatch stored that element with `accum` added (same predicate as the write
        // loop above), the subtraction removed `accum` as well; put it back so that only the raw
        // prefix sum ends up subtracted.
        const bool written_here = reset_index >= buffer_offset &&
                                  reset_local_index < gl_WorkGroupSize.x * LOCAL_RESULTS;
        if (written_here && reset_local_index < min_accumulation_base) {
            new_accum = AddUint64(new_accum, accum);
        }
    }
    accumulated_data = new_accum;
}

// Entry point: computes an inclusive prefix sum over the 64-bit values of input_data and hands
// the per-invocation results to WriteResults.
//
// Three levels are combined:
//  1. each invocation scans its own LOCAL_RESULTS consecutive elements;
//  2. invocation totals are scanned across the subgroup;
//  3. when one subgroup does not cover all the work, subgroup totals are exchanged through
//     shared_data and scanned once more.
void main() {
    const uint lane = gl_SubgroupInvocationID;
    const uint highest_lane = subgroupMax(lane);
    const uint subgroup_slot = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
    const uint tail = LOCAL_RESULTS - 1;
    const uint first_element = buffer_offset + gl_LocalInvocationID.x * LOCAL_RESULTS;

    // Level 1: running sum over this invocation's own elements.
    uvec2 results[LOCAL_RESULTS];
    results[0] = input_data[first_element];
    for (uint i = 1; i < LOCAL_RESULTS; i++) {
        results[i] = AddUint64(input_data[first_element + i], results[i - 1]);
    }
    // make sure all input data has been loaded
    subgroupBarrier();
    subgroupMemoryBarrier();

    // Level 2: scan the invocation totals across the subgroup. Afterwards results[tail] holds the
    // sum of every element owned by this lane and all lower lanes.
    results[tail] = subgroupInclusiveAddUint64(results[tail]);
    // Scanned total of the lane directly below this one; it is the offset still missing from the
    // other local results. Lane 0 has no lane below it and keeps its values.
    const uvec2 lower_lane_total = subgroupShuffleUp(results[tail], 1);
    if (lane != 0) {
        for (uint i = 0; i < tail; i++) {
            results[i] = AddUint64(results[i], lower_lane_total);
        }
    }

    // Level 3 is only needed when there is more work than a single subgroup covers.
    // This condition is constant per dispatch, so the barrier below is in uniform control flow.
    if (accumulation_limit > gl_SubgroupSize * LOCAL_RESULTS) {
        // The highest lane of each subgroup publishes the subgroup's total.
        if (lane == highest_lane) {
            shared_data[subgroup_slot] = results[tail];
        }
        // wait until every subgroup has published its total
        barrier();
        memoryBarrierShared();

        // The first subgroup has nothing in front of it.
        if (subgroup_slot != 0) {
            // Lane k fetches the total published in slot k, then the totals are scanned.
            uvec2 subgroup_totals = shared_data[lane];
            subgroupBarrier();
            subgroupMemoryBarrierShared();
            subgroup_totals = subgroupInclusiveAddUint64(subgroup_totals);
            // Lane (subgroup_slot - 1) now holds the sum of all subgroups in front of this one.
            const uvec2 preceding_total = subgroupShuffle(subgroup_totals, subgroup_slot - 1);
            for (uint i = 0; i < LOCAL_RESULTS; i++) {
                results[i] = AddUint64(results[i], preceding_total);
            }
        }
    }
    WriteResults(results);
}
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project`
			`// SPDX-License-Identifier: GPL-3.0-or-later`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00
			`#version 460 core`

Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`#extension GL_KHR_shader_subgroup_basic : require`
			`#extension GL_KHR_shader_subgroup_shuffle : require`
			`#extension GL_KHR_shader_subgroup_shuffle_relative : require`
			`#extension GL_KHR_shader_subgroup_arithmetic : require`

Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`#ifdef VULKAN`

			`#define HAS_EXTENDED_TYPES 1`
			`#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`#define END_PUSH_CONSTANTS };`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`#define UNIFORM(n)`
			`#define BINDING_INPUT_BUFFER 0`
			`#define BINDING_OUTPUT_IMAGE 1`

			`#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv`

			`#extension GL_NV_gpu_shader5 : enable`
			`#ifdef GL_NV_gpu_shader5`
			`#define HAS_EXTENDED_TYPES 1`
			`#else`
			`#define HAS_EXTENDED_TYPES 0`
			`#endif`
			`#define BEGIN_PUSH_CONSTANTS`
			`#define END_PUSH_CONSTANTS`
			`#define UNIFORM(n) layout(location = n) uniform`
			`#define BINDING_INPUT_BUFFER 0`
			`#define BINDING_OUTPUT_IMAGE 0`

			`#endif`

			`BEGIN_PUSH_CONSTANTS`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`UNIFORM(0) uint min_accumulation_base;`
			`UNIFORM(1) uint max_accumulation_base;`
			`UNIFORM(2) uint accumulation_limit;`
			`UNIFORM(3) uint buffer_offset;`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`END_PUSH_CONSTANTS`

Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`#define LOCAL_RESULTS 8`
			`#define QUERIES_PER_INVOC 2048`

			`layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00
			`layout(std430, binding = 0) readonly buffer block1 {`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`uvec2 input_data[];`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`};`

Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`layout(std430, binding = 1) coherent buffer block2 {`
			`uvec2 output_data[];`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`};`

			`layout(std430, binding = 2) coherent buffer block3 {`
			`uvec2 accumulated_data;`
			`};`

Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`shared uvec2 shared_data[128];`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {`
			`uint carry = 0;`
			`uvec2 result;`
			`result.x = uaddCarry(value_1.x, value_2.x, carry);`
			`result.y = value_1.y + value_2.y + carry;`
			`return result;`
			`}`

Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`// do subgroup Prefix Sum using Hillis and Steele's algorithm`
			`uvec2 subgroupInclusiveAddUint64(uvec2 value) {`
			`uvec2 result = value;`
			`for (uint i = 1; i < gl_SubgroupSize; i *= 2) {`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`if (i <= gl_SubgroupInvocationID) {`
			`result = AddUint64(result, other);`
			`}`
			`}`
			`return result;`
			`}`

			`// Writes down the results to the output buffer and to the accumulation buffer`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`void WriteResults(uvec2 results[LOCAL_RESULTS]) {`
			`const uint current_id = gl_LocalInvocationID.x;`
			`const uvec2 accum = accumulated_data;`
			`for (uint i = 0; i < LOCAL_RESULTS; i++) {`
			`uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);`
			`AddUint64(results[i], base_data);`
			`}`
			`for (uint i = 0; i < LOCAL_RESULTS; i++) {`
			`output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];`
			`}`
			`uint index = accumulation_limit % LOCAL_RESULTS;`
			`uint base_id = accumulation_limit / LOCAL_RESULTS;`
			`if (min_accumulation_base >= accumulation_limit + 1) {`
			`if (current_id == base_id) {`
			`accumulated_data = results[index];`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`}`
			`return;`
			`}`
			`// We have that ugly case in which the accumulation data is reset in the middle somewhere.`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`barrier();`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`groupMemoryBarrier();`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00
			`if (current_id == base_id) {`
			`uvec2 reset_value = output_data[max_accumulation_base - 1];`
			`// Calculate two complement / negate manually`
			`reset_value = AddUint64(uvec2(1,0), ~reset_value);`
			`accumulated_data = AddUint64(results[index], reset_value);`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`}`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`}`

			`void main() {`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`const uint subgroup_inv_id = gl_SubgroupInvocationID;`
			`const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;`
			`const uint last_subgroup_id = subgroupMax(subgroup_inv_id);`
			`const uint current_id = gl_LocalInvocationID.x;`
			`const uint total_work = accumulation_limit;`
			`const uint last_result_id = LOCAL_RESULTS - 1;`
			`uvec2 data[LOCAL_RESULTS];`
			`for (uint i = 0; i < LOCAL_RESULTS; i++) {`
			`data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];`
			`}`
			`uvec2 results[LOCAL_RESULTS];`
			`results[0] = data[0];`
			`for (uint i = 1; i < LOCAL_RESULTS; i++) {`
			`results[i] = AddUint64(data[i], results[i - 1]);`
			`}`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`// make sure all input data has been loaded`
			`subgroupBarrier();`
			`subgroupMemoryBarrier();`

Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`// on the last local result, do a subgroup inclusive scan sum`
			`results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);`
			`// get the last local result from the subgroup behind the current`
			`uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);`
			`if (subgroup_inv_id != 0) {`
			`for (uint i = 1; i < LOCAL_RESULTS; i++) {`
			`results[i - 1] = AddUint64(results[i - 1], result_behind);`
			`}`
			`}`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00
			`// if we had less queries than our subgroup, just write down the results.`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.`
			`WriteResults(results);`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`return;`
			`}`

			`// We now have more, so lets write the last result into shared memory.`
			`// Only pick the last subgroup.`
			`if (subgroup_inv_id == last_subgroup_id) {`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`shared_data[subgroup_id] = results[last_result_id];`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`}`
			`// wait until everyone loaded their stuffs`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`barrier();`
			`memoryBarrierShared();`

Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`// only if it's not the first subgroup`
Query Cache: Simplify Prefix Sum compute shader 2023-08-22 12:28:25 +02:00			`if (subgroup_id != 0) {`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`// get the results from some previous invocation`
			`uvec2 tmp = shared_data[subgroup_inv_id];`
			`subgroupBarrier();`
			`subgroupMemoryBarrierShared();`
			`tmp = subgroupInclusiveAddUint64(tmp);`
			`// obtain the result that would be equivalent to the previous result`
			`uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);`
			`for (uint i = 0; i < LOCAL_RESULTS; i++) {`
			`results[i] = AddUint64(results[i], shuffled_result);`
			`}`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`}`
Query Cache: Fix Prefix Sums 2023-08-24 03:58:59 +02:00			`WriteResults(results);`
Query Cache: Implement host side sample counting. 2023-08-20 17:53:08 +02:00			`}`