Query Cache: Simplify Prefix Sum compute shader
This commit is contained in:
		| @@ -42,6 +42,7 @@ set(SHADER_FILES | ||||
|     present_bicubic.frag | ||||
|     present_gaussian.frag | ||||
|     queries_prefix_scan_sum.comp | ||||
|     queries_prefix_scan_sum_nosubgroups.comp | ||||
|     resolve_conditional_render.comp | ||||
|     smaa_edge_detection.vert | ||||
|     smaa_edge_detection.frag | ||||
| @@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") | ||||
| endif() | ||||
|  | ||||
| set(GLSL_FLAGS "") | ||||
| set(SPIR_V_VERSION "spirv1.3") | ||||
| set(QUIET_FLAG "--quiet") | ||||
|  | ||||
| set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) | ||||
| @@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) | ||||
|             OUTPUT | ||||
|                 ${SPIRV_HEADER_FILE} | ||||
|             COMMAND | ||||
|                 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} | ||||
|                 ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} | ||||
|             MAIN_DEPENDENCY | ||||
|                 ${SOURCE_FILE} | ||||
|         ) | ||||
|   | ||||
| @@ -1,26 +1,24 @@ | ||||
| // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||||
| // SPDX-License-Identifier: MIT | ||||
|  | ||||
| // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||||
| // Nicholas Haemel. Modified to suit needs and optimize for subgroup | ||||
| // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||||
| // SPDX-License-Identifier: GPL-3.0-or-later | ||||
|  | ||||
| #version 460 core | ||||
|  | ||||
| #extension GL_KHR_shader_subgroup_basic : require | ||||
| #extension GL_KHR_shader_subgroup_shuffle : require | ||||
| #extension GL_KHR_shader_subgroup_shuffle_relative : require | ||||
| #extension GL_KHR_shader_subgroup_arithmetic : require | ||||
|  | ||||
| #ifdef VULKAN | ||||
|  | ||||
| #extension GL_KHR_shader_subgroup_arithmetic : enable | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||||
| #define END_PUSH_CONSTANTS                                                                         \ | ||||
|     }                                                                                              \ | ||||
|     ; | ||||
| #define END_PUSH_CONSTANTS }; | ||||
| #define UNIFORM(n) | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_OUTPUT_IMAGE 1 | ||||
|  | ||||
| #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||||
|  | ||||
| #extension GL_KHR_shader_subgroup_arithmetic : enable | ||||
| #extension GL_NV_gpu_shader5 : enable | ||||
| #ifdef GL_NV_gpu_shader5 | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| @@ -43,19 +41,20 @@ END_PUSH_CONSTANTS | ||||
| layout(local_size_x = 32) in; | ||||
|  | ||||
| layout(std430, binding = 0) readonly buffer block1 { | ||||
|     uvec2 input_data[gl_WorkGroupSize.x]; | ||||
|     uvec2 input_data[]; | ||||
| }; | ||||
|  | ||||
| layout(std430, binding = 1) writeonly coherent buffer block2 { | ||||
|     uvec2 output_data[gl_WorkGroupSize.x]; | ||||
| layout(std430, binding = 1) coherent buffer block2 { | ||||
|     uvec2 output_data[]; | ||||
| }; | ||||
|  | ||||
| layout(std430, binding = 2) coherent buffer block3 { | ||||
|     uvec2 accumulated_data; | ||||
| }; | ||||
|  | ||||
| shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | ||||
| shared uvec2 shared_data[2]; | ||||
|  | ||||
| // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 | ||||
| uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||||
|     uint carry = 0; | ||||
|     uvec2 result; | ||||
| @@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| void main(void) { | ||||
|     uint id = gl_LocalInvocationID.x; | ||||
|     uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uint work_size = gl_WorkGroupSize.x; | ||||
|     uint rd_id; | ||||
|     uint wr_id; | ||||
|     uint mask; | ||||
|     uvec2 input_1 = input_data[id * 2]; | ||||
|     uvec2 input_2 = input_data[id * 2 + 1]; | ||||
|     // The number of steps is the log base 2 of the | ||||
|     // work group size, which should be a power of 2 | ||||
|     const uint steps = uint(log2(work_size)) + 1; | ||||
|     uint step = 0; | ||||
| // Subgroup-wide inclusive prefix sum of 64-bit values (uvec2: x = low word, y = high word) | ||||
| // using Hillis and Steele's algorithm. | ||||
| // value:   the element owned by this invocation. | ||||
| // returns: the sum of the elements of invocations 0..gl_SubgroupInvocationID, inclusive. | ||||
| uvec2 subgroupInclusiveAddUint64(uvec2 value) { | ||||
|     uvec2 result = value; | ||||
|     for (uint i = 1; i < gl_SubgroupSize; i *= 2) { | ||||
|         // The shuffle has to be executed by every invocation: a lane in [i, 2 * i) reads from a | ||||
|         // lane below i, and reading from a lane that is not executing the shuffle yields an | ||||
|         // undefined value. Only the accumulation itself is conditional. | ||||
|         uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; | ||||
|         if (i <= gl_SubgroupInvocationID) { | ||||
|             result = AddUint64(result, other); | ||||
|         } | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
|  | ||||
|     // Each invocation is responsible for the content of | ||||
|     // two elements of the output array | ||||
|     shared_data[id * 2] = input_1; | ||||
|     shared_data[id * 2 + 1] = input_2; | ||||
|     // Synchronize to make sure that everyone has initialized | ||||
|     // their elements of shared_data[] with data loaded from | ||||
|     // the input arrays | ||||
| // Writes this invocation's prefix-sum result to the output buffer and, from the invocation at | ||||
| // accumulation_limit, updates the accumulation buffer. | ||||
| // result: 64-bit value (x = low word, y = high word) computed for gl_GlobalInvocationID.x. | ||||
| void WriteResults(uvec2 result) { | ||||
|     uint current_global_id = gl_GlobalInvocationID.x; | ||||
|     // Only entries below max_accumulation_base get the previously accumulated value added. | ||||
|     uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     // Use the 64-bit add: a plain uvec2 `+` adds each word separately and loses the carry from | ||||
|     // the low word into the high word. | ||||
|     output_data[current_global_id] = AddUint64(result, base_data); | ||||
|     if (max_accumulation_base >= accumulation_limit + 1) { | ||||
|         if (current_global_id == accumulation_limit) { | ||||
|             accumulated_data = result; | ||||
|         } | ||||
|         return; | ||||
|     } | ||||
|     // We have that ugly case in which the accumulation data is reset in the middle somewhere. | ||||
|     // The condition above only depends on push constants, so every invocation reaches this | ||||
|     // barrier together. | ||||
|     barrier(); | ||||
|     groupMemoryBarrier(); | ||||
|     if (current_global_id == accumulation_limit) { | ||||
|         uvec2 value_1 = output_data[max_accumulation_base]; | ||||
|         // 64-bit two's complement negation (~x + 1 with carry). Unary minus on a uvec2 negates | ||||
|         // each word on its own, which leaves the high word off by one whenever the low word is | ||||
|         // non-zero. | ||||
|         uvec2 negated_value_1 = AddUint64(~value_1, uvec2(1, 0)); | ||||
|         accumulated_data = AddUint64(result, negated_value_1); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void main() { | ||||
|     uint subgroup_inv_id = gl_SubgroupInvocationID; | ||||
|     uint subgroup_id = gl_SubgroupID; | ||||
|     uint last_subgroup_id = subgroupMax(subgroup_inv_id); | ||||
|     uint current_global_id = gl_GlobalInvocationID.x; | ||||
|     uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; | ||||
|     uvec2 data = input_data[current_global_id]; | ||||
|     // make sure all input data has been loaded | ||||
|     subgroupBarrier(); | ||||
|     subgroupMemoryBarrier(); | ||||
|  | ||||
|     uvec2 result = subgroupInclusiveAddUint64(data); | ||||
|  | ||||
|     // If we had fewer queries than our subgroup size, just write down the results. | ||||
|     if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. | ||||
|         WriteResults(result); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // We now have more, so let's write the last result into shared memory. | ||||
|     // Only pick the last subgroup. | ||||
|     if (subgroup_inv_id == last_subgroup_id) { | ||||
|         shared_data[subgroup_id] = result; | ||||
|     } | ||||
|     // wait until everyone loaded their stuffs | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
|     // For each step... | ||||
|     for (step = 0; step < steps; step++) { | ||||
|         // Calculate the read and write index in the | ||||
|         // shared array | ||||
|         mask = (1 << step) - 1; | ||||
|         rd_id = ((id >> step) << (step + 1)) + mask; | ||||
|         wr_id = rd_id + 1 + (id & mask); | ||||
|         // Accumulate the read data into our element | ||||
|  | ||||
|         shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||||
|         // Synchronize again to make sure that everyone | ||||
|         // has caught up with us | ||||
|     // Case 1: the total work for the grouped results can be calculated in a single subgroup | ||||
|     // operation (about 1024 queries). | ||||
|     uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x; | ||||
|     if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch. | ||||
|         if (subgroup_id != 0) { | ||||
|             uvec2 tmp = shared_data[subgroup_inv_id]; | ||||
|             subgroupBarrier(); | ||||
|             subgroupMemoryBarrierShared(); | ||||
|             tmp = subgroupInclusiveAddUint64(tmp); | ||||
|             result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); | ||||
|         } | ||||
|  | ||||
|         WriteResults(result); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Case 2: our work amount is huge, so let's do it in O(log n) steps. | ||||
|     const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0; | ||||
|     const uint steps = 1 << (findMSB(total_extra_work) + extra); | ||||
|     uint step; | ||||
|     // Hillis and Steele's algorithm | ||||
|     for (step = 1; step < steps; step *= 2) { | ||||
|         if (current_global_id < steps && current_global_id >= step) { | ||||
|             uvec2 current = shared_data[current_global_id]; | ||||
|             uvec2 other = shared_data[current_global_id - step]; | ||||
|             shared_data[current_global_id] = AddUint64(current, other); | ||||
|         } | ||||
|         // steps is constant, so this will always execute in every workgroup's thread. | ||||
|         barrier(); | ||||
|         memoryBarrierShared(); | ||||
|     } | ||||
|     // Add the accumulation | ||||
|     shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | ||||
|     shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
|  | ||||
|     // Finally write our data back to the output buffer | ||||
|     output_data[id * 2] = shared_data[id * 2]; | ||||
|     output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | ||||
|     if (id == 0) { | ||||
|         if (max_accumulation_base >= accumulation_limit + 1) { | ||||
|             accumulated_data = shared_data[accumulation_limit]; | ||||
|             return; | ||||
|         } | ||||
|         uvec2 value_1 = shared_data[max_accumulation_base]; | ||||
|         uvec2 value_2 = shared_data[accumulation_limit]; | ||||
|         accumulated_data = AddUint64(value_1, -value_2); | ||||
|     // Only add results for groups higher than 0 | ||||
|     if (subgroup_id != 0) { | ||||
|         result = AddUint64(result, shared_data[subgroup_id - 1]); | ||||
|     } | ||||
|  | ||||
|     // Just write the final results. We are done | ||||
|     WriteResults(result); | ||||
| } | ||||
| @@ -0,0 +1,120 @@ | ||||
| // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||||
| // SPDX-License-Identifier: MIT | ||||
|  | ||||
| // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||||
| // Nicholas Haemel. Modified to suit needs. | ||||
|  | ||||
| #version 460 core | ||||
|  | ||||
| #ifdef VULKAN | ||||
|  | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||||
| #define END_PUSH_CONSTANTS }; | ||||
| #define UNIFORM(n) | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_OUTPUT_IMAGE 1 | ||||
|  | ||||
| #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||||
|  | ||||
| #extension GL_NV_gpu_shader5 : enable | ||||
| #ifdef GL_NV_gpu_shader5 | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| #else | ||||
| #define HAS_EXTENDED_TYPES 0 | ||||
| #endif | ||||
| #define BEGIN_PUSH_CONSTANTS | ||||
| #define END_PUSH_CONSTANTS | ||||
| #define UNIFORM(n) layout(location = n) uniform | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_OUTPUT_IMAGE 0 | ||||
|  | ||||
| #endif | ||||
|  | ||||
| BEGIN_PUSH_CONSTANTS | ||||
| UNIFORM(0) uint max_accumulation_base; | ||||
| UNIFORM(1) uint accumulation_limit; | ||||
| END_PUSH_CONSTANTS | ||||
|  | ||||
| layout(local_size_x = 32) in; | ||||
|  | ||||
| // Each invocation handles two elements (indices id * 2 and id * 2 + 1 in main), so both buffers | ||||
| // hold twice the work group size, matching shared_data. | ||||
| layout(std430, binding = 0) readonly buffer block1 { | ||||
|     uvec2 input_data[gl_WorkGroupSize.x * 2]; | ||||
| }; | ||||
|  | ||||
| layout(std430, binding = 1) writeonly coherent buffer block2 { | ||||
|     uvec2 output_data[gl_WorkGroupSize.x * 2]; | ||||
| }; | ||||
|  | ||||
| layout(std430, binding = 2) coherent buffer block3 { | ||||
|     uvec2 accumulated_data; | ||||
| }; | ||||
|  | ||||
| shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | ||||
|  | ||||
| // 64-bit unsigned addition for values stored as uvec2 (x = low word, y = high word). | ||||
| // The carry out of the low-word addition is propagated into the high word. | ||||
| uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||||
|     uint low_carry = 0; | ||||
|     const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry); | ||||
|     const uint high_word = value_1.y + value_2.y + low_carry; | ||||
|     return uvec2(low_word, high_word); | ||||
| } | ||||
|  | ||||
| // Inclusive prefix sum over 2 * gl_WorkGroupSize.x 64-bit values read from input_data, computed | ||||
| // in shared memory. accumulated_data is added to every entry whose index is below | ||||
| // max_accumulation_base, the results are written to output_data, and invocation 0 stores the | ||||
| // new value of accumulated_data. | ||||
| void main(void) { | ||||
|     uint id = gl_LocalInvocationID.x; | ||||
|     // Previously accumulated value, applied only to entries below max_accumulation_base. | ||||
|     uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uint work_size = gl_WorkGroupSize.x; | ||||
|     uint rd_id; | ||||
|     uint wr_id; | ||||
|     uint mask; | ||||
|     uvec2 input_1 = input_data[id * 2]; | ||||
|     uvec2 input_2 = input_data[id * 2 + 1]; | ||||
|     // The number of steps is the log base 2 of the | ||||
|     // work group size, which should be a power of 2, | ||||
|     // plus one because every invocation owns two elements | ||||
|     const uint steps = uint(log2(work_size)) + 1; | ||||
|     uint step = 0; | ||||
|  | ||||
|     // Each invocation is responsible for the content of | ||||
|     // two elements of the output array | ||||
|     shared_data[id * 2] = input_1; | ||||
|     shared_data[id * 2 + 1] = input_2; | ||||
|     // Synchronize to make sure that everyone has initialized | ||||
|     // their elements of shared_data[] with data loaded from | ||||
|     // the input arrays | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
|     // For each step... | ||||
|     for (step = 0; step < steps; step++) { | ||||
|         // Calculate the read and write index in the | ||||
|         // shared array | ||||
|         mask = (1 << step) - 1; | ||||
|         rd_id = ((id >> step) << (step + 1)) + mask; | ||||
|         wr_id = rd_id + 1 + (id & mask); | ||||
|         // Accumulate the read data into our element | ||||
|  | ||||
|         shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||||
|         // Synchronize again to make sure that everyone | ||||
|         // has caught up with us | ||||
|         barrier(); | ||||
|         memoryBarrierShared(); | ||||
|     } | ||||
|     // Add the accumulation | ||||
|     shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | ||||
|     shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
|  | ||||
|     // Finally write our data back to the output buffer | ||||
|     output_data[id * 2] = shared_data[id * 2]; | ||||
|     output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | ||||
|     // A single invocation updates the accumulation buffer | ||||
|     if (id == 0) { | ||||
|         if (max_accumulation_base >= accumulation_limit + 1) { | ||||
|             accumulated_data = shared_data[accumulation_limit]; | ||||
|             return; | ||||
|         } | ||||
|         uvec2 value_1 = shared_data[max_accumulation_base]; | ||||
|         uvec2 value_2 = shared_data[accumulation_limit]; | ||||
|         // 64-bit subtraction value_1 - value_2 through two's complement negation | ||||
|         // (~x + 1 with carry). Unary minus on a uvec2 negates each word on its own, which | ||||
|         // leaves the high word off by one whenever the low word is non-zero. | ||||
|         uvec2 negated_value_2 = AddUint64(~value_2, uvec2(1, 0)); | ||||
|         accumulated_data = AddUint64(value_1, negated_value_2); | ||||
|     } | ||||
| } | ||||
| @@ -13,6 +13,7 @@ | ||||
| #include "common/div_ceil.h" | ||||
| #include "video_core/host_shaders/astc_decoder_comp_spv.h" | ||||
| #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" | ||||
| #include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" | ||||
| #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" | ||||
| #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | ||||
| #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | ||||
| @@ -187,7 +188,8 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ||||
|                          vk::Span<VkDescriptorSetLayoutBinding> bindings, | ||||
|                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, | ||||
|                          const DescriptorBankInfo& bank_info, | ||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) | ||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, | ||||
|                          std::optional<u32> optional_subgroup_size) | ||||
|     : device{device_} { | ||||
|     descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ | ||||
|         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, | ||||
| @@ -228,13 +230,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ||||
|         .pCode = code.data(), | ||||
|     }); | ||||
|     device.SaveShader(code); | ||||
|     const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ | ||||
|         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, | ||||
|         .pNext = nullptr, | ||||
|         .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, | ||||
|     }; | ||||
|     bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; | ||||
|     pipeline = device.GetLogical().CreateComputePipeline({ | ||||
|         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, | ||||
|         .pNext = nullptr, | ||||
|         .flags = 0, | ||||
|         .stage{ | ||||
|             .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, | ||||
|             .pNext = nullptr, | ||||
|             .pNext = use_setup_size ? &subgroup_size_ci : nullptr, | ||||
|             .flags = 0, | ||||
|             .stage = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|             .module = *module, | ||||
| @@ -399,10 +407,17 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ | ||||
| QueriesPrefixScanPass::QueriesPrefixScanPass( | ||||
|     const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||||
|     ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||||
|     : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | ||||
|                   QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | ||||
|                   COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | ||||
|                   QUERIES_PREFIX_SCAN_SUM_COMP_SPV), | ||||
|     : ComputePass( | ||||
|           device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | ||||
|           QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | ||||
|           COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | ||||
|           device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && | ||||
|                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && | ||||
|                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && | ||||
|                   device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) | ||||
|               ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) | ||||
|               : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), | ||||
|           {32}), | ||||
|       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||||
|  | ||||
| void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| #include <optional> | ||||
| #include <span> | ||||
| #include <utility> | ||||
|  | ||||
| @@ -31,7 +32,8 @@ public: | ||||
|                          vk::Span<VkDescriptorSetLayoutBinding> bindings, | ||||
|                          vk::Span<VkDescriptorUpdateTemplateEntry> templates, | ||||
|                          const DescriptorBankInfo& bank_info, | ||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); | ||||
|                          vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, | ||||
|                          std::optional<u32> optional_subgroup_size = std::nullopt); | ||||
|     ~ComputePass(); | ||||
|  | ||||
| protected: | ||||
|   | ||||
| @@ -1376,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku | ||||
|             return true; | ||||
|         } | ||||
|     } | ||||
|     /*if (!is_in_bc[0] && !is_in_bc[1]) { | ||||
|     if (!is_in_bc[0] && !is_in_bc[1]) { | ||||
|         // Both queries are in query cache, it's best to just flush. | ||||
|         return true; | ||||
|     }*/ | ||||
|     } | ||||
|     HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); | ||||
|     return true; | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user