// Copyright (c) 2020-2023 Huawei Technologies Co. Ltd. // // SPDX-License-Identifier: CC-BY-4.0 include::{generated}/meta/{refprefix}VK_HUAWEI_cluster_culling_shader.adoc[] === Other Extension Metadata *Last Modified Date*:: 2023-08-16 *Interactions and External Dependencies*:: - This extension provides API support for {GLSLregistry}/huawei/GLSL_HUAWEI_cluster_culling_shader.txt[`GL_HUAWEI_cluster_culling_shader`]. *Contributors*:: - Yuchang Wang, Huawei - Juntao Li, Huawei - Pan Gao, Huawei - Jie Cao, Huawei - Yunjin Zhang, Huawei - Shujie Zhou, Huawei - Chaojun Wang, Huawei - Jiajun Hu, Huawei - Cong Zhang, Huawei === Description Cluster Culling Shaders (CCS) are similar to the existing compute shaders. Their main purpose is to provide an execution environment in order to perform coarse-level geometry culling and LOD selection more efficiently on the GPU. The traditional 2-pass GPU culling solution using a compute shader sometimes needs a pipeline barrier between the compute and graphics pipelines to optimize performance. An additional compaction process may also be required. This extension addresses these shortcomings, allowing compute shaders to directly emit visible clusters to the following graphics pipeline. A set of new built-in output variables is used to express a visible cluster, including per-cluster shading rate. In addition, a new built-in function is used to emit these variables from CCS to the IA stage. The IA stage can use these variables to fetch vertices of a visible cluster and drive vertex shaders to shade these vertices. Note that CCS do not work with geometry or tessellation shaders, but both IA and vertex shaders are preserved. Vertex shaders are still used for vertex position shading, instead of directly outputting transformed vertices from the compute shader. This makes CCS more suitable for mobile GPUs. 
include::{generated}/interfaces/VK_HUAWEI_cluster_culling_shader.adoc[]

=== New Built-In Variables

  * <>
  * <>
  * <>
  * <>
  * <>
  * <>
  * <>
  * <>
  * <>

=== New SPIR-V Capability

  * <>

=== Sample Code

Example of cluster culling in a GLSL shader

[source,c]
----
#extension GL_HUAWEI_cluster_culling_shader: enable

#define GPU_WARP_SIZE                   32
#define GPU_GROUP_SIZE                  GPU_WARP_SIZE

#define GPU_CLUSTER_PER_INVOCATION      1
#define GPU_CLUSTER_PER_WORKGROUP       (GPU_GROUP_SIZE * GPU_CLUSTER_PER_INVOCATION)

// Number of threads per workgroup
// - 1D only
// - warpsize = 32
layout(local_size_x=GPU_GROUP_SIZE, local_size_y=1, local_size_z=1) in;

#define GPU_DRAW_BUFFER_BINDING             0
#define GPU_INSTANCE_DESCRIPTOR_BINDING     1

struct BoundingSphere
{
  vec3 center;
  float radius;
};

struct InstanceData
{
  mat4 mvp_matrix;                      // mvp matrix.
  vec4 frustum_planes[6];               // six frustum planes
  mat4 model_matrix_transpose_inverse;  // inverse transpose of model matrix.
  vec3 view_origin;                     // view origin
};

struct InstanceDescriptor
{
  uint begin;
  uint end;
  uint cluster_count;
  uint debug;
  BoundingSphere sphere;
  InstanceData instance_data;
};

struct DrawElementsCommand{
  uint indexcount;
  uint instanceCount;
  uint firstIndex;
  int  vertexoffset;
  uint firstInstance;
  uint cluster_id;
};

// indexed mode
out gl_PerClusterHUAWEI{
  uint gl_IndexCountHUAWEI;
  uint gl_InstanceCountHUAWEI;
  uint gl_FirstIndexHUAWEI;
  int  gl_VertexOffsetHUAWEI;
  uint gl_FirstInstanceHUAWEI;
  uint gl_ClusterIDHUAWEI;
  uint gl_ClusterShadingRateHUAWEI;
};

layout(binding = GPU_DRAW_BUFFER_BINDING, std430) buffer draw_indirect_ssbo
{
  DrawElementsCommand draw_commands[];
};

layout(binding = GPU_INSTANCE_DESCRIPTOR_BINDING, std430) buffer instance_descriptor_ssbo
{
  InstanceDescriptor instance_descriptors[];
};

// Returns the distance between the view origin and the bounding sphere center
// of the descriptor at index instance_id in instance_descriptors.
float Distance(uint instance_id)
{
  vec3 v = instance_descriptors[instance_id].sphere.center -
           instance_descriptors[instance_id].instance_data.view_origin;
  // Do not normalize v before measuring it: the length of a normalized vector
  // is always 1, which would make every cluster select the same shading rate.
  return length(v);
}

// Returns true when the sphere lies entirely behind at least one of the six
// frustum planes of the descriptor at index instance_id.
// NOTE(review): assumes each plane is stored as (normal.xyz, d) with the normal
// pointing into the frustum, in the same space as sphere_center -- confirm
// against the code that fills frustum_planes.
bool isSphereOutsideFrustum( uint instance_id, vec3 sphere_center, float sphere_radius )
{
  bool isOutside = false;

  for(int i = 0; i < 6; i++)
  {
    vec4 plane = instance_descriptors[instance_id].instance_data.frustum_planes[i];
    // A signed distance below -radius means the whole sphere is behind the plane.
    isOutside = isOutside || (dot(plane.xyz, sphere_center) + plane.w < -sphere_radius);
  }
  return isOutside;
}

void main()
{
  // get instance description
  uint instance_id = gl_GlobalInvocationID.x;
  InstanceDescriptor inst_desc = instance_descriptors[instance_id];

  // GPU_CLUSTER_PER_INVOCATION is 1, so each invocation handles one cluster.
  // NOTE(review): assumes draw_commands is indexed the same way as
  // instance_descriptors -- adapt this mapping to the application's layout.
  uint cluster_id = instance_id;

  //instance based culling
  bool render = !isSphereOutsideFrustum(instance_id, inst_desc.sphere.center, inst_desc.sphere.radius);

  if (render)
  {
    // calculate distance
    float view_distance = Distance(instance_id);

    // update shading rate built-in variable
    // (the thresholds are sample values in the units of the sphere center
    // and view origin coordinates)
    if(view_distance > 0.7)
      gl_ClusterShadingRateHUAWEI =
        gl_ShadingRateFlag4VerticalPixelsEXT | gl_ShadingRateFlag4HorizontalPixelsEXT;
    else if(view_distance > 0.3)
      gl_ClusterShadingRateHUAWEI =
        gl_ShadingRateFlag2VerticalPixelsEXT | gl_ShadingRateFlag2HorizontalPixelsEXT;
    else
      gl_ClusterShadingRateHUAWEI = 0;

    // this is a visible cluster, update built-in output variable.
    // in case of indexed mode:
    gl_IndexCountHUAWEI     = draw_commands[cluster_id].indexcount;
    gl_InstanceCountHUAWEI  = draw_commands[cluster_id].instanceCount;
    gl_FirstIndexHUAWEI     = draw_commands[cluster_id].firstIndex;
    gl_VertexOffsetHUAWEI   = draw_commands[cluster_id].vertexoffset;
    gl_FirstInstanceHUAWEI  = draw_commands[cluster_id].firstInstance;
    gl_ClusterIDHUAWEI      = draw_commands[cluster_id].cluster_id;

    // emit built-in output variables as a drawing command to subsequent
    // rendering pipeline.
    dispatchClusterHUAWEI();
  }
}
----

Example of graphics pipeline creation with cluster culling shader

[source,c]
----
// create a cluster culling shader stage info structure.
VkPipelineShaderStageCreateInfo ccsStageInfo{}; ccsStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; ccsStageInfo.stage = VK_SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI; ccsStageInfo.module = clustercullingshaderModule; ccsStageInfo.pName = "main"; // pipeline shader stage creation VkPipelineShaderStageCreateInfo shaderStages[] = { ccsStageInfo, vertexShaderStageInfo, fragmentShaderStageInfo }; // create graphics pipeline VkGraphicsPipelineCreateInfo pipelineInfo{}; pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; pipelineInfo.stageCount = 3; pipelineInfo.pStage = shaderStages; pipelineInfo.pVertexInputState = &vertexInputInfo; // ... VkPipeline graphicsPipeline; VkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline); ---- Example of launching the execution of cluster culling shader [source,c] ---- vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline); vkCmdDrawClusterHUAWEI(commandBuffer, groupCountX, 1, 1); vkCmdEndRenderPass(commandBuffer); ---- === Version History * Revision 1, 2022-11-18 (YuChang Wang) ** Internal revisions * Revision 2, 2023-04-02 (Jon Leech) ** Grammar edits. * Revision 3, 2023-08-21 (YuChang Wang) ** Add per-cluster shading rate.