/* Copyright © 2022 Friedrich Vock
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_EXT_shader_atomic_int64 : require

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

#include "anv_build_helpers.h"
#include "anv_build_interface.h"

#define ULP 1.1920928955078125e-7f

layout(push_constant) uniform CONSTS {
   encode_args args;
};

uint64_t
get_instance_flag(uint32_t src)
{
   uint32_t flags = src & 0xff;
   return flags & 0xf;
}

void
encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, REF(anv_accel_struct_header) dst_header)
{
   switch (type) {
   case vk_ir_node_triangle: {
      REF(anv_quad_leaf_node) quad_leaf = REF(anv_quad_leaf_node)(dst_node);

      vk_ir_triangle_node src = DEREF(REF(vk_ir_triangle_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

      /* The sub-type (4 bits) is encoded starting at bit 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_QUAD & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);
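
      /* Illustrative layout of geometry_id_and_flags as assembled here and
       * below (a reading of this code, not of any HW documentation):
       *
       *    bits  0..23: geometry index
       *    bits 24..27: leaf sub-type, e.g. ANV_SUB_TYPE_QUAD
       *    bit      29: disable opacity culling
       *    bit      30: geometry opaque flag, set below for VK_GEOMETRY_OPAQUE
       */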

      /* Disable the second triangle */
      uint32_t prim_index1_delta = 0;
      /* For now, blockIncr are all 1, so every quad leaf has its "last" bit
       * set.
       */
      prim_index1_delta |= (1 << 22);

      DEREF(quad_leaf).prim_index1_delta = prim_index1_delta;

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         /* The geometry opaque flag (1 bit) is encoded at bit 30 */
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(quad_leaf).prim_index0 = src.triangle_id;
      DEREF(quad_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(quad_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* Set up the single triangle */
      for (uint32_t i = 0; i < 3; i++) {
         for (uint32_t j = 0; j < 3; j++) {
            DEREF(quad_leaf).v[i][j] = src.coords[i][j];
         }
      }
      break;
   }
   case vk_ir_node_aabb: {
      REF(anv_procedural_leaf_node) aabb_leaf = REF(anv_procedural_leaf_node)(dst_node);

      vk_ir_aabb_node src = DEREF(REF(vk_ir_aabb_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

      /* The sub-type (4 bits) is encoded starting at bit 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_PROCEDURAL & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(aabb_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(aabb_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* num primitives = 1 */
      uint32_t dw1 = 1;
      /* "last" has only 1 bit, and it is set. */
      dw1 |= (1 << 31);

      DEREF(aabb_leaf).DW1 = dw1;
      DEREF(aabb_leaf).primIndex[0] = src.primitive_id;
      break;
   }
   case vk_ir_node_instance: {
      vk_ir_instance_node src = DEREF(REF(vk_ir_instance_node)(src_node));

      REF(anv_instance_leaf) dst_instance = REF(anv_instance_leaf)(dst_node);

      REF(anv_accel_struct_header) blas_header = REF(anv_accel_struct_header)(src.base_ptr);
      uint64_t start_node_ptr = uint64_t(src.base_ptr) + DEREF(blas_header).rootNodeOffset;

      uint32_t sbt_offset_and_flags = src.sbt_offset_and_flags;

      uint32_t shader_index_and_geom_mask = 0;
      shader_index_and_geom_mask |= (src.custom_instance_and_mask & 0xff000000);
      DEREF(dst_instance).part0.shader_index_and_geom_mask = shader_index_and_geom_mask;
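
      /* Sketch of the word packed below (bit positions read off the shifts
       * in this function, not taken from a spec):
       *
       *    bits  0..23: SBT offset (instance contribution to hit-group index)
       *    bit      29: disable opacity culling
       *    bit      30: opaque, when the instance forces opaque geometry
       */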

      uint32_t instance_contribution_and_geom_flags = 0;
      instance_contribution_and_geom_flags |= src.sbt_offset_and_flags & 0xffffff;
      instance_contribution_and_geom_flags |= (1 << 29);
      instance_contribution_and_geom_flags |=
         (get_instance_flag(src.sbt_offset_and_flags >> 24) == ANV_INSTANCE_FLAG_FORCE_OPAQUE ?
          ANV_GEOMETRY_FLAG_OPAQUE : 0) << 30;
      DEREF(dst_instance).part0.instance_contribution_and_geom_flags =
         instance_contribution_and_geom_flags;

      uint32_t instance_flags = DEREF(blas_header).instance_flags;
      if (((sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR)) != 0) {
         instance_flags &= ~(VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                             VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
         instance_flags |= (sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
      }

      DEREF(dst_instance).part0.start_node_ptr_and_inst_flags =
         start_node_ptr |
         (get_instance_flag(instance_flags | (src.sbt_offset_and_flags >> 24)) << 48);

      mat4 transform = mat4(src.otw_matrix);

      mat4 inv_transform = transpose(inverse(transpose(transform)));
      mat3x4 wto_matrix = mat3x4(inv_transform);
      mat3x4 otw_matrix = mat3x4(transform);

      /* Arrange WTO transformation matrix in column-major order */
      DEREF(dst_instance).part0.world2obj_vx_x = wto_matrix[0][0];
      DEREF(dst_instance).part0.world2obj_vx_y = wto_matrix[1][0];
      DEREF(dst_instance).part0.world2obj_vx_z = wto_matrix[2][0];
      DEREF(dst_instance).part0.obj2world_p_x = otw_matrix[0][3];

      DEREF(dst_instance).part0.world2obj_vy_x = wto_matrix[0][1];
      DEREF(dst_instance).part0.world2obj_vy_y = wto_matrix[1][1];
      DEREF(dst_instance).part0.world2obj_vy_z = wto_matrix[2][1];
      DEREF(dst_instance).part0.obj2world_p_y = otw_matrix[1][3];

      DEREF(dst_instance).part0.world2obj_vz_x = wto_matrix[0][2];
      DEREF(dst_instance).part0.world2obj_vz_y = wto_matrix[1][2];
      DEREF(dst_instance).part0.world2obj_vz_z = wto_matrix[2][2];
      DEREF(dst_instance).part0.obj2world_p_z = otw_matrix[2][3];

      /* Arrange OTW transformation matrix in column-major order */
      DEREF(dst_instance).part1.obj2world_vx_x = otw_matrix[0][0];
      DEREF(dst_instance).part1.obj2world_vx_y = otw_matrix[1][0];
      DEREF(dst_instance).part1.obj2world_vx_z = otw_matrix[2][0];
      DEREF(dst_instance).part1.world2obj_p_x = wto_matrix[0][3];

      DEREF(dst_instance).part1.obj2world_vy_x = otw_matrix[0][1];
      DEREF(dst_instance).part1.obj2world_vy_y = otw_matrix[1][1];
      DEREF(dst_instance).part1.obj2world_vy_z = otw_matrix[2][1];
      DEREF(dst_instance).part1.world2obj_p_y = wto_matrix[1][3];

      DEREF(dst_instance).part1.obj2world_vz_x = otw_matrix[0][2];
      DEREF(dst_instance).part1.obj2world_vz_y = otw_matrix[1][2];
      DEREF(dst_instance).part1.obj2world_vz_z = otw_matrix[2][2];
      DEREF(dst_instance).part1.world2obj_p_z = wto_matrix[2][3];

      DEREF(dst_instance).part1.bvh_ptr = src.base_ptr;
      DEREF(dst_instance).part1.instance_index = src.instance_id;
      DEREF(dst_instance).part1.instance_id = src.custom_instance_and_mask & 0xffffff;

      uint64_t instance_leaves_addr_base = args.output_bvh - args.output_bvh_offset + ANV_RT_BVH_HEADER_SIZE;
      uint64_t cnt = atomicAdd(DEREF(dst_header).instance_count, 1);
      DEREF(INDEX(uint64_t, instance_leaves_addr_base, cnt)) = dst_node;
      break;
   }
   }
}

vk_aabb
conservative_aabb(vk_aabb input_aabb)
{
   vk_aabb out_aabb;

   vec3 reduce_value = max(abs(input_aabb.min), abs(input_aabb.max));
   float err = ULP * max(reduce_value.x, max(reduce_value.y, reduce_value.z));

   out_aabb.min = input_aabb.min - vec3(err);
   out_aabb.max = input_aabb.max + vec3(err);

   return out_aabb;
}
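
/* Worked example for the padding above (values are illustrative): if the
 * largest coordinate magnitude of an AABB is 100.0, then
 * err = ULP * 100.0 ~= 1.19e-5, and every bound is pushed outward by that
 * amount. The padding scales with the magnitude of the coordinates, which
 * keeps the box conservative under floating-point rounding.
 */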

void
aabb_extend(inout vk_aabb v1, vk_aabb v2)
{
   v1.min = min(v1.min, v2.min);
   v1.max = max(v1.max, v2.max);
}

vec3
aabb_size(vk_aabb input_aabb)
{
   return input_aabb.max - input_aabb.min;
}

/* Determine the node type based on the types of its children.
 * If the children are all leaves of the same type, this internal node is a
 * fat leaf; otherwise, it's a mixed node.
 */
uint8_t
determine_internal_node_type(uint32_t children[6], uint child_count)
{
   if (child_count == 0)
      return uint8_t(ANV_NODE_TYPE_INVALID);

   uint32_t type_of_first_child = ir_id_to_type(children[0]);
   for (uint32_t i = 1; i < child_count; ++i) {
      uint32_t type = ir_id_to_type(children[i]);
      if (type != type_of_first_child) {
         return uint8_t(ANV_NODE_TYPE_MIXED);
      }
   }

   /* All children have the same type. Now check what type they are. */
   switch (type_of_first_child) {
   case vk_ir_node_triangle:
      return uint8_t(ANV_NODE_TYPE_QUAD);
   case vk_ir_node_aabb:
      return uint8_t(ANV_NODE_TYPE_PROCEDURAL);
   case vk_ir_node_instance:
      return uint8_t(ANV_NODE_TYPE_INSTANCE);
   case vk_ir_node_internal:
      return uint8_t(ANV_NODE_TYPE_MIXED);
   default:
      return uint8_t(ANV_NODE_TYPE_INVALID);
   }
}

vk_aabb
quantize_bounds(vk_aabb aabb, vec3 base, i8vec3 exp)
{
   vk_aabb quant_aabb;
   vec3 lower = aabb.min - base;
   vec3 upper = aabb.max - base;

   vec3 qlower = ldexp(lower, -exp + 8);
   vec3 qupper = ldexp(upper, -exp + 8);

   qlower = min(max(floor(qlower), vec3(0.0)), vec3(255.0));
   qupper = min(max(ceil(qupper), vec3(0.0)), vec3(255.0));

   quant_aabb.min = qlower;
   quant_aabb.max = qupper;

   return quant_aabb;
}

void
encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_internal_node, uint child_count,
                     vec3 min_offset, vec3 max_offset, uint32_t bvh_block_offset)
{
   REF(anv_internal_node) dst_node =
      REF(anv_internal_node)(OFFSET(args.output_bvh, ANV_RT_BLOCK_SIZE * bvh_block_offset));

   DEREF(dst_node).child_block_offset = child_block_offset_from_internal_node;

   vk_aabb box;
   box.min = min_offset;
   box.max = max_offset;

   vk_aabb conservative_child_aabb = conservative_aabb(box);
   DEREF(dst_node).lower[0] = conservative_child_aabb.min.x;
   DEREF(dst_node).lower[1] = conservative_child_aabb.min.y;
   DEREF(dst_node).lower[2] = conservative_child_aabb.min.z;

   float up = 1.0 + ULP;
   ivec3 exp;

   vec3 len = aabb_size(conservative_child_aabb) * up;
   vec3 mant = frexp(len, exp);

   exp.x += int((mant.x > (255.0f / 256.0f)));
   exp.y += int((mant.y > (255.0f / 256.0f)));
   exp.z += int((mant.z > (255.0f / 256.0f)));

   i8vec3 exponent_i8 = i8vec3(exp);
   DEREF(dst_node).exp_x = max(int8_t(-128), exponent_i8.x);
   DEREF(dst_node).exp_y = max(int8_t(-128), exponent_i8.y);
   DEREF(dst_node).exp_z = max(int8_t(-128), exponent_i8.z);

   i8vec3 exp_i8 = i8vec3(DEREF(dst_node).exp_x, DEREF(dst_node).exp_y, DEREF(dst_node).exp_z);
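
   /* Numerical sketch of the quantization (values are illustrative): if the
    * node extent on x is len.x == 10.0, frexp() yields mant.x == 0.625 and
    * exp.x == 4, so child bounds on x are stored as 8-bit offsets from the
    * node origin in units of 2^(exp.x - 8) == 1/16, rounded outward by
    * quantize_bounds().
    */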

   DEREF(dst_node).node_mask = uint8_t(0xff);
   DEREF(dst_node).node_type = determine_internal_node_type(children, child_count);

   for (uint32_t i = 0; i < 6; i++) {
      if (i < child_count) {
         uint32_t type = ir_id_to_type(children[i]);
         /* blockIncr and child_block_offset are how the HW finds children
          * during traversal. If they are not set properly, the GPU could
          * hang.
          */
         DEREF(dst_node).data[i].block_incr_and_start_prim =
            type == vk_ir_node_instance ? uint8_t(2) : uint8_t(1);

         uint32_t offset = ir_id_to_offset(children[i]);

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         child_aabb = conservative_aabb(child_aabb);

         vk_aabb quantize_aabb = quantize_bounds(child_aabb, conservative_child_aabb.min, exp_i8);

         DEREF(dst_node).lower_x[i] = uint8_t(quantize_aabb.min.x);
         DEREF(dst_node).lower_y[i] = uint8_t(quantize_aabb.min.y);
         DEREF(dst_node).lower_z[i] = uint8_t(quantize_aabb.min.z);
         DEREF(dst_node).upper_x[i] = uint8_t(quantize_aabb.max.x);
         DEREF(dst_node).upper_y[i] = uint8_t(quantize_aabb.max.y);
         DEREF(dst_node).upper_z[i] = uint8_t(quantize_aabb.max.z);

         /* For a mixed node, encode the type of each child in startPrim in
          * the child data.
          */
         if (DEREF(dst_node).node_type == uint8_t(ANV_NODE_TYPE_MIXED)) {
            uint32_t type = ir_id_to_type(children[i]);
            switch (type) {
            case vk_ir_node_triangle:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_QUAD) << 2);
               break;
            case vk_ir_node_aabb:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_PROCEDURAL) << 2);
               break;
            case vk_ir_node_instance:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INSTANCE) << 2);
               break;
            case vk_ir_node_internal:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_MIXED) << 2);
               break;
            }
         }
      } else {
         /* Invalid child nodes: for invalid child nodes, the MSBs of the
          * lower and upper x planes are flipped. In other words:
          *
          *    bool valid(int i) const {
          *       return !(lower_x[i] & 0x80) || (upper_x[i] & 0x80);
          *    }
          */
         DEREF(dst_node).lower_x[i] = uint8_t(0x80);
         DEREF(dst_node).lower_y[i] = uint8_t(0);
         DEREF(dst_node).lower_z[i] = uint8_t(0);
         DEREF(dst_node).upper_x[i] = uint8_t(0);
         DEREF(dst_node).upper_y[i] = uint8_t(0);
         DEREF(dst_node).upper_z[i] = uint8_t(0);

         /* In case the HW also references blockIncr to do something, we zero
          * out the data.
          */
         DEREF(dst_node).data[i].block_incr_and_start_prim = uint8_t(0);
         DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INVALID) << 2);
      }
   }
}

void
main()
{
   /* encode.comp is dispatched through an indirect dispatch with a
    * calculated groupCountX, but we can still overdispatch invocations, so
    * we need a guard here.
    *
    * Also, we can't support more than 0xFFFFFFFF internal nodes due to the
    * SW limit we enforce on the indirect workgroup count for signaling.
    */
   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count ||
       DEREF(args.header).ir_internal_node_count > 0xFFFFFFFF)
      return;

   /* Each lane processes one vk_ir_node_internal. The root node sits at the
    * end of the IR BVH, and we let the lane with
    * gl_GlobalInvocationID.x == 0 take care of it.
    */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;
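
   /* For example, with ir_internal_node_count == 4, invocations 0..3 map to
    * IR internal nodes 3..0, so invocation 0 picks up the root node at the
    * end of the IR BVH.
    */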

   uint32_t intermediate_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_triangle_node);
      break;
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_aabb_node);
      break;
   default: /* instances */
      intermediate_leaf_node_size = SIZEOF(vk_ir_instance_node);
      break;
   }

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size;

   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
   vk_ir_box_node src = DEREF(src_node);

   bool is_root_node = gl_GlobalInvocationID.x == 0;

   REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(args.output_bvh - args.output_bvh_offset);

   if (is_root_node) {
      DEREF(header).instance_flags =
         (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR ? ANV_INSTANCE_ALL_AABB : 0) |
         /* These will be removed when processing leaf nodes */
         ANV_INSTANCE_FLAG_FORCE_OPAQUE | ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE;

      /* Indicate where the next children should be encoded. The offset is
       * measured in 64B blocks and starts from output_bvh.
       */
      DEREF(args.header).dst_node_offset = 1;

      DEREF(header).instance_count = 0;
   }

   for (;;) {
      /* Make changes to the current node's BVH offset value visible. */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      /* Indicates where this internal node should be encoded. The offset is
       * measured in 64B blocks and starts from output_bvh.
       */
      uint32_t bvh_block_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset;

      /* The invocation that processes this node spins while its parent
       * hasn't told it its bvh_offset yet.
       */
      if (bvh_block_offset == VK_UNKNOWN_BVH_OFFSET)
         continue;

      if (bvh_block_offset == VK_NULL_BVH_OFFSET)
         break;

      uint32_t found_child_count = 0;
      uint32_t children[6] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE};

      /* Initially, this node can have at most two children (which can be
       * internal nodes or leaves).
       */
      for (uint32_t i = 0; i < 2; ++i)
         if (src.children[i] != VK_BVH_INVALID_NODE)
            children[found_child_count++] = src.children[i];

      /* For this node, try to collapse the binary node into up to 6-ary
       * children.
       */
      while (found_child_count < 6) {
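         /* Each iteration below replaces one internal child with that
          * child's own children, growing found_child_count by at most one,
          * so the loop terminates once the node is 6-wide or no internal
          * children remain.
          */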
         /* In each iteration, find the vk_ir_node_internal child with the
          * largest surface area.
          */
         int32_t collapsed_child_index = -1;
         float largest_surface_area = -INFINITY;

         for (int32_t i = 0; i < found_child_count; ++i) {
            /* If a child is a leaf (not vk_ir_node_internal), there's no
             * need to collapse it.
             */
            if (ir_id_to_type(children[i]) != vk_ir_node_internal)
               continue;

            vk_aabb bounds =
               DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh,
                                           ir_id_to_offset(children[i]))).aabb;

            float surface_area = aabb_surface_area(bounds);
            if (surface_area > largest_surface_area) {
               largest_surface_area = surface_area;
               collapsed_child_index = i;
            }
         }

         if (collapsed_child_index != -1) {
            /* Once I've found a good vk_ir_node_internal child, try to
             * connect myself to this child's children, i.e. my
             * grandchildren. Grandchildren can be internal nodes or leaves.
             */
            REF(vk_ir_box_node) child_node =
               REF(vk_ir_box_node)OFFSET(args.intermediate_bvh,
                                         ir_id_to_offset(children[collapsed_child_index]));
            uint32_t grandchildren[2] = DEREF(child_node).children;
            uint32_t valid_grandchild_count = 0;

            if (grandchildren[1] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;

            if (grandchildren[0] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;
            else
               grandchildren[0] = grandchildren[1];

            /* The grandchildren now become my direct children, and can
             * possibly be collapsed in the next iteration if
             * found_child_count has not reached 6.
             */
            if (valid_grandchild_count > 1)
               children[found_child_count++] = grandchildren[1];

            if (valid_grandchild_count > 0)
               children[collapsed_child_index] = grandchildren[0];
            else {
               /* If this child doesn't have valid children, I don't consider
                * it my child anymore. This is possible depending on how and
                * when the lbvh/ploc algorithm marks a node as
                * VK_BVH_INVALID_NODE.
                */
               found_child_count--;
               children[collapsed_child_index] = children[found_child_count];
            }

            /* Collapsing is done, so mark this collapsed internal node as
             * NULL; whichever lane would have processed it will then return.
             */
            DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
         } else
            break;
      }

      /* Count the number of blocks the children need. Each instance child
       * found contributes 2 blocks to dst_node_offset; every other child
       * contributes 1.
       */
      uint32_t num_blocks_to_add = 0;
      for (uint32_t i = 0; i < found_child_count; ++i) {
         uint32_t type = ir_id_to_type(children[i]);
         num_blocks_to_add += (type == vk_ir_node_instance) ? 2 : 1;
      }
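
      /* For example, 3 children of which one is an instance give
       * num_blocks_to_add == 1 + 1 + 2 == 4, so the children occupy 4
       * consecutive 64B blocks starting at the offset reserved below.
       */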
      /* Used for finding where to encode the children. Also, update
       * dst_node_offset so other invocations know where to start encoding.
       */
      uint32_t child_block_offset_from_output_bvh = atomicAdd(DEREF(args.header).dst_node_offset, num_blocks_to_add);

      /* This is one of the pieces of information needed in
       * anv_internal_node.
       */
      uint32_t child_block_offset_from_internal_node = child_block_offset_from_output_bvh - bvh_block_offset;

      vec3 min_offset = vec3(INFINITY);
      vec3 max_offset = vec3(-INFINITY);
      for (uint32_t i = 0; i < found_child_count; ++i) {
         /* Retrieve the type and location of the child from the IR BVH */
         uint32_t type = ir_id_to_type(children[i]);
         uint32_t offset = ir_id_to_offset(children[i]);

         if (type == vk_ir_node_internal) {
            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
            DEREF(child_node).bvh_offset = child_block_offset_from_output_bvh;
         } else {
            encode_leaf_node(type, args.intermediate_bvh + offset,
                             args.output_bvh + ANV_RT_BLOCK_SIZE * child_block_offset_from_output_bvh,
                             header);
         }

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         min_offset = min(min_offset, child_aabb.min);
         max_offset = max(max_offset, child_aabb.max);

         child_block_offset_from_output_bvh += (type == vk_ir_node_instance) ? 2 : 1;
      }

      /* Make changes to the children's BVH offset values available to the
       * other invocations.
       */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      encode_internal_node(children, child_block_offset_from_internal_node,
                           found_child_count, min_offset, max_offset, bvh_block_offset);

      break;
   }

   if (is_root_node) {
      DEREF(header).aabb = src.base.aabb;
      DEREF(header).rootNodeOffset = args.output_bvh_offset;
   }
}