/* Copyright © 2022 Friedrich Vock
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_EXT_shader_atomic_int64 : require

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

#include "anv_build_helpers.h"
#include "anv_build_interface.h"

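/* FLT_EPSILON (2^-23), i.e. one ULP of a float32 in [1, 2); used below to
 * pad AABBs conservatively against floating-point rounding.
 */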
#define ULP 1.1920928955078125e-7f

layout(push_constant) uniform CONSTS {
   encode_args args;
};

uint64_t
get_instance_flag(uint32_t src)
{
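   /* Extract the low 4 instance-flag bits from the flags byte. */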
   uint32_t flags = src & 0xff;
   return flags & 0xf;
}

void
encode_leaf_node(uint32_t type, uint64_t src_node, uint64_t dst_node, REF(anv_accel_struct_header) dst_header)
{
   switch (type) {
   case vk_ir_node_triangle: {
      REF(anv_quad_leaf_node) quad_leaf = REF(anv_quad_leaf_node)(dst_node);

      vk_ir_triangle_node src = DEREF(REF(vk_ir_triangle_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

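      /* Bit layout of geometry_id_and_flags as assembled below:
       *   [23:0]  geometry index
       *   [27:24] leaf sub-type
       *   [29]    disable opacity culling
       *   [30]    geometry opaque flag
       */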
      /* The sub-type (4 bits) is encoded starting at bit 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_QUAD & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);

      /* Disable the second triangle */
      uint32_t prim_index1_delta = 0;
      /* For now, blockIncr values are all 1, so every quad leaf has its "last" bit set. */
      prim_index1_delta |= (1 << 22);

      DEREF(quad_leaf).prim_index1_delta = prim_index1_delta;

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         /* The geometry-opaque bit is encoded at bit 30 */
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(quad_leaf).prim_index0 = src.triangle_id;
      DEREF(quad_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(quad_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* Set up the single triangle */
      for (uint32_t i = 0; i < 3; i++) {
         for (uint32_t j = 0; j < 3; j++) {
            DEREF(quad_leaf).v[i][j] = src.coords[i][j];
         }
      }
      break;
   }
   case vk_ir_node_aabb: {
      REF(anv_procedural_leaf_node) aabb_leaf = REF(anv_procedural_leaf_node)(dst_node);

      vk_ir_aabb_node src = DEREF(REF(vk_ir_aabb_node)(src_node));
      uint32_t geometry_id_and_flags = src.geometry_id_and_flags & 0xffffff;

      /* The sub-type (4 bits) is encoded starting at bit 24 */
      geometry_id_and_flags |= (ANV_SUB_TYPE_PROCEDURAL & 0xF) << 24;
      /* Disable opacity culling by default */
      geometry_id_and_flags |= (1 << 29);

      if ((src.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0) {
         geometry_id_and_flags |= (ANV_GEOMETRY_FLAG_OPAQUE << 30);
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE);
      } else {
         atomicAnd(DEREF(dst_header).instance_flags,
                   ~ANV_INSTANCE_FLAG_FORCE_OPAQUE);
      }

      DEREF(aabb_leaf).leaf_desc.geometry_id_and_flags = geometry_id_and_flags;

      /* shaderIndex is typically set to match geomIndex.
       * The geometry mask defaults to 0xFF.
       */
      DEREF(aabb_leaf).leaf_desc.shader_index_and_geom_mask = 0xFF000000 | (geometry_id_and_flags & 0xffffff);

      /* num primitives = 1 */
      uint32_t dw1 = 1;
      /* The "last" field is a single bit, and it is set. */
      dw1 |= (1 << 31);

      DEREF(aabb_leaf).DW1 = dw1;
      DEREF(aabb_leaf).primIndex[0] = src.primitive_id;
      break;
   }
   case vk_ir_node_instance: {
      vk_ir_instance_node src = DEREF(REF(vk_ir_instance_node)(src_node));

      REF(anv_instance_leaf) dst_instance = REF(anv_instance_leaf)(dst_node);

      REF(anv_accel_struct_header) blas_header = REF(anv_accel_struct_header)(src.base_ptr);
      uint64_t start_node_ptr = uint64_t(src.base_ptr) + DEREF(blas_header).rootNodeOffset;

      uint32_t sbt_offset_and_flags = src.sbt_offset_and_flags;

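      /* The geometry mask goes in the top byte; the shader index stays 0. */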
      uint32_t shader_index_and_geom_mask = 0;
      shader_index_and_geom_mask |= (src.custom_instance_and_mask & 0xff000000);
      DEREF(dst_instance).part0.shader_index_and_geom_mask = shader_index_and_geom_mask;

      uint32_t instance_contribution_and_geom_flags = 0;
      instance_contribution_and_geom_flags |= src.sbt_offset_and_flags & 0xffffff;
      instance_contribution_and_geom_flags |= (1 << 29);
      instance_contribution_and_geom_flags |=
         (get_instance_flag(src.sbt_offset_and_flags >> 24) == ANV_INSTANCE_FLAG_FORCE_OPAQUE ?
          ANV_GEOMETRY_FLAG_OPAQUE : 0) << 30;
      DEREF(dst_instance).part0.instance_contribution_and_geom_flags =
         instance_contribution_and_geom_flags;

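      /* Resolve the final instance flags: start from the flags aggregated in
       * the BLAS header and let per-instance force-opaque/force-no-opaque
       * bits override them.
       */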
      uint32_t instance_flags = DEREF(blas_header).instance_flags;
      if (((sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR)) != 0) {
         instance_flags &= ~(VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                             VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
         instance_flags |= (sbt_offset_and_flags >> 24) & (VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR |
                                                           VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR);
      }

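      /* Pack the BLAS start node pointer (low 48 bits) together with the
       * resolved instance flags (bits 48 and up).
       */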
      DEREF(dst_instance).part0.start_node_ptr_and_inst_flags =
         start_node_ptr |
         (get_instance_flag(instance_flags | (src.sbt_offset_and_flags >> 24)) << 48);

      mat4 transform = mat4(src.otw_matrix);

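      /* The world-to-object transform is the inverse of the object-to-world
       * transform; transpose(inverse(transpose(M))) == inverse(M) for an
       * invertible mat4.
       */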
      mat4 inv_transform = transpose(inverse(transpose(transform)));
      mat3x4 wto_matrix = mat3x4(inv_transform);
      mat3x4 otw_matrix = mat3x4(transform);

      /* Arrange the WTO transformation matrix in column-major order */
      DEREF(dst_instance).part0.world2obj_vx_x = wto_matrix[0][0];
      DEREF(dst_instance).part0.world2obj_vx_y = wto_matrix[1][0];
      DEREF(dst_instance).part0.world2obj_vx_z = wto_matrix[2][0];
      DEREF(dst_instance).part0.obj2world_p_x =  otw_matrix[0][3];

      DEREF(dst_instance).part0.world2obj_vy_x = wto_matrix[0][1];
      DEREF(dst_instance).part0.world2obj_vy_y = wto_matrix[1][1];
      DEREF(dst_instance).part0.world2obj_vy_z = wto_matrix[2][1];
      DEREF(dst_instance).part0.obj2world_p_y =  otw_matrix[1][3];

      DEREF(dst_instance).part0.world2obj_vz_x = wto_matrix[0][2];
      DEREF(dst_instance).part0.world2obj_vz_y = wto_matrix[1][2];
      DEREF(dst_instance).part0.world2obj_vz_z = wto_matrix[2][2];
      DEREF(dst_instance).part0.obj2world_p_z =  otw_matrix[2][3];

      /* Arrange the OTW transformation matrix in column-major order */
      DEREF(dst_instance).part1.obj2world_vx_x = otw_matrix[0][0];
      DEREF(dst_instance).part1.obj2world_vx_y = otw_matrix[1][0];
      DEREF(dst_instance).part1.obj2world_vx_z = otw_matrix[2][0];
      DEREF(dst_instance).part1.world2obj_p_x =  wto_matrix[0][3];

      DEREF(dst_instance).part1.obj2world_vy_x = otw_matrix[0][1];
      DEREF(dst_instance).part1.obj2world_vy_y = otw_matrix[1][1];
      DEREF(dst_instance).part1.obj2world_vy_z = otw_matrix[2][1];
      DEREF(dst_instance).part1.world2obj_p_y =  wto_matrix[1][3];

      DEREF(dst_instance).part1.obj2world_vz_x = otw_matrix[0][2];
      DEREF(dst_instance).part1.obj2world_vz_y = otw_matrix[1][2];
      DEREF(dst_instance).part1.obj2world_vz_z = otw_matrix[2][2];
      DEREF(dst_instance).part1.world2obj_p_z =  wto_matrix[2][3];

      DEREF(dst_instance).part1.bvh_ptr = src.base_ptr;
      DEREF(dst_instance).part1.instance_index = src.instance_id;
      DEREF(dst_instance).part1.instance_id = src.custom_instance_and_mask & 0xffffff;

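      /* Append this leaf's address to the list of instance leaves that
       * follows the acceleration structure header.
       */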
      uint64_t instance_leaves_addr_base = args.output_bvh - args.output_bvh_offset + ANV_RT_BVH_HEADER_SIZE;
      uint64_t cnt = atomicAdd(DEREF(dst_header).instance_count, 1);
      DEREF(INDEX(uint64_t, instance_leaves_addr_base, cnt)) = dst_node;
      break;
   }
   }
}

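/* Widen a box by one ULP of its largest-magnitude coordinate so that later
 * rounding and quantization can never shrink it below the true bounds.
 */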
vk_aabb
conservative_aabb(vk_aabb input_aabb)
{
   vk_aabb out_aabb;

   vec3 reduce_value = max(abs(input_aabb.min), abs(input_aabb.max));
   float err = ULP * max(reduce_value.x, max(reduce_value.y, reduce_value.z));

   out_aabb.min = input_aabb.min - vec3(err);
   out_aabb.max = input_aabb.max + vec3(err);

   return out_aabb;
}

void
aabb_extend(inout vk_aabb v1, vk_aabb v2)
{
   v1.min = min(v1.min, v2.min);
   v1.max = max(v1.max, v2.max);
}

vec3
aabb_size(vk_aabb input_aabb)
{
   return input_aabb.max - input_aabb.min;
}

/* Determine the node_type based on the types of its children.
 * If the children are all leaves of the same type, this internal node is a
 * fat leaf; otherwise, it's a mixed node.
 */
uint8_t
determine_internal_node_type(uint32_t children[6], uint child_count)
{
   if (child_count == 0)
      return uint8_t(ANV_NODE_TYPE_INVALID);

   uint32_t type_of_first_child = ir_id_to_type(children[0]);
   for (uint32_t i = 1; i < child_count; ++i) {
      uint32_t type = ir_id_to_type(children[i]);
      if (type != type_of_first_child) {
         return uint8_t(ANV_NODE_TYPE_MIXED);
      }
   }

   /* All children have the same type. Now check what type that is. */
   switch (type_of_first_child) {
   case vk_ir_node_triangle:
      return uint8_t(ANV_NODE_TYPE_QUAD);
   case vk_ir_node_aabb:
      return uint8_t(ANV_NODE_TYPE_PROCEDURAL);
   case vk_ir_node_instance:
      return uint8_t(ANV_NODE_TYPE_INSTANCE);
   case vk_ir_node_internal:
      return uint8_t(ANV_NODE_TYPE_MIXED);
   default:
      return uint8_t(ANV_NODE_TYPE_INVALID);
   }
}

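/* Quantize child bounds to 8-bit grid coordinates relative to 'base':
 * q = v * 2^(8 - exp), flooring mins and ceiling maxes so the quantized box
 * stays conservative.
 */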
vk_aabb
quantize_bounds(vk_aabb aabb, vec3 base, i8vec3 exp)
{
   vk_aabb quant_aabb;
   vec3 lower = aabb.min - base;
   vec3 upper = aabb.max - base;

   vec3 qlower = ldexp(lower, -exp + 8);
   vec3 qupper = ldexp(upper, -exp + 8);

   qlower = min(max(floor(qlower), vec3(0.0)), vec3(255.0));
   qupper = min(max(ceil(qupper), vec3(0.0)), vec3(255.0));

   quant_aabb.min = qlower;
   quant_aabb.max = qupper;

   return quant_aabb;
}

void
encode_internal_node(uint32_t children[6], uint32_t child_block_offset_from_internal_node, uint child_count,
                     vec3 min_offset, vec3 max_offset, uint32_t bvh_block_offset)
{
   REF(anv_internal_node) dst_node =
      REF(anv_internal_node)(OFFSET(args.output_bvh, ANV_RT_BLOCK_SIZE * bvh_block_offset));

   DEREF(dst_node).child_block_offset = child_block_offset_from_internal_node;

   vk_aabb box;
   box.min = min_offset;
   box.max = max_offset;

   vk_aabb conservative_child_aabb = conservative_aabb(box);
   DEREF(dst_node).lower[0] = conservative_child_aabb.min.x;
   DEREF(dst_node).lower[1] = conservative_child_aabb.min.y;
   DEREF(dst_node).lower[2] = conservative_child_aabb.min.z;

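   /* Choose a per-axis power-of-two scale for quantization: bump the exponent
    * when the mantissa of the (slightly inflated) extent would overflow the
    * 255/256 range representable in 8 bits.
    */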
   float up = 1.0 + ULP;
   ivec3 exp;

   vec3 len = aabb_size(conservative_child_aabb) * up;
   vec3 mant = frexp(len, exp);

   exp.x += int((mant.x > (255.0f / 256.0f)));
   exp.y += int((mant.y > (255.0f / 256.0f)));
   exp.z += int((mant.z > (255.0f / 256.0f)));

   i8vec3 exponent_i8 = i8vec3(exp);
   DEREF(dst_node).exp_x = max(int8_t(-128), exponent_i8.x);
   DEREF(dst_node).exp_y = max(int8_t(-128), exponent_i8.y);
   DEREF(dst_node).exp_z = max(int8_t(-128), exponent_i8.z);

   i8vec3 exp_i8 = i8vec3(DEREF(dst_node).exp_x, DEREF(dst_node).exp_y, DEREF(dst_node).exp_z);

   DEREF(dst_node).node_mask = uint8_t(0xff);
   DEREF(dst_node).node_type = determine_internal_node_type(children, child_count);

   for (uint32_t i = 0; i < 6; i++) {
      if (i < child_count) {
         uint32_t type = ir_id_to_type(children[i]);
         /* blockIncr and child_block_offset are how the HW finds children
          * during traversal. If they are not set properly, the GPU could hang.
          */
         DEREF(dst_node).data[i].block_incr_and_start_prim =
            type == vk_ir_node_instance ? uint8_t(2) : uint8_t(1);

         uint32_t offset = ir_id_to_offset(children[i]);

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         child_aabb = conservative_aabb(child_aabb);

         vk_aabb quantize_aabb = quantize_bounds(child_aabb, conservative_child_aabb.min, exp_i8);

         DEREF(dst_node).lower_x[i] = uint8_t(quantize_aabb.min.x);
         DEREF(dst_node).lower_y[i] = uint8_t(quantize_aabb.min.y);
         DEREF(dst_node).lower_z[i] = uint8_t(quantize_aabb.min.z);
         DEREF(dst_node).upper_x[i] = uint8_t(quantize_aabb.max.x);
         DEREF(dst_node).upper_y[i] = uint8_t(quantize_aabb.max.y);
         DEREF(dst_node).upper_z[i] = uint8_t(quantize_aabb.max.z);

         /* For a mixed node, encode the type of each child in the startPrim
          * bits of its child data.
          */
         if (DEREF(dst_node).node_type == uint8_t(ANV_NODE_TYPE_MIXED)) {
            switch (type) {
            case vk_ir_node_triangle:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_QUAD) << 2);
               break;
            case vk_ir_node_aabb:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_PROCEDURAL) << 2);
               break;
            case vk_ir_node_instance:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INSTANCE) << 2);
               break;
            case vk_ir_node_internal:
               DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_MIXED) << 2);
               break;
            }
         }
      } else {
         /* Invalid child nodes: the MSBs of the lower and upper x planes are
          * flipped. In other words:
          * bool valid(int i) const {
          *   return !(lower_x[i] & 0x80) || (upper_x[i] & 0x80);
          * }
          */
         DEREF(dst_node).lower_x[i] = uint8_t(0x80);
         DEREF(dst_node).lower_y[i] = uint8_t(0);
         DEREF(dst_node).lower_z[i] = uint8_t(0);
         DEREF(dst_node).upper_x[i] = uint8_t(0);
         DEREF(dst_node).upper_y[i] = uint8_t(0);
         DEREF(dst_node).upper_z[i] = uint8_t(0);

         /* In case the HW also references blockIncr for invalid children,
          * zero out the data.
          */
         DEREF(dst_node).data[i].block_incr_and_start_prim = uint8_t(0);
         DEREF(dst_node).data[i].block_incr_and_start_prim |= (uint8_t(ANV_NODE_TYPE_INVALID) << 2);
      }
   }
}

void
main()
{
   /* Encode.comp is dispatched through an indirect dispatch with a computed
    * groupCountX, but we can still overdispatch invocations, so we need a
    * guard here.
    *
    * Also, we can't support more than 0xFFFFFFFF internal nodes due to the SW
    * limit we enforce on the indirect workgroup count for signaling.
    */
   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count ||
       DEREF(args.header).ir_internal_node_count > 0xFFFFFFFF)
      return;

   /* Each lane processes one vk_ir_node_internal. The root node sits at the
    * end of the IR BVH, and we let the lane with gl_GlobalInvocationID.x == 0
    * take care of it.
    */
   uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x;

   uint32_t intermediate_leaf_node_size;
   switch (args.geometry_type) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_triangle_node);
      break;
   case VK_GEOMETRY_TYPE_AABBS_KHR:
      intermediate_leaf_node_size = SIZEOF(vk_ir_aabb_node);
      break;
   default: /* instances */
      intermediate_leaf_node_size = SIZEOF(vk_ir_instance_node);
      break;
   }

   uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * intermediate_leaf_node_size;

   REF(vk_ir_box_node) intermediate_internal_nodes =
      REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size);
   REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id);
   vk_ir_box_node src = DEREF(src_node);

   bool is_root_node = gl_GlobalInvocationID.x == 0;

   REF(anv_accel_struct_header) header = REF(anv_accel_struct_header)(args.output_bvh - args.output_bvh_offset);

   if (is_root_node) {
      DEREF(header).instance_flags =
         (args.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR ? ANV_INSTANCE_ALL_AABB : 0) |
         /* These will be removed when processing leaf nodes */
         ANV_INSTANCE_FLAG_FORCE_OPAQUE | ANV_INSTANCE_FLAG_FORCE_NON_OPAQUE;

      /* Indicate where the next children should be encoded. The offset is
       * measured in 64B blocks, starting from output_bvh.
       */
      DEREF(args.header).dst_node_offset = 1;

      DEREF(header).instance_count = 0;
   }

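   /* Producer/consumer loop: spin until the parent invocation publishes this
    * node's output offset, then collapse the binary IR node into an
    * up-to-6-wide HW node, reserve destination blocks, publish the children's
    * offsets, and finally encode this node.
    */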
   for (;;) {
      /* Make changes to the current node's BVH offset value visible. */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      /* Indicate where this internal node should be encoded. The offset is
       * measured in 64B blocks, starting from output_bvh.
       */
      uint32_t bvh_block_offset = is_root_node ? 0 : DEREF(src_node).bvh_offset;

      /* The invocation that processes this node keeps spinning, since its
       * parent hasn't told it its bvh_offset yet.
       */
      if (bvh_block_offset == VK_UNKNOWN_BVH_OFFSET)
         continue;

      if (bvh_block_offset == VK_NULL_BVH_OFFSET)
         break;

      uint32_t found_child_count = 0;
      uint32_t children[6] = {VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE,
                              VK_BVH_INVALID_NODE, VK_BVH_INVALID_NODE};

      /* Initially, this node can have at most two children (they can be
       * internal nodes or leaves).
       */
      for (uint32_t i = 0; i < 2; ++i)
         if (src.children[i] != VK_BVH_INVALID_NODE)
            children[found_child_count++] = src.children[i];

      /* For this node, try to collapse the binary children into up to 6-ary
       * children.
       */
      while (found_child_count < 6) {
         /* On each iteration, find the vk_ir_node_internal child with the
          * largest surface area.
          */
         int32_t collapsed_child_index = -1;
         float largest_surface_area = -INFINITY;

         for (int32_t i = 0; i < found_child_count; ++i) {
            /* If a child is a leaf (not vk_ir_node_internal), there's no need
             * to collapse it.
             */
            if (ir_id_to_type(children[i]) != vk_ir_node_internal)
               continue;

            vk_aabb bounds =
               DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh,
                                           ir_id_to_offset(children[i]))).aabb;

            float surface_area = aabb_surface_area(bounds);
            if (surface_area > largest_surface_area) {
               largest_surface_area = surface_area;
               collapsed_child_index = i;
            }
         }

         if (collapsed_child_index != -1) {
            /* Once we've found a suitable vk_ir_node_internal child, try to
             * connect ourselves to this child's children, i.e. our
             * grandchildren. Grandchildren can be internal nodes or leaves.
             */
            REF(vk_ir_box_node) child_node =
               REF(vk_ir_box_node)OFFSET(args.intermediate_bvh,
                                        ir_id_to_offset(children[collapsed_child_index]));
            uint32_t grandchildren[2] = DEREF(child_node).children;
            uint32_t valid_grandchild_count = 0;

            if (grandchildren[1] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;

            if (grandchildren[0] != VK_BVH_INVALID_NODE)
               ++valid_grandchild_count;
            else
               grandchildren[0] = grandchildren[1];

            /* A grandchild now becomes our direct child, and can possibly be
             * collapsed in a later iteration if found_child_count has not
             * reached 6.
             */
            if (valid_grandchild_count > 1)
               children[found_child_count++] = grandchildren[1];

            if (valid_grandchild_count > 0)
               children[collapsed_child_index] = grandchildren[0];
            else {
               /* This child doesn't have any valid children, so we stop
                * considering it a child of ours. This is possible depending
                * on how and when the lbvh/ploc algorithm marks a node as
                * VK_BVH_INVALID_NODE.
                */
               found_child_count--;
               children[collapsed_child_index] = children[found_child_count];
            }

            /* We're done collapsing this node, so mark the collapsed internal
             * node as NULL; whichever lane would have processed it will
             * return instead.
             */
            DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET;
         } else
            break;
      }

      /* Count the number of blocks the children need: each instance child
       * contributes 2 blocks to dst_node_offset, every other child 1.
       */
      uint32_t num_blocks_to_add = 0;
      for (uint32_t i = 0; i < found_child_count; ++i) {
         uint32_t type = ir_id_to_type(children[i]);
         num_blocks_to_add += (type == vk_ir_node_instance) ? 2 : 1;
      }

      /* Used for finding where to encode the children. This also updates
       * dst_node_offset so other invocations know where to start encoding.
       */
      uint32_t child_block_offset_from_output_bvh = atomicAdd(DEREF(args.header).dst_node_offset, num_blocks_to_add);

      /* This is one of the pieces of information anv_internal_node needs. */
      uint32_t child_block_offset_from_internal_node = child_block_offset_from_output_bvh - bvh_block_offset;

      vec3 min_offset = vec3(INFINITY);
      vec3 max_offset = vec3(-INFINITY);
      for (uint32_t i = 0; i < found_child_count; ++i) {
         /* Retrieve the type and location of the child from the IR BVH */
         uint32_t type = ir_id_to_type(children[i]);
         uint32_t offset = ir_id_to_offset(children[i]);

         if (type == vk_ir_node_internal) {
            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
            DEREF(child_node).bvh_offset = child_block_offset_from_output_bvh;
         } else {
            encode_leaf_node(type, args.intermediate_bvh + offset,
                             args.output_bvh + ANV_RT_BLOCK_SIZE * child_block_offset_from_output_bvh,
                             header);
         }

         vk_aabb child_aabb =
            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;

         min_offset = min(min_offset, child_aabb.min);
         max_offset = max(max_offset, child_aabb.max);

         child_block_offset_from_output_bvh += (type == vk_ir_node_instance) ? 2 : 1;
      }

      /* Make the changes to the children's BVH offset values available to the
       * other invocations.
       */
      memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                    gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

      encode_internal_node(children, child_block_offset_from_internal_node,
                           found_child_count, min_offset, max_offset, bvh_block_offset);

      break;
   }

   if (is_root_node) {
      DEREF(header).aabb = src.base.aabb;
      DEREF(header).rootNodeOffset = args.output_bvh_offset;
   }
}