//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

// caution: rec.local_parent_index__numItems needs to have its high 16 bits filled in afterwards
BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
{
    BuildRecordLocalMortonFlattener rec;
    rec.leftChild = srcRec.leftChild;
    rec.rightChild = srcRec.rightChild;
    rec.rangeStart = srcRec.range.start;
    rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
    return rec;
}

GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
{
    BuildRecordLocalMortonFlattener rec;
    rec.leftChild = boxless.binary_hierarchy_index;
    rec.rightChild = boxless.childOffset_type;
    rec.rangeStart = boxless.backPointer;
    rec.local_parent_index__numItems = 0;
    return rec;
}

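// Editor's note: the two helpers above fold a binary-hierarchy node and a flattened boxless node into
// the same BuildRecordLocalMortonFlattener layout.  local_parent_index__numItems packs two values into
// one uint: the subtree's item count in the low 16 bits and the local (treelet-relative) parent index
// in the high 16 bits.  The illustrative helpers below are NOT used elsewhere in this file and the
// example_* names are invented here; they only spell out the packing that TranslateToLocalRecord leaves
// half-initialized (numItems set, parent index filled in later by SUBGROUP_create_boxless_node_phase1).
GRL_INLINE uint example_pack_parent_and_numItems(uint localParentIndex, uint numItems)
{
    // numItems occupies bits [15:0], the local parent index bits [31:16];
    // e.g. parent 3 with 5 items -> 0x00030005
    return (localParentIndex << 16) | (numItems & 0xFFFF);
}
GRL_INLINE uint example_numItems(uint local_parent_index__numItems)    { return local_parent_index__numItems & 0xFFFF; }
GRL_INLINE uint example_localParent(uint local_parent_index__numItems) { return local_parent_index__numItems >> 16; }
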
GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform BuildRecordLocalMortonFlattener currentRecord,
    uniform uint currQnodeLocalId, // local index of the flattened qnode; don't mix this up with the nodeIndex stored in the morton build record
    uniform local uint* local_numRecords,
    uniform uint tictoc,
    uniform uint* sg_bu_startpoint_arr,
    uniform uint* sg_bu_startpoint_cnt,
    uniform uint parentOfRoot,
    uniform bool processRoot,
    uniform UPerNodeData* nodeData)
{
    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordLocalMortonFlattener sg_children;
    sg_children.local_parent_index__numItems = 0;

    uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
    if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;

    sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));

    /* fill QBVH6 node with up to 6 children */
    while (numChildren < BVH_NODE_N6)
    {
        // we don't have to mask with "local_parent_index__numItems & 0xFFFF" because the local_parent_index part is still 0 at this point
        uint childNumItems = sg_children.local_parent_index__numItems;
        varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
        if (sub_group_all(sg_is_leaf)) { break; }

        uniform uint bestItems = sub_group_reduce_max_N6(childNumItems);
        uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
        varying uint leftOfBest = sg_children.leftChild; // value only meaningful for (lane == bestChild), not valid for other lanes
        uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);

        varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;

        if (lane == numChildren || lane == bestChild)
        {
            sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
        }

        numChildren++;
    }

    uniform uint global_offset;
    uniform uint child_node_index;

    bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
    uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));

    if (lane <= numChildren) {
        uint writeIDX = 0;

        if (lane == numChildren)
        {
            /* create nodes in the local structure, used later by the bottom-up pass to create nodes in the actual bvh */
            MortonFlattenedBoxlessNode flattened_node;
            uint parentIDX;

            if (processRoot)
            {
                *local_numRecords = numChildren + 1;
                child_node_index = 1;
                writeIDX = 0;
                flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
                flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
                parentIDX = parentOfRoot;
            }
            else
            {
                uint shift = (16 * tictoc);
                uint mask = 0xFFFF;
                uint atomicAddVal = numChildren << shift;
                child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
                sub_group_barrier(0);
                writeIDX = currQnodeLocalId;
                parentIDX = currentRecord.local_parent_index__numItems >> 16;
                flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
                sub_group_barrier(0);
                child_node_index = (child_node_index >> 16) + (child_node_index & mask);
                flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
            }

#if MORTON_VERBOSE_LOG
            printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
#endif
            flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
            sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
        }

        child_node_index = sub_group_broadcast(child_node_index, numChildren);

        if (lane != numChildren)
        {
            writeIDX = child_node_index + lane;
            sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
        }

        nodeData[writeIDX].buildRecord = sg_children;
    }

    if (numFatleafChildren == numChildren) {
        uint arridx = *sg_bu_startpoint_cnt;
        // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
        set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
        *sg_bu_startpoint_cnt = arridx + 1;
    }
}

// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
//    of this kernel with different WG sizes.  Many of the records produced generate only 1 or 2 subtrees, so 8 SGs is
//    probably often wasted.
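// Editor's note: SUBGROUP_create_boxless_node_phase1 above and phase1_process_fatleaf below emit
// back-pointers with the same bit layout: the parent's node index in bits [31:6], the child count in
// bits [5:3], and a 3-bit field in bits [2:0] (here the number of fat-leaf children).  The statement
// that the bottom-up refit uses those low bits as a "children processed" counter is an inference from
// SUBGROUP_refit_bottom_up_local's usage elsewhere, not spelled out in this file.  The example_*
// decoders below are illustrative only and are not referenced by the build kernels.
GRL_INLINE uint example_backpointer_parent(uint backPointer)      { return backPointer >> 6; }
GRL_INLINE uint example_backpointer_numChildren(uint backPointer) { return (backPointer >> 3) & 0x7; }
GRL_INLINE uint example_backpointer_low3(uint backPointer)        { return backPointer & 0x7; }
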
GRL_INLINE void phase1_process_fatleaf(
    uint globalBaseForInternalNodes,        // for the root node this is indexOfRoot
    uint globalParent,                      // for the root this should be parentOfRoot
    bool isInstancePrimLeafType,            //
    uint leafPrimType,                      //
    uint leafStride,                        //
    global struct QBVHNodeN* nodeData,      // per group
    uint nodeDataStart,                     //
    struct AABB* primref,                   //
    BackPointers* backPointers,             //
    global struct MortonCodePrimitive* mc,  //
    uint nodesToLeafsGap,                   //
    local union UPerNodeData* perNodeData,  //
    bool processRoot,                       //
    short localNodeId,                      //
    BuildRecordLocalMortonFlattener fatleafRecord, // per node
    uint primID )                           //
{
    uint lane = get_sub_group_local_id();
    uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
    uniform uint mcID = fatleafRecord.rangeStart;
    uint pseudolane = lane < numChildren ? lane : 0;
    varying struct AABB sg_bounds = primref[primID];

    uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
    uint globalNodeId = globalBaseForInternalNodes + localNodeId;
    uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;

    uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;

    {
        /* For all primitives in a fat leaf we store a back
         * pointer. This way we can modify the fat leaf node at leaf construction time. */
        uint back_pointer = globalNodeId + nodeDataStart;
        /* Store back pointer and primID inside the morton code array to
         * be used later by leaf creation. */
        mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
    }

    struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
    reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );

    uint8_t instMask;
    if (isInstancePrimLeafType)
    {
        instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
        subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
        instMask = sub_group_reduce_or_N6(instMask);
    }
    else
    {
        instMask = 0xFF;
        subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
    }

    reduce_bounds.lower.w = as_float((uint)instMask);
    uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
    local uint* boxUint = (local uint*)(perNodeData + localNodeId);
    if (get_sub_group_size() == 8 || lane < 8)
    {
        boxUint[lane] = reduce_bounds_lane;
        uint globalParentIdx;
        if (processRoot) {
            // for the root, treeletRootGlobalIndex is the index of the root's parent in global space
            globalParentIdx = globalParent;
        }
        else {
            // for non-root nodes, local_parent_idx is in local space
            globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
        }
        if (lane == 0) {
            *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
        }
    }
}

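// Editor's summary of perform_phase1 below (a reading of the code, not original documentation):
//   1. top-down: starting from rootRecord, the subgroups repeatedly collapse binary-hierarchy nodes
//      into boxless QBVH6 nodes kept in SLM (SUBGROUP_create_boxless_node_phase1), until every open
//      record is a fat leaf (<= BVH_NODE_N6 primitives);
//   2. fat leaves: each subgroup emits the QBVH6 leaf nodes it collected, stores their bounds in SLM
//      and patches the morton-code array with back-pointer/primID pairs (phase1_process_fatleaf);
//   3. bottom-up: the per-node SLM boxes are propagated up to the treelet root
//      (SUBGROUP_refit_bottom_up_local), and the node counter is bumped in the single-treelet case.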
GRL_INLINE void perform_phase1(global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem,
    local union UPerNodeData* perNodeData,
    local uint* local_records_head,
    local uint* local_globalOffsetForNodes,
    BuildRecordLocalMortonFlattener rootRecord,
    uint treeletRootGlobalIndex,
    uint parentOfRootIndex,
    const uint leafPrimType,
    bool isInstancePrimLeafType)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    varying ushort lane = get_sub_group_local_id();

    // array that will keep 2x8 short indices
    varying uint sg_fatleaf_array = 0x0;
    uniform uint8_t sg_fatleaf_cnt = 0;
    /* terminate when all subtrees are leaves */

    uint subgroupId = get_sub_group_id();
    uint ID = subgroupId;

    uint sg_bu_startpoints = 0;
    uniform uint sg_bu_startpoints_cnt = 0;
    const uint shift_mask = globals->shift_mask;

    const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
    global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);

    uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
    uint leafStart = *pLeafStart;
    uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
    uint nodesToLeafsGap = leafStart - nodeDataStart;

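    // Editor's note: nodesToLeafsGap lets phase1_process_fatleaf compute each fat leaf's child offset
    // in one line:
    //   children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId
    //                   = (leafStart + mcID * leafStride) - (nodeDataStart + globalNodeId),
    // i.e. the distance from this QBVH node to its first leaf, in InternalNode-sized (64-byte) slots;
    // quad leaves advance by 1 slot per primitive, instance leaves by
    // sizeof(HwInstanceLeaf)/sizeof(InternalNode) slots.  The unit interpretation is inferred from
    // leafStride above; the subtraction itself is verbatim from the code.
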
    if (ID == 0)
    {
        BuildRecordLocalMortonFlattener current = rootRecord;

        if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
        {
            *local_records_head = 1;
#if MORTON_DEBUG_CHECKS
            if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
            BuildRecordLocalMortonFlattener fatleafRecord = current;
            uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
            uint pseudolane = lane < numChildren ? lane : 0;
            uniform const uint mcID = fatleafRecord.rangeStart;
            varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);

            phase1_process_fatleaf(
                treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
                nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
                true, 0, fatleafRecord, primID);
        }
        else
        {
#if MORTON_VERBOSE_LOG
            if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); }
#endif
            //printf("local_records_head = %d\n", *local_records_head);
            SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
            *local_globalOffsetForNodes = treeletRootGlobalIndex;
        }

        ID += get_num_sub_groups();
    }

    uniform uint priv_records_tail = 1;

    /* wait for all work items to have updated the local_records array */
    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    uniform uint priv_records_head = *local_records_head & 0xFFFF;
    treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from the first subgroup
    uniform uint priv_records_tail_prev = priv_records_tail;
    uniform uint other_records_head = priv_records_head;

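    // Editor's note: local_records_head is used as two 16-bit counters packed into one SLM uint.
    // While the loop below processes the records of one level, producers in
    // SUBGROUP_create_boxless_node_phase1 atomically add their child counts into the half selected by
    // ticToc; after the barrier the freshly written half is folded into the running head count and
    // ticToc flips, so the next level consumes from the other half.  The packing matches the shifts
    // and masks used below; the presumed motivation (avoiding a separate "old count" variable and an
    // extra barrier per level) is an inference.
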
    uint ticToc = 1;

    if (priv_records_head == priv_records_tail)
    {
        return;
    }
    else
    {
        do
        {
            for (; ID < priv_records_head; ID += get_num_sub_groups())
            {
                BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);

                if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
                {
                    set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
#if MORTON_VERBOSE_LOG
                    if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
#endif
#if MORTON_DEBUG_CHECKS
                    if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
                }
                else
                {
                    SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
                }
            }

            priv_records_tail = priv_records_head;
            /* wait for all work items to have updated the local_records array */
            work_group_barrier(CLK_LOCAL_MEM_FENCE);
            {
                uint records_as_in_mem = *local_records_head;
                priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
                uint other_records_head_temp = priv_records_head;
                priv_records_head += other_records_head;
                other_records_head = other_records_head_temp;
                ticToc = ticToc ^ 1;
#if MORTON_VERBOSE_LOG
                if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
#endif
            }
        } while (priv_records_tail != priv_records_head); // leave the loop once the tail has reached the head
    }

    bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
    bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
    uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;

    uniform uint globalBaseForInternalNodes = 0;

    // we distinguish multi-treelet from single-treelet builds here by looking at our treeletRootGlobalIndex:
    // if the treelet's root is the whole tree's root (treeletRootGlobalIndex == 0) then we are the only treelet,
    // so there's no need to synchronize node allocation across treelets with atomics.
    if (atomicNodeAllocationProduce)
    {
        *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
    }

    // because the root is allocated elsewhere, the first node placed in global memory is the node with local index 1.
    // mapping local to global:
    //    local space                 global space
    //    [0] - treelet root          [treeletRootGlobalIndex]
    //    ... possibly very long distance ...
    //    [1] - first non root        [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
    //    [2] - second non root       [globalBaseForInternalNodes + 2]
    //    ...
    //    [numToAllocate] - last node [globalBaseForInternalNodes + numToAllocate]
    if (atomicNodeAllocation)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);
        globalBaseForInternalNodes = *local_globalOffsetForNodes - (nodeDataStart + 1);
    }

#if MORTON_VERBOSE_LOG
    if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
#endif

    if (sg_fatleaf_cnt)
    {
        short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
        //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
        //if(local_startpoints_cnt > 1) return;
        BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;

        varying uint primID;
        {
            uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
            uint pseudolane = lane < numChildren ? lane : 0;
            uniform const uint mcID = fatleafRecord.rangeStart;
            primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
        }

        // process fatleafs and store their boxes to SLM,
        // also record startpoints for the bottom-up pass
        //uint fatleaf_cnt = *local_startpoints_cnt;
        while (sg_fatleaf_cnt-- > 1)
        {
            short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
            BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
            varying uint nextPrimId;

            {
                uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
                uint pseudolane = lane < numChildren ? lane : 0;
                uniform const uint mcID = nextfatleafRecord.rangeStart;
                nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
            }

            phase1_process_fatleaf(
                globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
                nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
                false, localNodeId, fatleafRecord, primID);

            fatleafRecord = nextfatleafRecord;
            localNodeId = nextLocalNodeId;
            primID = nextPrimId;
        }

        phase1_process_fatleaf(
            globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
            nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
            false, localNodeId, fatleafRecord, primID);
    }

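    // Editor's note: the fat-leaf loop above is software-pipelined: the next fat leaf's build record
    // and primIDs are fetched from SLM and the morton-code array before the current one is emitted
    // through phase1_process_fatleaf, presumably to overlap those loads with the node writes.  The
    // "why" is an inference; the loop structure itself is as written.
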
#if 0
    // put collected bottom-up startpoints into a wg-shared array to later distribute the work evenly across the groups
    {
        ushort myStartpointWriteSite = 0;

        if (lane == 0)
        {
            myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
        }
        myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);

        unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
    }
#endif

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    // distribute bottom-up startpoints
#if 0
    {
        short sp_count_to_divide = (*local_startpoints_cnt);

        // calculate the chunk for each sg
        sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
        uint sg_bu_startpoints_cnt_remainder = sp_count_to_divide % get_num_sub_groups();

        uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
        if (get_sub_group_id() < sg_bu_startpoints_cnt_remainder) {
            // if the sg idx is < sg_bu_startpoints_cnt_remainder then this sg gets one extra element from the remainder,
            // and all sgs before it also have one extra
            myReadSite += get_sub_group_id();
            sg_bu_startpoints_cnt++;
        }
        else
        {
            // all remainder elements are consumed by previous sgs
            myReadSite += sg_bu_startpoints_cnt_remainder;
        }

        pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
    }
#endif

    SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);

    if (singleTreeletBumpBVHnodeCnt)
    {
        bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
    }
}

GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
{
    if (get_sub_group_id() == 0 )
    {
        global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

        // set the required fields to mark that the blas is empty
        uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
        qnode->type = leafPrimType;
        qnode->instMask = 0;
        qnode->qbounds.lower_x[k] = 0x80;
        qnode->qbounds.upper_x[k] = 0;

        *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
    }
}

/*

  POSTSORT PHASE1:
  Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
  1. parallel_build_phase1_Indirect_SG          - record[0] is set to the subtree tip
  2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)

*/

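// Editor's note on dispatch (inferred, not stated in this file): parallel_build_phase1_Indirect_SG is
// launched with one work group per phase0 build record; each group reads records[get_group_id(0)] and
// builds that subtree as a treelet, leaving the treelet roots to be stitched by phase2.
// parallel_build_phase1_Indirect_global_root is the single-treelet path that starts directly from
// globals->binary_hierarchy_root, so no phase2 pass is needed afterwards.
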
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_SG( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    const uint leafPrimType = globals->leafPrimType;

    // special case for an empty blas
    if(globals->numPrimitives == 0)
    {
        bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
        update_empty_blas(bvh, leafPrimType);
        return;
    }

    local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) - 1];
    local uint local_records_head;
    // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
    local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;

    uint rootIndex = 0;
    uint parentOfRoot = 0;
    BuildRecordLocalMortonFlattener rootBuildRecord;

    /* add the start build record to the local stack */
    if (get_sub_group_id() == 0 )
    {
        global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
        uint recordID = get_group_id(0);
        struct BuildRecordMorton mortonGlobalRecord = records[recordID];

        rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));

        parentOfRoot = mortonGlobalRecord.parent_index;
        rootIndex = mortonGlobalRecord.current_index;

#if MORTON_VERBOSE_LOG
        printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n",
            local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index);
#endif
    }

    if (leafPrimType == NODE_TYPE_INSTANCE)
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes,
            rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
    }
    else
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes,
            rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
    }

}

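// Editor's note: the leafPrimType == NODE_TYPE_INSTANCE split above (and in the kernel below) passes
// isInstancePrimLeafType as a literal true/false rather than a runtime value, presumably so the
// compiler can specialize perform_phase1 per leaf type (instance leaves use a different stride and
// mask handling).  This reading is an inference; the duplicated calls are as in the source.
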
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    const uint leafPrimType = globals->leafPrimType;
    const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;

    bvh->nodeDataCur = nodeDataStart + 1;

    // special case for an empty blas
    if(globals->numPrimitives == 0)
    {
        update_empty_blas(bvh, leafPrimType);
        return;
    }

    local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
    local uint local_records_head;
    local uint local_globalOffsetForNodes;

    BuildRecordLocalMortonFlattener rootBuildRecord;

    if (get_sub_group_id() == 0 )
    {
        struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);

        rootBuildRecord = TranslateToLocalRecord(binaryNode);

        local_globalOffsetForNodes = 0;
    }

    if (leafPrimType == NODE_TYPE_INSTANCE)
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
    }
    else
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);

    }
}

#if 0
GRL_INLINE void
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem,
    uint startID, uint endID,
    local uint* local_numRecords,
    local uint* local_numRecordsOld,
    local struct BuildRecordMorton* local_records
)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);

    /* iterate over all subtrees this workgroup should build */
    for ( uint recordID = startID; recordID < endID; recordID++ )
    {
        /* add the start build record to the local stack */
        if ( get_local_id( 0 ) == 0 )
        {
            local_records[0] = records[recordID];
            *local_numRecords = 1;
            *local_numRecordsOld = 0;
        }
        work_group_barrier( CLK_LOCAL_MEM_FENCE );

        /* terminate when all subtrees are leaves */
        while ( *local_numRecords != *local_numRecordsOld )
        {
            /* remember the old number of build records to detect later
             * whether we are done */
            if ( get_local_id( 0 ) == 0 )
            {
                *local_numRecordsOld = *local_numRecords;
            }
            work_group_barrier( CLK_LOCAL_MEM_FENCE );

            /* all work items in the sub group pick a subtree to build */
            for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
            {
                /* ignore small subtrees */
                if ( local_records[ID].items <= BVH_NODE_N6 )
                    continue;

                /* create QBVH node */
                create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
            }

            /* wait for all work items to have updated the local_records array */
            work_group_barrier( CLK_LOCAL_MEM_FENCE );
        }

        const uint shift_mask = globals->shift_mask;
        const uint leafPrimType = globals->leafPrimType;
        const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
        BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
        global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );

        /* create all fat leaf nodes and initiate refit */
        for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
        {
            struct BuildRecordMorton current = local_records[ID];
            const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );

            global struct QBVHNodeN* qnode = nodeData + current.current_index;

            /* get bounds of all children of the fat leaf node */
            struct AABB bounds[BVH_NODE_N6];
            for ( uint i = 0; i < current.items; i++ )
            {
                /* get primID and bounds of the primitive */
                const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
                bounds[i] = primref[primID];

                /* For all primitives in a fat leaf we store a back
                 * pointer. This way we can modify the fat leaf node at leaf construction time. */
                const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;

                /* Store back pointer and primID inside the morton code array to
                 * be used later by leaf creation. */
                mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
            }

            /* update fat leaf node */
            QBVHNodeN_setType( qnode, leafPrimType );
            global void* offset;
            if ( leafPrimType != BVH_INSTANCE_NODE )
            {
                offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
                QBVHNodeN_setChildIncr1( qnode );
            }
            else
            {
                offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
                QBVHNodeN_setChildIncr2( qnode );
            }
            QBVH6Node_set_offset( qnode, offset );
            QBVHNodeN_setBounds( qnode, bounds, current.items );

            /* set back pointers for fat leaf nodes */
            *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);

            /* bottom up refit */
            refit_bottom_up( qnode, bvh, bounds, current.items );
        }
    }
}

/*

  This phase takes the build records calculated in phase0 as input and
  finishes the BVH construction for all of these subtrees.

*/
__attribute__((reqd_work_group_size(8, 1, 1)))
void kernel
old_parallel_build_phase1(global struct Globals *globals,
    global struct MortonCodePrimitive *mc,
    global struct AABB *primref,
    global struct BinaryMortonCodeHierarchy *bnodes,
    global char *bvh_mem)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);

    /* a queue of build records */
    local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
    local uint local_numRecords;
    local uint local_numRecordsOld;

    /* construct the range of build records that each sub group will process */
    const uint numRecords = globals->numBuildRecords;
    const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
    const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);

    DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );

}

__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
void kernel
old_parallel_build_phase1_Indirect( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem )
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);

    /* a queue of build records */
    local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
    local uint local_numRecords;
    local uint local_numRecordsOld;

    /* construct the range of build records that each sub group will process */
    const uint numRecords = globals->numBuildRecords;
    uint startID = get_group_id( 0 );
    uint endID = startID + 1;

    DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );

}
#endif