//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

// Caution: only the low 16 bits (numItems) of rec.local_parent_index__numItems are set here;
// the caller must fill the high 16 bits (the local parent index) afterwards.
BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
{
    BuildRecordLocalMortonFlattener rec;
    rec.leftChild  = srcRec.leftChild;
    rec.rightChild = srcRec.rightChild;
    rec.rangeStart = srcRec.range.start;
    rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
    return rec;
}

GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
{
    BuildRecordLocalMortonFlattener rec;
    rec.leftChild = boxless.binary_hierarchy_index;
    rec.rightChild = boxless.childOffset_type;
    rec.rangeStart = boxless.backPointer;
    rec.local_parent_index__numItems = 0;
    return rec;
}
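// The function above is a raw field-by-field reinterpretation rather than a semantic conversion:
// the flattened boxless node is stored through the buildRecord member of the per-node UPerNodeData
// union in SLM, so it can be written via nodeData[writeIDX].buildRecord alongside the child records
// (binary_hierarchy_index -> leftChild, childOffset_type -> rightChild, backPointer -> rangeStart).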

GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform BuildRecordLocalMortonFlattener currentRecord,
    uniform uint  currQnodeLocalId, // local index of the flattened qnode; don't confuse it with the nodeIndex stored in the morton build record
    uniform local uint* local_numRecords,
    uniform uint tictoc,
    uniform uint* sg_bu_startpoint_arr,
    uniform uint* sg_bu_startpoint_cnt,
    uniform uint parentOfRoot,
    uniform bool processRoot,
    uniform UPerNodeData* nodeData)
{
    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordLocalMortonFlattener sg_children;
    sg_children.local_parent_index__numItems = 0;

    uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
    if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;

    sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));

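    // Greedy expansion: starting from the two children of the binary node, each iteration picks the
    // child with the most primitives and replaces it with its two children from the binary hierarchy
    // (lane bestChild keeps the left half, lane numChildren takes the right half), until the node has
    // BVH_NODE_N6 children or every remaining child is small enough to become a leaf.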
    /* fill QBVH6 node with up to 6 children */
    while (numChildren < BVH_NODE_N6)
    {
        // no need to mask with 0xFFFF here: the local_parent_index part (high 16 bits) is still 0 at this point
        uint childNumItems = sg_children.local_parent_index__numItems;
        varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
        if (sub_group_all(sg_is_leaf)) { break; }

        uniform uint   bestItems = sub_group_reduce_max_N6(childNumItems);
        uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
        varying uint   leftOfBest = sg_children.leftChild; // value only valid for (lane == bestChild), not for other lanes
        uniform uint   rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);

        varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;

        if (lane == numChildren || lane == bestChild)
        {
            sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
        }

        numChildren++;
    }

    uniform uint global_offset;
    uniform uint child_node_index;

    bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
    uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));

    if (lane <= numChildren) {
        uint writeIDX = 0;

        if (lane == numChildren)
        {
            /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */
            MortonFlattenedBoxlessNode flattened_node;
            uint parentIDX;

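            // Two allocation paths for the children's record slots:
            //  - root (processRoot): this is the first flattened node, so *local_numRecords is seeded
            //    directly; slot 0 holds the root itself and its children go to slots 1..numChildren.
            //  - non-root: numChildren is atomically added into the 16-bit half of *local_numRecords
            //    selected by tictoc, so records produced in this wave are counted separately from the
            //    previous wave's; the children's base slot is the sum of both halves (decoded below).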
            if (processRoot)
            {
                *local_numRecords = numChildren + 1;
                child_node_index = 1;
                writeIDX = 0;
                flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
                flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
                parentIDX = parentOfRoot;
            }
            else
            {
                uint shift = (16 * tictoc);
                uint mask = 0xFFFF;
                uint atomicAddVal = numChildren << shift;
                child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
                sub_group_barrier(0);
                writeIDX = currQnodeLocalId;
                parentIDX = currentRecord.local_parent_index__numItems >> 16;
                flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
                sub_group_barrier(0);
                child_node_index = (child_node_index >> 16) + (child_node_index & mask);
                flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
            }

#if MORTON_VERBOSE_LOG
            printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
#endif
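            // backPointer layout matches the global BVH backpointers:
            // bits [31:6] parent index, bits [5:3] child count, bits [2:0] fatleaf-child count.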
            flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
            sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
        }

        child_node_index = sub_group_broadcast(child_node_index, numChildren);

        if (lane != numChildren)
        {
            writeIDX = child_node_index + lane;
            sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
        }

        nodeData[writeIDX].buildRecord = sg_children;
    }

    // if all children are fatleafs, this node becomes a startpoint for the bottom-up refit
    if (numFatleafChildren == numChildren) {
        uint arridx = *sg_bu_startpoint_cnt;
        // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
        set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
        *sg_bu_startpoint_cnt = arridx + 1;
    }
}

// TODO_OPT:  Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
//    of this kernel with different WG sizes.  Many of the records produced generate only 1 or 2 subtrees, so 8 SGs are
//    probably often wasted.
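
// phase1_process_fatleaf: flattens one fatleaf record. It patches the morton-code entries of the
// fatleaf's primitives with a back pointer to the node, fills the QBVH6 node's child fields,
// reduces the child AABBs, stores the reduced box (plus instance mask) into SLM for the bottom-up
// refit, and writes the node's backpointer. Runs per subgroup, one lane per child primitive.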
GRL_INLINE void phase1_process_fatleaf(
    uint   globalBaseForInternalNodes,    // for root node this is indexOfRoot
    uint   globalParent,                  // for root this should be parentOfRoot
    bool   isInstancePrimLeafType,
    uint   leafPrimType,
    uint   leafStride,
    global struct QBVHNodeN* nodeData,    // per group
    uint nodeDataStart,
    struct AABB* primref,
    BackPointers* backPointers,
    global struct MortonCodePrimitive* mc,
    uint nodesToLeafsGap,
    local union UPerNodeData* perNodeData,
    bool processRoot,
    short localNodeId,
    BuildRecordLocalMortonFlattener fatleafRecord,  // per node
    uint primID)
{
    uint lane = get_sub_group_local_id();
    uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
    uniform uint mcID = fatleafRecord.rangeStart;
    uint pseudolane = lane < numChildren ? lane : 0;
    varying struct AABB sg_bounds = primref[primID];

    uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
    uint globalNodeId = globalBaseForInternalNodes + localNodeId;
    uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;

    uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;
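    // children_offset is the distance, in internal-node-sized (64-byte) blocks, from this QBVH node
    // to its first leaf: the node sits at block (nodeDataStart + globalNodeId) and the leaves of
    // morton range mcID start at block (leafStart + mcID * leafStride), so their difference reduces
    // to the expression above.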

    {
        /* For all primitives in a fat leaf we store a back
         * pointer. This way we can modify the fat leaf node at leaf construction time. */
        uint back_pointer = globalNodeId + nodeDataStart;
        /* Store back pointer and primID inside morton code array to
         * be later used by leaf creation. */
        mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
    }

    struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
    reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );

    uint8_t instMask;
    if (isInstancePrimLeafType)
    {
        instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
        subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
        instMask = sub_group_reduce_or_N6(instMask);
    }
    else
    {
        instMask = 0xFF;
        subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
    }

    reduce_bounds.lower.w = as_float((uint)instMask);
    uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
    local uint* boxUint = (local uint*)(perNodeData + localNodeId);
    if (get_sub_group_size() == 8 || lane < 8)
    {
        boxUint[lane] = reduce_bounds_lane;
        uint globalParentIdx;
        if (processRoot) {
            // for the root, globalParent is already the parent's index in global space
            globalParentIdx = globalParent;
        }
        else {
            // for non-root nodes, local_parent_idx is in local space
            globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
        }
        if (lane == 0) {
            *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
        }
    }
}

GRL_INLINE void perform_phase1(global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem,
    local union UPerNodeData* perNodeData,
    local uint* local_records_head,
    local uint* local_globalOffsetForNodes,
    BuildRecordLocalMortonFlattener rootRecord,
    uint treeletRootGlobalIndex,
    uint parentOfRootIndex,
    const uint leafPrimType,
    bool isInstancePrimLeafType)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    varying ushort lane = get_sub_group_local_id();

    // per-lane array keeping fatleaf node indices: 2 shorts packed in each lane's uint (2 x subgroup-size entries total)
    varying uint    sg_fatleaf_array = 0x0;
    uniform uint8_t sg_fatleaf_cnt = 0;
    /* terminate when all subtrees are leaves */

    uint subgroupId = get_sub_group_id();
    uint ID = subgroupId;

    uint sg_bu_startpoints = 0;
    uniform uint sg_bu_startpoints_cnt = 0;
    const uint shift_mask = globals->shift_mask;

    const uint nodeDataStart  = BVH_ROOT_NODE_OFFSET / 64;
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
    global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);

    uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
    uint  leafStart = *pLeafStart;
    uint  leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
    uint  nodesToLeafsGap = leafStart - nodeDataStart;
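    // All of the above are expressed in 64-byte blocks: nodeDataStart is where internal nodes begin,
    // leafStart is where leaves begin, leafStride is the leaf size in blocks (1 when the leaf type is
    // not instances, sizeof(HwInstanceLeaf)/sizeof(InternalNode) otherwise), and nodesToLeafsGap is
    // the block distance between the node region and the leaf region.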

    if (ID == 0)
    {
        BuildRecordLocalMortonFlattener current = rootRecord;

        if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
        {
            *local_records_head = 1;
#if MORTON_DEBUG_CHECKS
            if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
            BuildRecordLocalMortonFlattener fatleafRecord = current;
            uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
            uint pseudolane = lane < numChildren ? lane : 0;
            uniform const uint mcID = fatleafRecord.rangeStart;
            varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);

            phase1_process_fatleaf(
                treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
                nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
                true, 0, fatleafRecord, primID);
        }
        else
        {
#if MORTON_VERBOSE_LOG
            if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), treeletRootGlobalIndex); }
#endif
            //printf("local_records_head = %d\n", *local_records_head);
            SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
            *local_globalOffsetForNodes = treeletRootGlobalIndex;
        }

        ID += get_num_sub_groups();
    }

    uniform uint priv_records_tail = 1;

    /* wait for all work items to have updated local_records array */
    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    uniform uint priv_records_head = *local_records_head & 0xFFFF;
    treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 0
    uniform uint priv_records_tail_prev = priv_records_tail;
    uniform uint other_records_head = priv_records_head;

    uint ticToc = 1;
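
    // Breadth-first flattening: each wave processes the records in [priv_records_tail, priv_records_head).
    // Records created during a wave are atomically counted in the 16-bit half of *local_records_head
    // selected by ticToc; after the barrier both halves are combined into the new head, ticToc flips,
    // and the loop ends once a wave produces no new records (the head stops moving).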

    if (priv_records_head == priv_records_tail)
    {
        return;
    }
    else
    {
        do
        {
            for (; ID < priv_records_head; ID += get_num_sub_groups())
            {
                BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);

                if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
                {
                    set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
#if MORTON_VERBOSE_LOG
                    if (lane == 0) printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
#endif
#if MORTON_DEBUG_CHECKS
                    if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
                }
                else
                {
                    SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
                }
            }

            priv_records_tail = priv_records_head;
            /* wait for all work items to have updated local_records array */
            work_group_barrier(CLK_LOCAL_MEM_FENCE);
            {
                uint records_as_in_mem = *local_records_head;
                priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
                uint other_records_head_temp = priv_records_head;
                priv_records_head += other_records_head;
                other_records_head = other_records_head_temp;
                ticToc = ticToc ^ 1;
#if MORTON_VERBOSE_LOG
                if (get_local_id(0) == 0) printf("wg %d, sg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
#endif
            }
        } while (priv_records_tail != priv_records_head); // exit the loop once the tail has reached the head
    }

    bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
    bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
    uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;

    uniform uint globalBaseForInternalNodes = 0;

    // We distinguish the multi-treelet case from the single-treelet case by looking at treeletRootGlobalIndex:
    // if this treelet's root is the whole tree's root (treeletRootGlobalIndex == 0) we are the only treelet,
    // so there is no need to synchronize node allocations across treelets with atomics.
    if (atomicNodeAllocationProduce)
    {
        *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
    }

    // The root is allocated elsewhere, so the first node placed in global memory is the node with local index 1.
    // Mapping local to global:
    // local space                           global space
    // [0]             - treelet root        [treeletRootGlobalIndex]
    //                                       ... possibly very long distance ...
    // [1]             - first non-root      [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
    // [2]             - second non-root     [globalBaseForInternalNodes + 2]
    // ...
    // [numToAllocate] - last node           [globalBaseForInternalNodes + numToAllocate]
    if (atomicNodeAllocation)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);
        globalBaseForInternalNodes = *local_globalOffsetForNodes - (nodeDataStart + 1);
    }

#if MORTON_VERBOSE_LOG
    if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
#endif

    if (sg_fatleaf_cnt)
    {
        short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
        //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
        //if(local_startpoints_cnt > 1) return;
        BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;

        varying uint primID;
        {
            uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
            uint pseudolane = lane < numChildren ? lane : 0;
            uniform const uint mcID = fatleafRecord.rangeStart;
            primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
        }

        // process fatleafs, and store their boxes to SLM
        // also put startpoints for bottom up
        //uint fatleaf_cnt = *local_startpoints_cnt;
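        // The loop below is software-pipelined: the next fatleaf's record and primID are fetched
        // before the current one is processed, and the final element is handled after the loop.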
        while (sg_fatleaf_cnt-- > 1)
        {
            short                           nextLocalNodeId   = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane);
            BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
            varying uint                    nextPrimId;

            {
                uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
                uint pseudolane = lane < numChildren ? lane : 0;
                uniform const uint mcID = nextfatleafRecord.rangeStart;
                nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
            }

            phase1_process_fatleaf(
                globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
                nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
                false, localNodeId, fatleafRecord, primID);

            fatleafRecord = nextfatleafRecord;
            localNodeId   = nextLocalNodeId;
            primID        = nextPrimId;
        }

        phase1_process_fatleaf(
            globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
            nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
            false, localNodeId, fatleafRecord, primID);
    }

#if 0
    // put collected bottom-up startpoints into the wg shared array, to later distribute the work evenly across the groups
        {
            ushort myStartpointWriteSite = 0;

            if (lane == 0)
            {
                myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
            }
            myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);

            unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
        }
#endif

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    // distribute bottom-up startpoints
#if 0
        {
            short sp_count_to_divide = (*local_startpoints_cnt);

            // calculate the chunk for each sg
            sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
            uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups();

            uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
            if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) {
                // if this sg's index is below the remainder count, the sg gets one extra element,
                // and all sgs before it also have one extra
                myReadSite += get_sub_group_id();
                sg_bu_startpoints_cnt++;
            }
            else
            {
                // all remainder elements are consumed by the previous sgs
                myReadSite += sg_bu_startpoints_cnt_reminder;
            }

            pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
        }
#endif

    SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);

    if (singleTreeletBumpBVHnodeCnt)
    {
        bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
    }
}

GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
{
    if (get_sub_group_id() == 0)
    {
        global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

        // set the required fields to mark that the BLAS is empty
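        // (lower_x = 0x80 with upper_x = 0 gives an inverted quantized box, which presumably makes
        //  the node's children degenerate so an empty BLAS is safe to traverse)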
        uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
        qnode->type = leafPrimType;
        qnode->instMask = 0;
        qnode->qbounds.lower_x[k] = 0x80;
        qnode->qbounds.upper_x[k] = 0;

        *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
    }
}

/*

  POSTSORT PHASE1:
  Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
  1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip
  2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)

*/

__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_SG( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    const uint leafPrimType = globals->leafPrimType;

    // special case for empty blas
    if (globals->numPrimitives == 0)
    {
        bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
        update_empty_blas(bvh, leafPrimType);
        return;
    }

    local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) - 1];
    local uint local_records_head;
    // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
    local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;

    uint rootIndex = 0;
    uint parentOfRoot = 0;
    BuildRecordLocalMortonFlattener rootBuildRecord;

    /* add start build record to local stack */
    if (get_sub_group_id() == 0 )
    {
        global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
        uint recordID = get_group_id(0);
        struct BuildRecordMorton mortonGlobalRecord = records[recordID];

        rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));

        parentOfRoot = mortonGlobalRecord.parent_index;
        rootIndex = mortonGlobalRecord.current_index;

#if MORTON_VERBOSE_LOG
        printf("P1_STARTPOINTS: current_index: %d, record.items: %d, record.nodeID: %d, record.parent_index: %d\n",
               mortonGlobalRecord.current_index, mortonGlobalRecord.items, mortonGlobalRecord.nodeID, mortonGlobalRecord.parent_index);
#endif
    }

    if (leafPrimType == NODE_TYPE_INSTANCE)
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes,
            rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
    }
    else
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes,
            rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
    }
}

__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    const uint leafPrimType = globals->leafPrimType;
    const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;

    bvh->nodeDataCur = nodeDataStart + 1;

    // special case for empty blas
    if (globals->numPrimitives == 0)
    {
        update_empty_blas(bvh, leafPrimType);
        return;
    }

    local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
    local uint local_records_head;
    local uint local_globalOffsetForNodes;

    BuildRecordLocalMortonFlattener rootBuildRecord;

    if (get_sub_group_id() == 0 )
    {
        struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);

        rootBuildRecord = TranslateToLocalRecord(binaryNode);

        local_globalOffsetForNodes = 0;
    }

    if (leafPrimType == NODE_TYPE_INSTANCE)
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
    }
    else
    {
        perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
            &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);
    }
}

#if 0
GRL_INLINE void
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem,
    uint startID, uint endID,
    local uint* local_numRecords,
    local uint* local_numRecordsOld,
    local struct BuildRecordMorton* local_records
)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);

    /* iterate over all subtrees this workgroup should build */
    for ( uint recordID = startID; recordID < endID; recordID++ )
    {
        /* add start build record to local stack */
        if ( get_local_id( 0 ) == 0 )
        {
            local_records[0] = records[recordID];
            *local_numRecords = 1;
            *local_numRecordsOld = 0;
        }
        work_group_barrier( CLK_LOCAL_MEM_FENCE );

        /* terminate when all subtrees are leaves */
        while ( *local_numRecords != *local_numRecordsOld )
        {
            /* remember the old number of build records to detect later
             * whether we are done */
            if ( get_local_id( 0 ) == 0 )
            {
                *local_numRecordsOld = *local_numRecords;
            }
            work_group_barrier( CLK_LOCAL_MEM_FENCE );

            /* all work items in the sub group pick a subtree to build */
            for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
            {
                /* ignore small subtrees */
                if ( local_records[ID].items <= BVH_NODE_N6 )
                    continue;

                /* create QBVH node */
                create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
            }

            /* wait for all work items to have updated local_records array */
            work_group_barrier( CLK_LOCAL_MEM_FENCE );
        }

        const uint shift_mask = globals->shift_mask;
        const uint leafPrimType = globals->leafPrimType;
        const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
        BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
        global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );

        /* create all fat leaf nodes and initiate refit */
        for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
        {
            struct BuildRecordMorton current = local_records[ID];
            const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );

            global struct QBVHNodeN* qnode = nodeData + current.current_index;

            /* get bounds of all children of the fat leaf node */
            struct AABB bounds[BVH_NODE_N6];
            for ( uint i = 0; i < current.items; i++ )
            {
                /* get primID and bounds of primitive */
                const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
                bounds[i] = primref[primID];

                /* For all primitives in a fat leaf we store a back
                 * pointer. This way we can modify the fat leaf node at leaf construction time. */
                const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;

                /* Store back pointer and primID inside morton code array to
                 * be later used by leaf creation. */
                mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
            }

            /* update fat leaf node */
            QBVHNodeN_setType( qnode, leafPrimType );
            global void* offset;
            if ( leafPrimType != BVH_INSTANCE_NODE )
            {
                offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
                QBVHNodeN_setChildIncr1( qnode );
            }
            else
            {
                offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
                QBVHNodeN_setChildIncr2( qnode );
            }
            QBVH6Node_set_offset( qnode, offset );
            QBVHNodeN_setBounds( qnode, bounds, current.items );

            /* set back pointers for fat leaf nodes */
            *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);

            /* bottom up refit */
            refit_bottom_up( qnode, bvh, bounds, current.items );
        }
    }
}

/*

  This phase takes the build records calculated in phase0 as input and
  finishes the BVH construction for all these subtrees.

*/
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
old_parallel_build_phase1(global struct Globals *globals,
                      global struct MortonCodePrimitive *mc,
                      global struct AABB *primref,
                      global struct BinaryMortonCodeHierarchy *bnodes,
                      global char *bvh_mem)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);

    /* a queue of build records */
    local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
    local uint local_numRecords;
    local uint local_numRecordsOld;

    /* construct range of build records that each sub group will process */
    const uint numRecords = globals->numBuildRecords;
    const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
    const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);

    DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );

}

__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) void kernel
old_parallel_build_phase1_Indirect( global struct Globals* globals,
    global struct MortonCodePrimitive* mc,
    global struct AABB* primref,
    global struct BinaryMortonCodeHierarchy* bnodes,
    global char* bvh_mem )
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);

    /* a queue of build records */
    local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
    local uint local_numRecords;
    local uint local_numRecordsOld;

    /* construct range of build records that each sub group will process */
    const uint numRecords = globals->numBuildRecords;
    uint startID = get_group_id( 0 );
    uint endID   = startID + 1;

    DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );

}
#endif
