//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "common.h"
#include "instance.h"

#define DBG(x)

#define ENABLE_CHECKS 0

#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1

/* todo:                                                     */
/* - new cross WG code path for first splits                 */
/* - optimize find best child loop sequence                  */
/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */

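/* When DIVIDE_BY_6 is enabled, the SAH sweep rounds primitive counts up to
 * blocks of BVH_NODE_N6 (presumably the 6-wide node fan-out) using a
 * multiply-by-reciprocal instead of an integer division; see
 * fastDivideBy6_uint below. */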
#define DIVIDE_BY_6 1

inline uint getNumPrims(struct BuildRecord *buildRecord)
{
    return buildRecord->end - buildRecord->start;
}

inline void printBuildRecord(struct BuildRecord *record)
{
    printf("centroidBounds\n");
    AABB_print(&record->centroidBounds);
    printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record));
}

inline void printBinInfo2(struct BinInfo2 *record)
{
    printf("boundsX[%d]\n", BINS * 2);
    for (uint b = 0; b < BINS * 2; b++)
    {
        AABB3f_print(&record->boundsX[b]);
        printf("counts.x = %d\n", record->counts[b].x);
    }
    printf("boundsY[%d]\n", BINS * 2);
    for (uint b = 0; b < BINS * 2; b++)
    {
        AABB3f_print(&record->boundsY[b]);
        printf("counts.y = %d\n", record->counts[b].y);
    }
    printf("boundsZ[%d]\n", BINS * 2);
    for (uint b = 0; b < BINS * 2; b++)
    {
        AABB3f_print(&record->boundsZ[b]);
        printf("counts.z = %d\n", record->counts[b].z);
    }
}

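/* Maps centroid positions into [0,bins). Note that callers bin against
 * centroid bounds accumulated from lower+upper (i.e. 2x the centroid), so
 * ofs/scale operate consistently in that doubled coordinate space. The
 * 0.99f factor keeps the largest centroid from landing exactly on 'bins',
 * and dimensions with (near) zero extent get scale 0 so all primitives
 * fall into bin 0 there. */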
inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins)
{
    const float4 eps = 1E-34f;
    const float4 diag = max(eps, centBounds->upper - centBounds->lower);
    const float4 scale = (float4)(0.99f * (float)bins) / diag;
    binMapping->scale = select((float4)(0.0f), scale, diag > eps);
    binMapping->ofs = centBounds->lower;
}

inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref)
{
    const float4 centroid2 = primref->lower + primref->upper;
    AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2);
}

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

inline void initBinInfo(struct BinInfo *binInfo)
{
    for (uint i = 0; i < BINS; i++)
    {
        AABB3f_init(&binInfo->boundsX[i]);
        AABB3f_init(&binInfo->boundsY[i]);
        AABB3f_init(&binInfo->boundsZ[i]);
        binInfo->counts[i] = (uint3)(0);
    }
}

inline void subgroup_initBinInfo(struct BinInfo *binInfo)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();

    for (uint i = subgroupLocalID; i < BINS; i += subgroup_size)
    {
        AABB3f_init(&binInfo->boundsX[i]);
        AABB3f_init(&binInfo->boundsY[i]);
        AABB3f_init(&binInfo->boundsZ[i]);
        binInfo->counts[i] = (uint3)(0);
    }
}

inline void parallel_initBinInfo(struct BinInfo *binInfo)
{
    const uint localID = get_local_id(0);
    if (localID < BINS)
    {
        AABB3f_init(&binInfo->boundsX[localID]);
        AABB3f_init(&binInfo->boundsY[localID]);
        AABB3f_init(&binInfo->boundsZ[localID]);
        binInfo->counts[localID] = (uint3)(0);
    }
}

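/* Bins one primref into the per-axis bin that its (doubled) centroid
 * p = lower + upper falls into, extending the bin bounds and bumping the
 * per-axis counter with local atomics. */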
inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
{
    const float4 lower = primref->lower;
    const float4 upper = primref->upper;
    const float4 p = lower + upper;
    const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
    AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
    AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
    AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
    atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
    atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
    atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
}

inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
{
    const float4 lower = primref->lower;
    const float4 upper = primref->upper;
    const float4 p = lower + upper;
    const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
    AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper);
    AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper);
    AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper);
    atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
    atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
    atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
}

inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref)
{
    const float4 lower = primref->lower;
    const float4 upper = primref->upper;
    const float4 p = lower + upper;
    const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
    AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz);
    AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz);
    AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz);
    binInfo->counts[i.x].x++;
    binInfo->counts[i.y].y++;
    binInfo->counts[i.z].z++;
}

// =====================================================================================================================
// =====================================================================================================================
// =====================================================================================================================

inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins)
{
    const uint localID = get_local_id(0);
    if (localID < bins)
    {
        AABB3f_init(&binInfo->boundsX[localID]);
        AABB3f_init(&binInfo->boundsY[localID]);
        AABB3f_init(&binInfo->boundsZ[localID]);
        binInfo->counts[localID] = (uint3)(0);
    }
}

inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref)
{
    const float4 lower = primref->lower;
    const float4 upper = primref->upper;
    const float4 p = lower + upper;
    const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
    AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
    AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
    AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
    atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
    atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
    atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
}

inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins)
{
    const uint localID = get_local_id(0);
    if (localID < bins)
    {
        AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]);
        AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]);
        AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]);
        atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x);
        atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y);
        atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z);
    }
}

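/* Picks the child with the largest surface area that is still splittable.
 * By the convention established in the partition functions below, each
 * child AABB carries its half-area in lower.w and its primitive count
 * (bit-cast to float) in upper.w. */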
inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren)
{
    const uint subgroupLocalID = get_sub_group_local_id();
#if 0
  /*! find best child to split */
  const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY;
  const float maxArea = sub_group_reduce_max(area);
  const uint mask = intel_sub_group_ballot(area == maxArea);
  const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1;
#else
    float bestArea = -(float)INFINITY;
    int bestChild = -1;
    for (int i = 0; i < numChildren; i++)
    {
        /* ignore leaves as they cannot get split */
        if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize)
            continue;

        /* find child with largest surface area */
        if (childrenAABB[i].lower.w > bestArea)
        {
            bestChild = i;
            bestArea = childrenAABB[i].lower.w;
        }
    }
#endif
    return bestChild;
}

inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref)
{
    const float4 centroid2 = primref->lower + primref->upper;

    if (centroid2.x < buildRecord->centroidBounds.lower.x)
        return false;
    if (centroid2.y < buildRecord->centroidBounds.lower.y)
        return false;
    if (centroid2.z < buildRecord->centroidBounds.lower.z)
        return false;

    if (centroid2.x > buildRecord->centroidBounds.upper.x)
        return false;
    if (centroid2.y > buildRecord->centroidBounds.upper.y)
        return false;
    if (centroid2.z > buildRecord->centroidBounds.upper.z)
        return false;

    if (primref->lower.x < geometryBounds->lower.x)
        return false;
    if (primref->lower.y < geometryBounds->lower.y)
        return false;
    if (primref->lower.z < geometryBounds->lower.z)
        return false;

    if (primref->upper.x > geometryBounds->upper.x)
        return false;
    if (primref->upper.y > geometryBounds->upper.y)
        return false;
    if (primref->upper.z > geometryBounds->upper.z)
        return false;

    return true;
}

/* initialize primref index array */
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
create_primref_index(global struct Globals *globals,
                     global struct AABB *primref,
                     global unsigned int *primref_index)
{
    const uint local_size = get_local_size(0);
    const uint taskID = get_group_id(0);
    const uint numTasks = get_num_groups(0);
    const uint localID = get_local_id(0);

    const uint startID = (taskID + 0) * globals->numPrimitives / numTasks;
    const uint endID = (taskID + 1) * globals->numPrimitives / numTasks;
    for (uint primID = startID + localID; primID < endID; primID += local_size)
        primref_index[primID] = primID;
}

// ==========================================================================================================
// ==========================================================================================================
// ==========================================================================================================

inline float left_to_right_area16(struct AABB3f *low)
{
    struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
    return halfArea_AABB3f(&low_prefix);
}

inline uint left_to_right_counts16(uint low)
{
    return sub_group_scan_exclusive_add(low);
}

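/* Suffix (right-to-left) variants: reverse the bins across the subgroup
 * with a shuffle, run an inclusive prefix scan, and shuffle the result
 * back, which yields for each bin the bounds/counts of everything to its
 * right. */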
inline float right_to_left_area16(struct AABB3f *low)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();
    const uint ID = subgroup_size - 1 - subgroupLocalID;
    struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID);
    struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
    const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
    return low_area;
}

inline uint right_to_left_counts16(uint low)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();
    const uint ID = subgroup_size - 1 - subgroupLocalID;
    const uint low_reverse = sub_group_broadcast(low, ID);
    const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
    return sub_group_broadcast(low_prefix, ID);
}

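/* 32-bin variants for a 16-wide subgroup: each lane owns bin i (low half)
 * and bin i+16 (high half). The high half's prefix additionally folds in
 * the full reduction of the low half so the pair behaves like one
 * 32-entry scan. */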
inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high)
{
    struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
    struct AABB3f low_reduce = AABB3f_sub_group_reduce(low);
    struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high);
    AABB3f_extend(&high_prefix, &low_reduce);
    const float low_area = halfArea_AABB3f(&low_prefix);
    const float high_area = halfArea_AABB3f(&high_prefix);
    return (float2)(low_area, high_area);
}

inline uint2 left_to_right_counts32(uint low, uint high)
{
    const uint low_prefix = sub_group_scan_exclusive_add(low);
    const uint low_reduce = sub_group_reduce_add(low);
    const uint high_prefix = sub_group_scan_exclusive_add(high);
    return (uint2)(low_prefix, low_reduce + high_prefix);
}

inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();
    const uint ID = subgroup_size - 1 - subgroupLocalID;
    struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID);
    struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID);
    struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
    struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse);
    struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse);
    AABB3f_extend(&high_prefix, &low_reduce);
    const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID);
    const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
    return (float2)(low_area, high_area);
}

inline uint2 right_to_left_counts32(uint low, uint high)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();
    const uint ID = subgroup_size - 1 - subgroupLocalID;
    const uint low_reverse = sub_group_broadcast(high, ID);
    const uint high_reverse = sub_group_broadcast(low, ID);
    const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
    const uint low_reduce = sub_group_reduce_add(low_reverse);
    const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce;
    return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID));
}

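/* Packs each candidate split into a 64-bit key: SAH cost in the high 32
 * bits, then (bin << 2) | dim in the low bits. A single min-reduction over
 * the subgroup therefore selects the lowest-SAH split, with ties broken
 * deterministically by position/dimension. Zero-extent dimensions are
 * replaced by defaultSplit (infinite SAH, split in the middle). */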
inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit)
{
    ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0;
    ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1;
    ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2;
    /* ignore zero sized dimensions */
    splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0));
    splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0));
    splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0));
    ulong bestSplit = min(min(splitX, splitY), splitZ);
    bestSplit = sub_group_reduce_min(bestSplit);
    return bestSplit;
}

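/* Division by 6 via reciprocal multiplication: v / 6 == (v >> 1) / 3, and
 * (u * 0x55555556) >> 32 computes u / 3 exactly for u < 2^31 (0x55555556
 * is ceil(2^32 / 3)). This avoids an integer divide in the hot SAH loop. */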
inline uint fastDivideBy6_uint(uint v)
{
#if 1
    const ulong u = (ulong)v >> 1;
    return (uint)((u * 0x55555556ul) >> 32);
#else
    return v / 6;
#endif
}

inline uint3 fastDivideBy6_uint3(uint3 v)
{
    return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z));
}

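/* Subgroup-parallel SAH sweep over 16 bins: each lane owns one bin, the
 * scans above produce the area/count of everything to the bin's left and
 * right, and the per-bin cost is lr_area * lr_count + rl_area * rl_count
 * (counts optionally rounded up to blocks of 6 leaves). */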
inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();

    struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID];

    const float lr_areaX = left_to_right_area16(&boundsX);
    const float rl_areaX = right_to_left_area16(&boundsX);

    struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID];

    const float lr_areaY = left_to_right_area16(&boundsY);
    const float rl_areaY = right_to_left_area16(&boundsY);

    struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID];

    const float lr_areaZ = left_to_right_area16(&boundsZ);
    const float rl_areaZ = right_to_left_area16(&boundsZ);

    const uint3 counts = binInfo->counts[subgroupLocalID];

    const uint lr_countsX = left_to_right_counts16(counts.x);
    const uint rl_countsX = right_to_left_counts16(counts.x);
    const uint lr_countsY = left_to_right_counts16(counts.y);
    const uint rl_countsY = right_to_left_counts16(counts.y);
    const uint lr_countsZ = left_to_right_counts16(counts.z);
    const uint rl_countsZ = right_to_left_counts16(counts.z);

    const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ);
    const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ);

#if DIVIDE_BY_6 == 0
    const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
    uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);
    const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift;
    const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift;
#else
    const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1);
    const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1);
#endif
    float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count));

    /* first bin is invalid */

    sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0);
    sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0);
    sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0);

    const uint mid = (startID + endID) / 2;
    const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;

    const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit);

    struct Split split;
    split.sah = as_float((uint)(bestSplit >> 32));
    split.dim = (uint)bestSplit & 3;
    split.pos = (uint)bestSplit >> 2;

    return split;
}

inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();

    struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID];
    struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size];

    const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high);
    const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high);

    struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID];
    struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size];

    const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high);
    const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high);

    struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID];
    struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size];

    const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high);
    const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high);

    const uint3 counts_low = binInfo->counts[subgroupLocalID];
    const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size];

    const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x);
    const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x);
    const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y);
    const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y);
    const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z);
    const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z);

    const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
    uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);

    /* low part: bins 0..15 */
    const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x);
    const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x);

#if DIVIDE_BY_6 == 0
    const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift;
    const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift;

#else
    //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;
    //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;

    /* skip blocks for breadth-first phase */
    const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x));
    const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x));

#endif

    float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low));

    /* first bin is invalid */
    // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x;
    // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y;
    // sah_low.z = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.z;

    sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0);
    sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0);
    sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0);

    /* high part: bins 16..31 */

    const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y);
    const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y);
#if DIVIDE_BY_6 == 0
    const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift;
    const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift;
#else
    //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;
    //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;

    /* skip blocks for breadth-first phase */
    const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y));
    const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y));

#endif
    const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high));

    const uint mid = (startID + endID) / 2;
    const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;

    const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit);
    const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit);
    const ulong bestSplit = min(bestSplit_low, bestSplit_high);

    struct Split split;
    split.sah = as_float((uint)(bestSplit >> 32));
    split.dim = (uint)bestSplit & 3;
    split.pos = (uint)bestSplit >> 2;

    return split;
}

// =====================================================================

inline float leafSAH(float geometryArea, uint prims, uint block_shift)
{
    return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift);
}

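/* Recomputes the bin a primref's (doubled) centroid falls into along the
 * split dimension and routes it left iff that bin is below the split
 * position. This must use the same mapping as the binning pass so the
 * partition matches the counts the SAH sweep was based on. */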
inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref)
{
    const uint dim = split->dim;
    const float lower = primref->lower[dim];
    const float upper = primref->upper[dim];
    const float c = lower + upper;
    const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]);
    return pos < split->pos;
}

inline void serial_find_split(global struct AABB *primref,
                              struct BinMapping *binMapping,
                              struct BuildRecord *buildRecord,
                              local struct Split *split,
                              local struct BinInfo *binInfo,
                              global uint *primref_index0,
                              global uint *primref_index1)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();

    const uint startID = buildRecord->start;
    const uint endID = buildRecord->end;

    subgroup_initBinInfo(binInfo);

    for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size)
    {
        const uint index = primref_index0[t];
        primref_index1[t] = index;
        atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]);
    }
}

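/* Single-subgroup partition over the double-buffered index arrays: indices
 * are read from primref_index1 (filled by serial_find_split) and written
 * back to primref_index0, with the left side growing up from 'begin' and
 * the right side growing down from 'end'. Subgroup prefix sums assign each
 * lane its slot within the current chunk. */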
inline void serial_partition_index(global struct AABB *primref,
                                   struct BinMapping *binMapping,
                                   struct BuildRecord *buildRecord,
                                   struct Split *inSplit,
                                   struct BuildRecord *outLeft,
                                   struct BuildRecord *outRight,
                                   struct AABB *outGeometryBoundsLeft,
                                   struct AABB *outGeometryBoundsRight,
                                   global uint *primref_index0,
                                   global uint *primref_index1)
{
    const uint localID = get_local_id(0);
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroupID = get_sub_group_id();
    const uint subgroup_size = get_sub_group_size();

    const uint begin = buildRecord->start;
    const uint end = buildRecord->end;
    struct Split split = *inSplit;

    struct BuildRecord left;
    struct BuildRecord right;
    initBuildRecord(&left, begin, end);
    initBuildRecord(&right, begin, end);

    struct AABB leftAABB;
    struct AABB rightAABB;
    AABB_init(&leftAABB);
    AABB_init(&rightAABB);

    global uint *l = primref_index0 + begin;
    global uint *r = primref_index0 + end;

    /* no valid split, just split in the middle */
    if (split.sah == (float)(INFINITY))
    {
        for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
        {
            const uint index = primref_index1[i];
            const uint count = sub_group_reduce_add(1);
            extendBuildRecord(&left, &primref[index]);
            AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
            l[subgroupLocalID] = index;
            l += count;
        }

        for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
        {
            const uint index = primref_index1[i];
            const uint count = sub_group_reduce_add(1);
            extendBuildRecord(&right, &primref[index]);
            AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
            r -= count;
            r[subgroupLocalID] = index;
        }
    }
    else
    {
        for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size)
        {
            const uint index = primref_index1[i];
            const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
            const uint isRight = 1 - isLeft;
            const uint countLeft = sub_group_reduce_add(isLeft);
            const uint countRight = sub_group_reduce_add(isRight);
            const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
            const uint prefixRight = sub_group_scan_exclusive_add(isRight);

            r -= countRight;

            if (isLeft)
            {
                extendBuildRecord(&left, &primref[index]);
                AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
                l[prefixLeft] = index;
            }
            else
            {
                extendBuildRecord(&right, &primref[index]);
                AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
                r[prefixRight] = index;
            }
            l += countLeft;
        }
    }

    left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
    right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
    leftAABB = AABB_sub_group_reduce(&leftAABB);
    rightAABB = AABB_sub_group_reduce(&rightAABB);

    if (subgroupLocalID == 0)
    {
        uint pos = l - primref_index0; // only the first lane computes "pos"
        left.end = pos;
        right.start = pos;

        leftAABB.lower.w = AABB_halfArea(&leftAABB);
        rightAABB.lower.w = AABB_halfArea(&rightAABB);

        leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left));
        rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right));

        *outLeft = left;
        *outRight = right;
        *outGeometryBoundsLeft = leftAABB;
        *outGeometryBoundsRight = rightAABB;
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

#if ENABLE_CHECKS == 1
    if (subgroupLocalID == 0)
    {
        if (AABB_verify(outLeft))
        {
            printf("outLeft:\n");
            printBuildRecord(outLeft);
        }
        if (AABB_verify(outRight))
        {
            printf("outRight:\n");
            printBuildRecord(outRight);
        }
        if (AABB_verify(outGeometryBoundsLeft))
        {
            printf("outGeometryBoundsLeft:\n");
            AABB_print(outGeometryBoundsLeft);
        }
        if (AABB_verify(outGeometryBoundsRight))
        {
            printf("outGeometryBoundsRight:\n");
            AABB_print(outGeometryBoundsRight);
        }

        for (uint i = outLeft->start; i < outLeft->end; i++)
        {
            const uint index = primref_index0[i];
            if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index]))
                printf("check left %d \n", i);
            if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
                printf("check prim ref bounds left %d \n", i);
        }
        for (uint i = outRight->start; i < outRight->end; i++)
        {
            const uint index = primref_index0[i];
            if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index]))
                printf("check right %d \n", i);
            if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
                printf("check prim ref bounds right %d \n", i);
        }
    }
#endif
}

inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator,
                                      const uint start,
                                      const uint end,
                                      global struct AABB *primref,
                                      uint primID,
                                      global char *bvh_mem,
                                      unsigned leafSize)
{
    const uint subgroupLocalID = get_sub_group_local_id();
    const uint subgroup_size = get_sub_group_size();
    const uint items = end - start;

#if ENABLE_CHECKS == 1
    if (items > BVH_LEAF_N_MAX)
        printf("error items %d \n", items);
#endif

    // JDB TODO:  Why was this code commented out??
    //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0;
    //offset = sub_group_broadcast(offset,0);

    //uint offset = globals->leaf_mem_allocator_start + start * leafSize;
    uint offset = allocator->start + start * leafSize;
    return offset;
}

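/* Derives a node's index (the key into the backpointers array) from its
 * byte offset relative to the node base. The ENABLE_CHECKS guard verifies
 * the index still fits once packed as 'index << 6' (the encoding used for
 * backpointers below). */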
inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode)
{
    size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN);
    uint offset_u = (uint)offset;
#if ENABLE_CHECKS
    if ((size_t)((offset_u << 6) >> 6) != offset)
    {
        printf("get_qnode_index_for_backptr - index out of range");
    }
#endif
    return offset_u;
}

struct SerialBuildRecurseTemplateConst
{
    unsigned leafSize;
    unsigned leafType;
    bool allocateBackpointers;
};

// ====================================================================================
// ====================================================================================
// ====================================================================================
// ====================================================================================
// ====================================================================================

inline void parallel_find_split(global struct AABB *primref,
                                local struct BuildRecord *buildRecord,
                                local struct Split *bestSplit,
                                local struct BinInfo *binInfo,
                                global uint *primref_index0,
                                global uint *primref_index1)
{
    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);
    const uint subgroupID = get_sub_group_id();

    const uint startID = buildRecord->start;
    const uint endID = buildRecord->end;

    struct BinMapping binMapping;
    initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS);

    /* init bininfo */
    parallel_initBinInfo(binInfo);

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    for (uint t = startID + localID; t < endID; t += local_size)
    {
        const uint index = primref_index0[t];
        primref_index1[t] = index;
        atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]);
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    /* find best dimension */

    if (subgroupID == 0)
    {
        *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID);
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);
}

inline void parallel_find_split32(local uint *local_sync,
                                  global struct AABB *primref,
                                  local struct BuildRecord *buildRecord,
                                  local struct Split *bestSplit,
                                  local struct BinInfo2 *binInfo2,
                                  global uint *primref_index0,
                                  global uint *primref_index1)
{

    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);
    const uint subgroupID = get_sub_group_id();
    const uint numSubGroups = get_num_sub_groups();
    const uint subgroupLocalID = get_sub_group_local_id();

    const uint startID = buildRecord->start;
    const uint endID = buildRecord->end;

    struct BinMapping binMapping;
    initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS);

    /* init bininfo */
    parallel_initBinInfo2(binInfo2, 2 * BINS);

    if (localID == 0)
        *local_sync = 0;

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    for (uint t = startID + localID; t < endID; t += local_size)
    {
        const uint index = primref_index0[t];
        primref_index1[t] = index;
        atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]);
    }

    /* find best split position using the last subgroup */
    sub_group_barrier(CLK_LOCAL_MEM_FENCE);
    uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0;
    syncID = sub_group_broadcast(syncID, 0);

    if (syncID + 1 == numSubGroups)
    {
        *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID);
        DBG(if (localID == 0) printSplit(bestSplit));
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);
}

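/* Work-group-wide partition: each subgroup claims contiguous slots in the
 * left/right output ranges via the atomicCountLeft/atomicCountRight
 * counters, then places its lanes with subgroup prefix sums. local_sync
 * counts finished subgroups so the last one can finalize the split
 * position and the packed .w fields. */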
inline void parallel_partition_index(local uint *local_sync,
                                     global struct AABB *primref,
                                     struct BinMapping *binMapping,
                                     const uint begin,
                                     const uint end,
                                     struct Split *inSplit,
                                     local struct BuildRecord *outLeft,
                                     local struct BuildRecord *outRight,
                                     local struct AABB *outGeometryBoundsLeft,
                                     local struct AABB *outGeometryBoundsRight,
                                     global uint *primref_index0,
                                     global uint *primref_index1,
                                     uint *atomicCountLeft,
                                     uint *atomicCountRight)
{
    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);
    const uint subgroupID = get_sub_group_id();
    const uint numSubGroups = get_num_sub_groups();
    const uint subgroup_size = get_sub_group_size();
    const uint subgroupLocalID = get_sub_group_local_id();

    const uint size = end - begin;
    struct Split split = *inSplit;

    /* init bin bounds */
    if (localID == 0)
    {
        initBuildRecord(outLeft, begin, end);
        initBuildRecord(outRight, begin, end);
        AABB_init(outGeometryBoundsLeft);
        AABB_init(outGeometryBoundsRight);
        *atomicCountLeft = 0;
        *atomicCountRight = 0;
        *local_sync = 0;
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ?

    struct BuildRecord left;
    struct BuildRecord right;
    initBuildRecord(&left, begin, end);
    initBuildRecord(&right, begin, end);

    struct AABB leftAABB;
    struct AABB rightAABB;
    AABB_init(&leftAABB);
    AABB_init(&rightAABB);

    if (split.sah == (float)(INFINITY))
    {
        if (subgroupID == 0)
        {
            for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
            {
                const uint index = primref_index1[i];
                extendBuildRecord(&left, &primref[index]);
                AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
                primref_index0[i] = index;
            }

            for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
            {
                const uint index = primref_index1[i];
                extendBuildRecord(&right, &primref[index]);
                AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
                primref_index0[i] = index;
            }

            left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
            right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
            leftAABB = AABB_sub_group_reduce(&leftAABB);
            rightAABB = AABB_sub_group_reduce(&rightAABB);

            if (localID == 0)
            {
                outLeft->centroidBounds = left.centroidBounds;
                outRight->centroidBounds = right.centroidBounds;

                *outGeometryBoundsLeft = leftAABB;
                *outGeometryBoundsRight = rightAABB;

                outLeft->end = split.pos;
                outRight->start = split.pos;

                outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
                outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
                outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
                outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
            }
        }
    }
    else
    {

        const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
        const int endID = begin + ((subgroupID + 1) * size / numSubGroups);

        for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
        {
            const uint index = primref_index1[i];
            const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
            const uint isRight = 1 - isLeft;
            const uint countLeft = sub_group_reduce_add(isLeft);
            const uint countRight = sub_group_reduce_add(isRight);
            const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
            const uint prefixRight = sub_group_scan_exclusive_add(isRight);

            uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
            offsetLeft = sub_group_broadcast(offsetLeft, 0);
            uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
            offsetRight = sub_group_broadcast(offsetRight, 0);

            if (isLeft)
            {
                extendBuildRecord(&left, &primref[index]);
                AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
                primref_index0[begin + offsetLeft + prefixLeft] = index;
            }
            else
            {
                extendBuildRecord(&right, &primref[index]);
                AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
                primref_index0[end - (offsetRight + countRight) + prefixRight] = index;
            }
        }
        left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
        right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
        leftAABB = AABB_sub_group_reduce(&leftAABB);
        rightAABB = AABB_sub_group_reduce(&rightAABB);

        AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper);
        AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper);

        AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
        AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);

        sub_group_barrier(CLK_LOCAL_MEM_FENCE);

        if (subgroupLocalID == 0)
        {
            const uint sync = atomic_add(local_sync, 1);
            if (sync + 1 == numSubGroups)
            {
                uint pos = begin + *atomicCountLeft; // a single lane of the last subgroup computes "pos"
                outLeft->end = pos;
                outRight->start = pos;

                outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
                outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
                outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
                outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
            }
        }
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

#if ENABLE_CHECKS == 1
    if (localID == 0)
    {
        if (outLeft->end <= begin)
            printf("pos begin error\n");
        if (outLeft->end > end)
            printf("pos end error\n");

        for (uint i = outLeft->start; i < outLeft->end; i++)
        {
            const uint index = primref_index0[i];
            //printf("left %d -> %d \n",i,index);
            if (!is_left(binMapping, inSplit, &primref[index]))
                printf("check left %d \n", i);
            if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
                printf("check prim ref bounds left %d \n", i);
        }
        for (uint i = outRight->start; i < outRight->end; i++)
        {
            const uint index = primref_index0[i];
            //printf("right %d -> %d \n",i,index);
            if (is_left(binMapping, inSplit, &primref[index]))
                printf("check right %d \n", i);
            if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
                printf("check prim ref bounds right %d \n", i);
        }
    }
#endif
}


#define ENABLE_LOOP_BREADTH_FIRST 0
#if ENABLE_LOOP_BREADTH_FIRST
// TBD: the layout of this struct might impact perf.
struct BreadthFirstLoopLocals
{
    struct BuildRecord local_current;
#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
    struct BinInfo binInfo;
#else
    struct BinInfo2 binInfo;
#endif
    struct Split split;
    struct BuildRecord children[BVH_NODE_N + 1];
    struct AABB childrenAABB[BVH_NODE_N + 1];
    uint atomicCountLeft;
    uint atomicCountRight;
    uint local_sync;
    uint recordID;
    uint buildRecordIDs[BUILDRECORD_STACK_SIZE];
    uint numBuildRecordIDs;
    bool exit;
};

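/* Experimental breadth-first build loop (compiled only when
 * ENABLE_LOOP_BREADTH_FIRST is set): each work group pulls a build record,
 * either from its local ID stack or from the shared global counter, splits
 * it into up to BVH_NODE_N6 children by repeatedly splitting the child
 * with the largest surface area, writes the QBVH node (and backpointers
 * when T.allocateBackpointers is set), and pushes the child records.
 * Records below the subtree threshold are left for a later phase. */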
1107inline void parallel_build_breadth_first_loopT(global struct Globals *globals,
1108                                               global struct AABB *primref,
1109                                               global uint *primref_index,
1110                                               global char *bvh_mem,
1111                                               uint subtreeThreshold,
1112                                               local struct BreadthFirstLoopLocals *L,
1113                                               struct BreadthFirstTemplateConst T)
1114{
1115    const uint global_size = get_global_size(0);
1116    const uint local_size = get_local_size(0);
1117    const uint localID = get_local_id(0);
1118    const uint taskID = get_group_id(0);
1119    const uint numTasks = get_num_groups(0);
1120
1121    const uint subgroupID = get_sub_group_id();
1122    const uint subgroupLocalID = get_sub_group_local_id();
1123
1124    /* double buffered primref index array */
1125    global uint *primref_index0 = primref_index;
1126    global uint *primref_index1 = primref_index + globals->numPrimitives;
1127
1128    global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
1129
1130#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
1131    const uint bins = BINS;
1132#else
1133    const uint bins = 2 * BINS;
1134#endif
1135
1136    if (localID == 0)
1137    {
1138        L->numBuildRecordIDs = 0;
1139        L->exit = false;
1140    }
1141
1142    work_group_barrier(CLK_LOCAL_MEM_FENCE);
1143
1144    while (1)
1145    {
1146        if (localID == 0)
1147        {
1148            if (L->numBuildRecordIDs == 0)
1149            {
1150                L->recordID = generic_atomic_add(&globals->counter, 1);
1151                if (L->recordID >= globals->numBuildRecords)
1152                    L->exit = true;
1153            }
1154            else
1155            {
1156                L->numBuildRecordIDs--;
1157                L->recordID = L->buildRecordIDs[L->numBuildRecordIDs];
1158            }
1159            L->local_current = records[L->recordID];
1160        }
1161
1162        work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
1163
1164        /* no more buildrecords available ? */
1165
1166        if (L->exit)
1167            break;
1168
1169        local struct BuildRecord *current = &L->local_current;
1170        const uint items = getNumPrims(current);
1171        const uint depth = getBuildRecursionDepth(current);
1172
1173        global unsigned int *num_records_output = &globals->numBuildRecords_extended;
1174
1175        struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current;
1176
1177        /* ignore small buildrecords */
1178        if (items < max(subtreeThreshold, cfg_minLeafSize))
1179        {
1180            // do nothing
1181        }
1182        else
1183        {
1184            /*! find best split */
1185#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
1186            parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
1187#else
1188            parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
1189#endif
1190            uint numChildren = 2;
1191
1192            /*! find best split */
1193            struct BinMapping binMapping;
1194            initBinMapping(&binMapping, &current->centroidBounds, bins);
1195
1196            parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
1197
1198            while (numChildren < BVH_NODE_N6)
1199            {
1200                /*! find best child to split */
1201                const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren);
1202                if (bestChild == -1)
1203                    break;
1204
1205                /* perform best found split */
1206                local struct BuildRecord *brecord = &L->children[bestChild];
1207                local struct BuildRecord *lrecord = &L->children[numChildren + 0];
1208                local struct BuildRecord *rrecord = &L->children[numChildren + 1];
1209
1210#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
1211                parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
1212#else
1213                parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
1214#endif
1215
1216                initBinMapping(&binMapping, &brecord->centroidBounds, bins);
1217
1218                parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
1219
1220                *brecord = *rrecord;
1221                L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1];
1222
1223                work_group_barrier(CLK_LOCAL_MEM_FENCE);
1224
1225                numChildren++;
1226            }
1227
1228            //sub_group_barrier(CLK_LOCAL_MEM_FENCE);
1229
1230            if (localID <= 16 && subgroupID == 0)
1231            {
1232                global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem;
1233                global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base);
1234                global uint *back_pointers = BVHBase_backPointers(bvh_base);
1235                uint qnode_index = 0;
1236                if (T.allocateBackpointers)
1237                {
1238                    /* index of internal node, the domain of backpointers map*/
1239                    qnode_index = get_qnode_index_for_backptr(nodes_start, qnode);
1240                    // the backpointer is already set, but we need to add/encode the num of children
1241                    // todo don't like the need of data read (we should just add), maybe should pass grandpa pointer in record..., or use atomic...
1242                    back_pointers[qnode_index] += (numChildren << 3);
1243                }
1244
1245                /* sort children based on rnage size */
1246                const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren);
1247                //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1);
1248                const uint IDs = numPrimsIDs & 7;
1249                const uint pushIDs = convertToPushIndices8(IDs);
1250
1251                /* alloc #numChildren nodes at once */
1252                const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);
1253
1254                /* update single relative node pointer and type */
1255                const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6;
1256                const uint type = BVH_INTERNAL_NODE;
1257
1258                /* set parent pointer in child build records */
1259                if (subgroupLocalID < numChildren)
1260                {
1261                    setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1);
1262                    global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN);
1263                    L->children[subgroupLocalID].current = child_data_ptr;
1264                    if (T.allocateBackpointers)
1265                    {
1266                        uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr);
1267                        back_pointers[child_index] = qnode_index << 6;
1268                    }
1269                }
1270
1271                /* write out qbvh node */
1272                subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode);
1273
1274                /* write out child buildrecords to memory */
1275
1276                uint global_records_offset = (subgroupLocalID == 0) ? atomic_add(num_records_output, numChildren - 1) : 0;
1277                global_records_offset = sub_group_broadcast(global_records_offset, 0);
1278
1279                if (localID == 0)
1280                {
1281                    records[L->recordID] = L->children[0];
1282                    L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID;
1283                    for (uint i = 1; i < numChildren; i++)
1284                    {
1285                        const uint ID = globals->numBuildRecords + global_records_offset + i - 1;
1286                        records[ID] = L->children[i];
1287                        L->buildRecordIDs[L->numBuildRecordIDs++] = ID;
1288                    }
1289                }
1290            }
1291        }
1292        work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
1293    }
1294
    /* last active work group? the final one publishes the updated build record counts */
    if (localID == 0)
    {
        const uint sync = atomic_add(&globals->sync, 1);
        if (sync + 1 == numTasks)
        {
            globals->sync = 0;
            /* set final number of buildrecords */
            globals->numBuildRecords += globals->numBuildRecords_extended;
            globals->numBuildRecords_extended = 0;
            globals->counter = 0;
        }
    }
}
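
/* thin kernel wrappers: they specialize parallel_build_breadth_first_loopT via a
   compile-time constant struct, toggling backpointer allocation on or off */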
__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_breadth_first_loop(global struct Globals *globals,
                                  global struct AABB *primref,
                                  global uint *primref_index,
                                  global char *bvh_mem,
                                  uint subtreeThreshold)
{
    local struct BreadthFirstLoopLocals L;
    static const struct BreadthFirstTemplateConst T = {
        false // bool allocateBackpointers;
    };

    parallel_build_breadth_first_loopT(globals,
                                       primref,
                                       primref_index,
                                       bvh_mem,
                                       subtreeThreshold,
                                       &L,
                                       T);
}

__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_breadth_first_loop_backpointers(global struct Globals *globals,
                                               global struct AABB *primref,
                                               global uint *primref_index,
                                               global char *bvh_mem,
                                               uint subtreeThreshold)
{
    local struct BreadthFirstLoopLocals L;
    static const struct BreadthFirstTemplateConst T = {
        true // bool allocateBackpointers;
    };

    parallel_build_breadth_first_loopT(globals,
                                       primref,
                                       primref_index,
                                       bvh_mem,
                                       subtreeThreshold,
                                       &L,
                                       T);
}
// ===================================================
// =============== experimental code =================
// ===================================================
#endif

#define ENABLE_GLOBAL_SPLIT 0
#if ENABLE_GLOBAL_SPLIT
inline void parallel_partition_segment_index(local uint *local_sync,
                                             global struct AABB *primref,
                                             struct BinMapping *binMapping,
                                             const uint begin,
                                             const uint end,
                                             const uint global_begin,
                                             const uint global_end,
                                             struct Split *inSplit,
                                             local struct AABB *outLeft,
                                             local struct AABB *outRight,
                                             local struct AABB *outGeometryBoundsLeft,
                                             local struct AABB *outGeometryBoundsRight,
                                             global uint *primref_index0,
                                             global uint *primref_index1,
                                             uint *atomicCountLeft,
                                             uint *atomicCountRight)
{
    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);
    const uint subgroupID = get_sub_group_id();
    const uint numSubGroups = get_num_sub_groups();
    const uint subgroup_size = get_sub_group_size();
    const uint subgroupLocalID = get_sub_group_local_id();

    const uint size = end - begin;
    struct Split split = *inSplit;

    /* init bin bounds */
    if (localID == 0)
    {
        AABB_init(outLeft);
        AABB_init(outRight);
        AABB_init(outGeometryBoundsLeft);
        AABB_init(outGeometryBoundsRight);
        *local_sync = 0;
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);
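
    /* per-subgroup accumulators: centroid bounds (left/right) and full geometry
       bounds (leftAABB/rightAABB) of the partitioned primitives */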
    struct AABB left;
    struct AABB right;
    AABB_init(&left);
    AABB_init(&right);

    struct AABB leftAABB;
    struct AABB rightAABB;
    AABB_init(&leftAABB);
    AABB_init(&rightAABB);

    const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
    const int endID = begin + ((subgroupID + 1) * size / numSubGroups);

    for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
    {
        const uint index = primref_index1[i];
        const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
        const uint isRight = 1 - isLeft;
        const uint countLeft = sub_group_reduce_add(isLeft);
        const uint countRight = sub_group_reduce_add(isRight);
        const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
        const uint prefixRight = sub_group_scan_exclusive_add(isRight);

        uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
        offsetLeft = sub_group_broadcast(offsetLeft, 0);
        uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
        offsetRight = sub_group_broadcast(offsetRight, 0);
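
        /* left items are packed upward from global_begin and right items downward from
           global_end; the exclusive prefix sums give each lane a unique slot inside its
           subgroup's atomically reserved chunk */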
        if (isLeft)
        {
            AABB_extend_point(&left, AABB_centroid2(&primref[index]));
            AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
            primref_index0[global_begin + offsetLeft + prefixLeft] = index;
        }
        else
        {
            AABB_extend_point(&right, AABB_centroid2(&primref[index]));
            AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
            primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index;
        }
    }
    left = AABB_sub_group_reduce(&left);
    right = AABB_sub_group_reduce(&right);
    leftAABB = AABB_sub_group_reduce(&leftAABB);
    rightAABB = AABB_sub_group_reduce(&rightAABB);

    AABB_local_atomic_merge(outLeft, left.lower, left.upper);
    AABB_local_atomic_merge(outRight, right.lower, right.upper);

    AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
    AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);

    work_group_barrier(CLK_LOCAL_MEM_FENCE);
}

__attribute__((reqd_work_group_size(BINS * 2, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel global_init_split_iteration(global struct Globals *globals,
                                        global struct GlobalBuildRecord *global_record,
                                        global char *bvh_mem,
                                        const uint subTreeThreshold)
{
    const uint localID = get_local_id(0);
    const uint taskID = get_group_id(0);
    const uint numTasks = get_num_groups(0);

    global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);

    /* for each build record with size > subTreeThreshold initialize a global build record */

    const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks;
    const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks;

    for (uint i = startID; i < endID; i++)
    {
        global struct BuildRecord *buildRecord = &records[i];
        DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start));

        if ((buildRecord->end - buildRecord->start) > subTreeThreshold)
        {
            uint ID = localID == 0 ? generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0;

            ID = work_group_broadcast(ID, 0);
            global struct BinInfo2 *binInfo = &global_record[ID].binInfo;
            global struct BinMapping *binMapping = &global_record[ID].binMapping;
            initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS);
            parallel_initBinInfo2(binInfo, 2 * BINS);
            if (localID == 0)
            {
                global_record[ID].range.start = buildRecord->start;
                global_record[ID].range.end = buildRecord->end;
                global_record[ID].atomicCountLeft = 0;
                global_record[ID].atomicCountRight = 0;
                global_record[ID].buildRecordID = i;
                AABB_init(&global_record[ID].leftCentroid);
                AABB_init(&global_record[ID].rightCentroid);
                AABB_init(&global_record[ID].leftGeometry);
                AABB_init(&global_record[ID].rightGeometry);
            }
        }
    }
    DBG(
        work_group_barrier(CLK_LOCAL_MEM_FENCE);
        if (localID == 0)
            printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords););
}

__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel global_bin_iteration(global struct Globals *globals,
                                 global struct AABB *primref,
                                 global uint *primref_index,
                                 global char *bvh_mem,
                                 global struct GlobalBuildRecord *global_record)
{
    const uint localID = get_local_id(0);
    const uint blockSize = get_local_size(0);
    const uint taskID = get_group_id(0);
    const uint numTasks = get_num_groups(0);

    const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;

    /* early out */
    if (numGlobalBuildRecords == 0)
        return;

    /* double buffered primref index array */
    global uint *primref_index0 = primref_index;
    global uint *primref_index1 = primref_index + globals->numPrimitives;
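    /* binning copies each index from primref_index0 to primref_index1; the partition
       pass afterwards reads primref_index1 and writes the partitioned order back to
       primref_index0 */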

    uint numBlocks = 0;

    /* get total number of blocks, size of block == WG size */
    for (uint i = 0; i < numGlobalBuildRecords; i++)
        numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;

    const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
    const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
    uint numBlockIDs = endBlockID - startBlockID;

    uint splitRecordID = 0;
    uint offset_start = 0;
    uint offset_end = 0;
    uint cur_blocks = 0;
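
    /* find the record containing startBlockID; offset_start/offset_end select the
       sub-range of that record this work group processes first */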
    for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
    {
        const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
        const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
        if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
        {
            const uint preBlocks = startBlockID - blockCounter;
            cur_blocks = min(numBlockIDs, blocks - preBlocks);
            offset_start = preBlocks * blockSize;
            offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
            break;
        }
        blockCounter += blocks;
    }

    if (localID == 0)
        DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));

    local struct BinInfo2 local_binInfo;
    parallel_initBinInfo2(&local_binInfo, 2 * BINS);
    struct BinMapping binMapping = global_record[splitRecordID].binMapping;
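
    /* consume the assigned block range, spilling over into subsequent records until
       all blocks handed to this work group are processed */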
    while (1)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        const uint startID = global_record[splitRecordID].range.start + offset_start;
        const uint endID = global_record[splitRecordID].range.start + offset_end;

        if (localID == 0)
            DBG(printf("taskID %d startID %d endID %d \n", taskID, startID, endID));

        for (uint i = startID + localID; i < endID; i += blockSize)
        {
            const uint index = primref_index0[i];
            primref_index1[i] = index;
            atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]);
        }

        work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync
        atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS);
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        numBlockIDs -= cur_blocks;
        if (numBlockIDs == 0)
            break;

        splitRecordID++;
        parallel_initBinInfo2(&local_binInfo, 2 * BINS);
        binMapping = global_record[splitRecordID].binMapping;

        const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
        const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
        cur_blocks = min(numBlockIDs, blocks);
        offset_start = 0;
        offset_end = min(cur_blocks * blockSize, sizeRecord);

        if (localID == 0)
            DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
    }
}

__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
global_compute_best_split_iteration(global struct Globals *globals,
                                    global char *bvh_mem,
                                    global struct GlobalBuildRecord *global_record)
{
    const uint localID = get_local_id(0);
    const uint blockSize = get_local_size(0);
    const uint taskID = get_group_id(0);
    const uint numTasks = get_num_groups(0);

    const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;

    /* early out */
    if (numGlobalBuildRecords == 0)
        return;

    const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks;
    const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks;
    for (uint i = startRecordID; i < endRecordID; i++)
    {
        struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo,
                                                             global_record[i].binMapping.scale,
                                                             global_record[i].range.start,
                                                             global_record[i].range.end);
        if (localID == 0)
        {
            global_record[i].split = split;
            global_record[i].atomicCountLeft = 0;
            global_record[i].atomicCountRight = 0;
            DBG(printSplit(&global_record[i].split));
        }
    }
}

__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
global_partition_iteration(global struct Globals *globals,
                           global struct AABB *primref,
                           global uint *primref_index,
                           global char *bvh_mem,
                           global struct GlobalBuildRecord *global_record)
{

    const uint localID = get_local_id(0);
    const uint blockSize = get_local_size(0);
    const uint taskID = get_group_id(0);
    const uint numTasks = get_num_groups(0);

    const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;

    /* early out */
    if (numGlobalBuildRecords == 0)
        return;

    /* double buffered primref index array */
    global uint *primref_index0 = primref_index;
    global uint *primref_index1 = primref_index + globals->numPrimitives;

    uint numBlocks = 0;

    /* get total number of blocks, size of block == WG size */
    for (uint i = 0; i < numGlobalBuildRecords; i++)
        numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;

    const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
    const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
    uint numBlockIDs = endBlockID - startBlockID;

    uint splitRecordID = 0;
    uint offset_start = 0;
    uint offset_end = 0;
    uint cur_blocks = 0;
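
    /* same block-to-record mapping as in global_bin_iteration above */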
    for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
    {
        const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
        const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
        if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
        {
            const uint preBlocks = startBlockID - blockCounter;
            cur_blocks = min(numBlockIDs, blocks - preBlocks);
            offset_start = preBlocks * blockSize;
            offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
            break;
        }
        blockCounter += blocks;
    }

    if (localID == 0)
        DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));

    local struct AABB centroidAABB[2];
    local struct AABB geometryAABB[2];
    local uint local_sync;

    while (1)
    {

        const uint startID = global_record[splitRecordID].range.start + offset_start;
        const uint endID = global_record[splitRecordID].range.start + offset_end;

        struct BinMapping binMapping = global_record[splitRecordID].binMapping;
        struct Split split = global_record[splitRecordID].split;

        const uint global_start = global_record[splitRecordID].range.start;
        const uint global_end = global_record[splitRecordID].range.end;

        if (localID == 0)
            DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));

        parallel_partition_segment_index(&local_sync, primref, &binMapping, startID, endID, global_start, global_end, &split, &centroidAABB[0], &centroidAABB[1], &geometryAABB[0], &geometryAABB[1], primref_index0, primref_index1, &global_record[splitRecordID].atomicCountLeft, &global_record[splitRecordID].atomicCountRight);

        /* update global structures */
        if (localID == 0)
        {
            AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, &centroidAABB[0]);
            AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, &centroidAABB[1]);
            AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]);
            AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]);
        }

        numBlockIDs -= cur_blocks;
        if (numBlockIDs == 0)
            break;

        splitRecordID++;

        const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
        const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
        cur_blocks = min(numBlockIDs, blocks);
        offset_start = 0;
        offset_end = min(cur_blocks * blockSize, sizeRecord);
    }
}

inline void printBinaryNode(struct AABB *aabb)
{
    printf("lower %v4f upper %v4f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w));
}

__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel global_finalize_iteration(global struct Globals *globals,
                                      global struct GlobalBuildRecord *global_record,
                                      global char *bvh_mem,
                                      global struct AABB *binary_nodes)
{
    const uint localID = get_local_id(0);
    const uint localSize = get_local_size(0);
    const uint groupID = get_group_id(0);
    const uint numGroups = get_num_groups(0);

    global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);

    for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize)
    {
        const uint buildRecordID = global_record[i].buildRecordID;
        const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w);
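        /* centroidBounds.lower.w doubles as the link from a build record to its node in
           the binary split tree; it is re-linked to the new children at the end of this
           loop body */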
        /* left child buildrecord */
        const uint leftID = buildRecordID;
        records[leftID].start = global_record[i].range.start;
        records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft;
        records[leftID].centroidBounds = global_record[i].leftCentroid;
        /* right child buildrecord */
        const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1);
        records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft;
        records[rightID].end = global_record[i].range.end;
        records[rightID].centroidBounds = global_record[i].rightCentroid;
        /* two binary nodes */
        const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2);
        binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0);
        binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1);
        binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry;
        binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry;
        binary_nodes[binaryChildID + 0].lower.w = as_float(leftID);
        binary_nodes[binaryChildID + 0].upper.w = as_float(-1);
        binary_nodes[binaryChildID + 1].lower.w = as_float(rightID);
        binary_nodes[binaryChildID + 1].upper.w = as_float(-1);
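        /* upper.w == -1 marks a binary node that points at a build record instead of
           two further binary children */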
        records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0);
        records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1);
    }

    sub_group_barrier(CLK_LOCAL_MEM_FENCE);

    if (localID == 0)
    {
        const uint sync = atomic_add(&globals->sync, 1);
        if (sync + 1 == numGroups)
        {
            globals->sync = 0;
            DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords));
            DBG(
                for (uint i = 0; i < globals->numBuildRecords; i++) {
                    printf("i %d \n", i);
                    printBuildRecord(&records[i]);
                } printf("Binary Tree \n");
                for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) {
                    printf("i %d \n", i);
                    printBinaryNode(&binary_nodes[i]);
                }

            );
            globals->numGlobalBuildRecords = 0;
        }
    }
}

__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals,
                                                                                  global struct GlobalBuildRecord *global_record,
                                                                                  global char *bvh_mem,
                                                                                  global struct AABB *binary_nodes)
{
#define MAX_TOP_LEVEL_STACK_DEPTH 32
    struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH];
    global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH];
    struct AABB childrenAABB[BVH_NODE_N6];
    float childrenHalfArea[BVH_NODE_N6];

    /* build records */
    global struct BuildRecord *record = getBuildRecords(bvh_mem, globals);

    struct BVHBase *base = (struct BVHBase *)bvh_mem;
    struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);

    uint stack_index = 1;
    stack[0] = binary_nodes[0];
    stackParentPtrs[0] = (global uchar *)qnode_root;
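
    /* depth-first collapse of the binary split tree into wide QBVH nodes: repeatedly
       open the child with the largest half-area until up to BVH_NODE_N6 children are
       collected, then emit one QBVH node and push all children */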
    while (stack_index != 0)
    {
        stack_index--;

        childrenAABB[0] = stack[stack_index];
        struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index];

        /* buildrecord leaf => set parent pointer and continue */
        DBG(
            printf("stack_index %d \n", stack_index);
            printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w)););

        if (as_uint(childrenAABB[0].upper.w) == -1)
        {
            const uint buildRecordID = as_uint(childrenAABB[0].lower.w);
            DBG(
                printf("leaf buildRecordID %d \n", buildRecordID);
                printBuildRecord(&record[buildRecordID]);)

            record[buildRecordID].current = (global uchar *)qnode;
            continue;
        }
        childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]);

        uint numChildren = 1;
        while (numChildren < BVH_NODE_N6)
        {
            // FIXME

            /*! find best child to split */
            float bestArea = -(float)INFINITY;
            int bestChild = -1;
            for (int i = 0; i < numChildren; i++)
            {
                /* ignore leaves as they cannot get split */
                if (as_uint(childrenAABB[i].upper.w) == -1)
                    continue;

                /* find child with largest surface area */
                if (childrenHalfArea[i] > bestArea)
                {
                    bestChild = i;
                    bestArea = childrenHalfArea[i];
                }
            }
            if (bestChild == -1)
                break;
            const uint leftID = as_uint(childrenAABB[bestChild].lower.w);
            const uint rightID = as_uint(childrenAABB[bestChild].upper.w);
            childrenAABB[bestChild] = binary_nodes[leftID];
            childrenAABB[numChildren] = binary_nodes[rightID];
            childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]);
            childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]);
            numChildren++;
        }

        const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);

        /* update single relative node pointer */
        const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6;
        const uint type = BVH_INTERNAL_NODE;

        setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode);

        DBG(
            printQBVHNodeN(qnode);
            printf("numChildren %d \n", numChildren);
            for (uint i = 0; i < numChildren; i++)
                AABB_print(&childrenAABB[i]););

        /* push all children together with their target node locations; their parent pointers are set when they are popped */
        for (uint ID = 0; ID < numChildren; ID++)
        {
            stack[stack_index] = childrenAABB[ID];
            stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN);
            stack_index++;
        }
    }
}

#endif