1// 2// Copyright (C) 2009-2021 Intel Corporation 3// 4// SPDX-License-Identifier: MIT 5// 6// 7 8#include "common.h" 9#include "instance.h" 10 11#define DBG(x) 12 13#define ENABLE_CHECKS 0 14 15#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1 16 17/* todo: */ 18/* - new cross WG code path for first splits */ 19/* - optimize find best child loop sequence */ 20/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */ 21 22#define DIVIDE_BY_6 1 23 24inline uint getNumPrims(struct BuildRecord *buildRecord) 25{ 26 return buildRecord->end - buildRecord->start; 27} 28 29inline void printBuildRecord(struct BuildRecord *record) 30{ 31 printf("centroidBounds\n"); 32 AABB_print(&record->centroidBounds); 33 printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record)); 34} 35 36inline void printBinInfo2(struct BinInfo2 *record) 37{ 38 printf("boundsX[%d]\n", BINS * 2); 39 for (uint b = 0; b < BINS * 2; b++) 40 { 41 AABB3f_print(&record->boundsX[b]); 42 printf("counts.x = %d\n", record->counts[b].x); 43 } 44 printf("boundsY[%d]\n", BINS * 2); 45 for (uint b = 0; b < BINS * 2; b++) 46 { 47 AABB3f_print(&record->boundsY[b]); 48 printf("counts.y = %d\n", record->counts[b].y); 49 } 50 printf("boundsZ[%d]\n", BINS * 2); 51 for (uint b = 0; b < BINS * 2; b++) 52 { 53 AABB3f_print(&record->boundsZ[b]); 54 printf("counts.z = %d\n", record->counts[b].z); 55 } 56} 57 58inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins) 59{ 60 const float4 eps = 1E-34f; 61 const float4 diag = max(eps, centBounds->upper - centBounds->lower); 62 const float4 scale = (float4)(0.99f * (float)bins) / diag; 63 binMapping->scale = select((float4)(0.0f), scale, diag > eps); 64 binMapping->ofs = centBounds->lower; 65} 66 67inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref) 68{ 69 const float4 centroid2 = primref->lower + primref->upper; 70 AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2); 71} 72 73// ---------------------------------------------------------------------------------------- 74// ---------------------------------------------------------------------------------------- 75// ---------------------------------------------------------------------------------------- 76 77inline void initBinInfo(struct BinInfo *binInfo) 78{ 79 for (uint i = 0; i < BINS; i++) 80 { 81 AABB3f_init(&binInfo->boundsX[i]); 82 AABB3f_init(&binInfo->boundsY[i]); 83 AABB3f_init(&binInfo->boundsZ[i]); 84 binInfo->counts[i] = (uint3)(0); 85 } 86} 87 88inline void subgroup_initBinInfo(struct BinInfo *binInfo) 89{ 90 const uint subgroupLocalID = get_sub_group_local_id(); 91 const uint subgroup_size = get_sub_group_size(); 92 93 for (uint i = subgroupLocalID; i < BINS; i += subgroup_size) 94 { 95 AABB3f_init(&binInfo->boundsX[i]); 96 AABB3f_init(&binInfo->boundsY[i]); 97 AABB3f_init(&binInfo->boundsZ[i]); 98 binInfo->counts[i] = (uint3)(0); 99 } 100} 101 102inline void parallel_initBinInfo(struct BinInfo *binInfo) 103{ 104 const uint localID = get_local_id(0); 105 if (localID < BINS) 106 { 107 AABB3f_init(&binInfo->boundsX[localID]); 108 AABB3f_init(&binInfo->boundsY[localID]); 109 AABB3f_init(&binInfo->boundsZ[localID]); 110 binInfo->counts[localID] = (uint3)(0); 111 } 112} 113 114inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) 115{ 116 const float4 lower = primref->lower; 117 const float4 upper = primref->upper; 118 const float4 p = lower + upper; 119 const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); 120 AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); 121 AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); 122 AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); 123 atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); 124 atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); 125 atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); 126} 127 128inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) 129{ 130 const float4 lower = primref->lower; 131 const float4 upper = primref->upper; 132 const float4 p = lower + upper; 133 const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); 134 AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper); 135 AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper); 136 AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper); 137 atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); 138 atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); 139 atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); 140} 141 142inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref) 143{ 144 const float4 lower = primref->lower; 145 const float4 upper = primref->upper; 146 const float4 p = lower + upper; 147 const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); 148 AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz); 149 AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz); 150 AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz); 151 binInfo->counts[i.x].x++; 152 binInfo->counts[i.y].y++; 153 binInfo->counts[i.z].z++; 154} 155 156// ===================================================================================================================== 157// ===================================================================================================================== 158// ===================================================================================================================== 159 160inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins) 161{ 162 const uint localID = get_local_id(0); 163 if (localID < bins) 164 { 165 AABB3f_init(&binInfo->boundsX[localID]); 166 AABB3f_init(&binInfo->boundsY[localID]); 167 AABB3f_init(&binInfo->boundsZ[localID]); 168 binInfo->counts[localID] = (uint3)(0); 169 } 170} 171 172inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref) 173{ 174 const float4 lower = primref->lower; 175 const float4 upper = primref->upper; 176 const float4 p = lower + upper; 177 const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); 178 AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); 179 AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); 180 AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); 181 atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); 182 atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); 183 atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); 184} 185 186inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins) 187{ 188 const uint localID = get_local_id(0); 189 if (localID < bins) 190 { 191 AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]); 192 AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]); 193 AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]); 194 atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x); 195 atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y); 196 atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z); 197 } 198} 199 200inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren) 201{ 202 const uint subgroupLocalID = get_sub_group_local_id(); 203#if 0 204 /*! find best child to split */ 205 const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY; 206 const float maxArea = sub_group_reduce_max(area); 207 const uint mask = intel_sub_group_ballot(area == maxArea); 208 const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1; 209#else 210 float bestArea = -(float)INFINITY; 211 int bestChild = -1; 212 for (int i = 0; i < numChildren; i++) 213 { 214 /* ignore leaves as they cannot get split */ 215 if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize) 216 continue; 217 218 /* find child with largest surface area */ 219 if (childrenAABB[i].lower.w > bestArea) 220 { 221 bestChild = i; 222 bestArea = childrenAABB[i].lower.w; 223 } 224 } 225#endif 226 return bestChild; 227} 228 229inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref) 230{ 231 const float4 centroid2 = primref->lower + primref->upper; 232 233 if (centroid2.x < buildRecord->centroidBounds.lower.x) 234 return false; 235 if (centroid2.y < buildRecord->centroidBounds.lower.y) 236 return false; 237 if (centroid2.z < buildRecord->centroidBounds.lower.z) 238 return false; 239 240 if (centroid2.x > buildRecord->centroidBounds.upper.x) 241 return false; 242 if (centroid2.y > buildRecord->centroidBounds.upper.y) 243 return false; 244 if (centroid2.z > buildRecord->centroidBounds.upper.z) 245 return false; 246 247 if (primref->lower.x < geometryBounds->lower.x) 248 return false; 249 if (primref->lower.y < geometryBounds->lower.y) 250 return false; 251 if (primref->lower.z < geometryBounds->lower.z) 252 return false; 253 254 if (primref->upper.x > geometryBounds->upper.x) 255 return false; 256 if (primref->upper.y > geometryBounds->upper.y) 257 return false; 258 if (primref->upper.z > geometryBounds->upper.z) 259 return false; 260 261 return true; 262} 263 264/* initialize primref index array */ 265__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) 266__attribute__((intel_reqd_sub_group_size(16))) void kernel 267create_primref_index(global struct Globals *globals, 268 global struct AABB *primref, 269 global unsigned int *primref_index) 270{ 271 const uint local_size = get_local_size(0); 272 const uint taskID = get_group_id(0); 273 const uint numTasks = get_num_groups(0); 274 const uint localID = get_local_id(0); 275 276 const uint startID = (taskID + 0) * globals->numPrimitives / numTasks; 277 const uint endID = (taskID + 1) * globals->numPrimitives / numTasks; 278 for (uint primID = startID + localID; primID < endID; primID += local_size) 279 primref_index[primID] = primID; 280} 281 282// ========================================================================================================== 283// ========================================================================================================== 284// ========================================================================================================== 285 286inline float left_to_right_area16(struct AABB3f *low) 287{ 288 struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); 289 return halfArea_AABB3f(&low_prefix); 290} 291 292inline uint left_to_right_counts16(uint low) 293{ 294 return sub_group_scan_exclusive_add(low); 295} 296 297inline float right_to_left_area16(struct AABB3f *low) 298{ 299 const uint subgroupLocalID = get_sub_group_local_id(); 300 const uint subgroup_size = get_sub_group_size(); 301 const uint ID = subgroup_size - 1 - subgroupLocalID; 302 struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID); 303 struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); 304 const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); 305 return low_area; 306} 307 308inline uint right_to_left_counts16(uint low) 309{ 310 const uint subgroupLocalID = get_sub_group_local_id(); 311 const uint subgroup_size = get_sub_group_size(); 312 const uint ID = subgroup_size - 1 - subgroupLocalID; 313 const uint low_reverse = sub_group_broadcast(low, ID); 314 const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); 315 return sub_group_broadcast(low_prefix, ID); 316} 317 318inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high) 319{ 320 struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); 321 struct AABB3f low_reduce = AABB3f_sub_group_reduce(low); 322 struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high); 323 AABB3f_extend(&high_prefix, &low_reduce); 324 const float low_area = halfArea_AABB3f(&low_prefix); 325 const float high_area = halfArea_AABB3f(&high_prefix); 326 return (float2)(low_area, high_area); 327} 328 329inline uint2 left_to_right_counts32(uint low, uint high) 330{ 331 const uint low_prefix = sub_group_scan_exclusive_add(low); 332 const uint low_reduce = sub_group_reduce_add(low); 333 const uint high_prefix = sub_group_scan_exclusive_add(high); 334 return (uint2)(low_prefix, low_reduce + high_prefix); 335} 336 337inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high) 338{ 339 const uint subgroupLocalID = get_sub_group_local_id(); 340 const uint subgroup_size = get_sub_group_size(); 341 const uint ID = subgroup_size - 1 - subgroupLocalID; 342 struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID); 343 struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID); 344 struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); 345 struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse); 346 struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse); 347 AABB3f_extend(&high_prefix, &low_reduce); 348 const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID); 349 const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); 350 return (float2)(low_area, high_area); 351} 352 353inline uint2 right_to_left_counts32(uint low, uint high) 354{ 355 const uint subgroupLocalID = get_sub_group_local_id(); 356 const uint subgroup_size = get_sub_group_size(); 357 const uint ID = subgroup_size - 1 - subgroupLocalID; 358 const uint low_reverse = sub_group_broadcast(high, ID); 359 const uint high_reverse = sub_group_broadcast(low, ID); 360 const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); 361 const uint low_reduce = sub_group_reduce_add(low_reverse); 362 const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce; 363 return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID)); 364} 365 366inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit) 367{ 368 ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0; 369 ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1; 370 ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2; 371 /* ignore zero sized dimensions */ 372 splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0)); 373 splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0)); 374 splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0)); 375 ulong bestSplit = min(min(splitX, splitY), splitZ); 376 bestSplit = sub_group_reduce_min(bestSplit); 377 return bestSplit; 378} 379 380inline uint fastDivideBy6_uint(uint v) 381{ 382#if 1 383 const ulong u = (ulong)v >> 1; 384 return (uint)((u * 0x55555556ul) >> 32); 385#else 386 return v / 6; 387#endif 388} 389 390inline uint3 fastDivideBy6_uint3(uint3 v) 391{ 392 return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z)); 393} 394 395inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID) 396{ 397 const uint subgroupLocalID = get_sub_group_local_id(); 398 const uint subgroup_size = get_sub_group_size(); 399 400 struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID]; 401 402 const float lr_areaX = left_to_right_area16(&boundsX); 403 const float rl_areaX = right_to_left_area16(&boundsX); 404 405 struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID]; 406 407 const float lr_areaY = left_to_right_area16(&boundsY); 408 const float rl_areaY = right_to_left_area16(&boundsY); 409 410 struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID]; 411 412 const float lr_areaZ = left_to_right_area16(&boundsZ); 413 const float rl_areaZ = right_to_left_area16(&boundsZ); 414 415 const uint3 counts = binInfo->counts[subgroupLocalID]; 416 417 const uint lr_countsX = left_to_right_counts16(counts.x); 418 const uint rl_countsX = right_to_left_counts16(counts.x); 419 const uint lr_countsY = left_to_right_counts16(counts.y); 420 const uint rl_countsY = right_to_left_counts16(counts.y); 421 const uint lr_countsZ = left_to_right_counts16(counts.z); 422 const uint rl_countsZ = right_to_left_counts16(counts.z); 423 424 const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); 425 const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); 426 427#if DIVIDE_BY_6 == 0 428 const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; 429 uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); 430 const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift; 431 const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift; 432#else 433 const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1); 434 const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1); 435#endif 436 float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count)); 437 438 /* first bin is invalid */ 439 440 sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0); 441 sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0); 442 sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0); 443 444 const uint mid = (startID + endID) / 2; 445 const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; 446 447 const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit); 448 449 struct Split split; 450 split.sah = as_float((uint)(bestSplit >> 32)); 451 split.dim = (uint)bestSplit & 3; 452 split.pos = (uint)bestSplit >> 2; 453 454 return split; 455} 456 457inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID) 458{ 459 const uint subgroupLocalID = get_sub_group_local_id(); 460 const uint subgroup_size = get_sub_group_size(); 461 462 struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID]; 463 struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size]; 464 465 const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high); 466 const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high); 467 468 struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID]; 469 struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size]; 470 471 const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high); 472 const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high); 473 474 struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID]; 475 struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size]; 476 477 const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high); 478 const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high); 479 480 const uint3 counts_low = binInfo->counts[subgroupLocalID]; 481 const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size]; 482 483 const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x); 484 const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x); 485 const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y); 486 const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y); 487 const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z); 488 const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z); 489 490 const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; 491 uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); 492 493 /* low part: bins 0..15 */ 494 const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x); 495 const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x); 496 497#if DIVIDE_BY_6 == 0 498 const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift; 499 const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift; 500 501#else 502 //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; 503 //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; 504 505 /* skip blocks for breadth-first phase */ 506 const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x)); 507 const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x)); 508 509#endif 510 511 float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low)); 512 513 /* first bin is invalid */ 514 // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x; 515 // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y; 516 // sah_low.z = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.z; 517 518 sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0); 519 sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0); 520 sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0); 521 522 /* high part: bins 16..31 */ 523 524 const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y); 525 const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y); 526#if DIVIDE_BY_6 == 0 527 const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift; 528 const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift; 529#else 530 //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; 531 //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; 532 533 /* skip blocks for breadth-first phase */ 534 const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y)); 535 const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y)); 536 537#endif 538 const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high)); 539 540 const uint mid = (startID + endID) / 2; 541 const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; 542 543 const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit); 544 const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit); 545 const ulong bestSplit = min(bestSplit_low, bestSplit_high); 546 547 struct Split split; 548 split.sah = as_float((uint)(bestSplit >> 32)); 549 split.dim = (uint)bestSplit & 3; 550 split.pos = (uint)bestSplit >> 2; 551 552 return split; 553} 554 555// ===================================================================== 556 557inline float leafSAH(float geometryArea, uint prims, uint block_shift) 558{ 559 return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift); 560} 561 562inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref) 563{ 564 const uint dim = split->dim; 565 const float lower = primref->lower[dim]; 566 const float upper = primref->upper[dim]; 567 const float c = lower + upper; 568 const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]); 569 return pos < split->pos; 570} 571 572inline void serial_find_split(global struct AABB *primref, 573 struct BinMapping *binMapping, 574 struct BuildRecord *buildRecord, 575 local struct Split *split, 576 local struct BinInfo *binInfo, 577 global uint *primref_index0, 578 global uint *primref_index1) 579{ 580 const uint subgroupLocalID = get_sub_group_local_id(); 581 const uint subgroup_size = get_sub_group_size(); 582 583 const uint startID = buildRecord->start; 584 const uint endID = buildRecord->end; 585 586 subgroup_initBinInfo(binInfo); 587 588 for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size) 589 { 590 const uint index = primref_index0[t]; 591 primref_index1[t] = index; 592 atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]); 593 } 594} 595 596inline void serial_partition_index(global struct AABB *primref, 597 struct BinMapping *binMapping, 598 struct BuildRecord *buildRecord, 599 struct Split *inSplit, 600 struct BuildRecord *outLeft, 601 struct BuildRecord *outRight, 602 struct AABB *outGeometryBoundsLeft, 603 struct AABB *outGeometryBoundsRight, 604 global uint *primref_index0, 605 global uint *primref_index1) 606{ 607 const uint localID = get_local_id(0); 608 const uint subgroupLocalID = get_sub_group_local_id(); 609 const uint subgroupID = get_sub_group_id(); 610 const uint subgroup_size = get_sub_group_size(); 611 612 const uint begin = buildRecord->start; 613 const uint end = buildRecord->end; 614 struct Split split = *inSplit; 615 616 struct BuildRecord left; 617 struct BuildRecord right; 618 initBuildRecord(&left, begin, end); 619 initBuildRecord(&right, begin, end); 620 621 struct AABB leftAABB; 622 struct AABB rightAABB; 623 AABB_init(&leftAABB); 624 AABB_init(&rightAABB); 625 626 global uint *l = primref_index0 + begin; 627 global uint *r = primref_index0 + end; 628 629 /* no valid split, just split in the middle */ 630 if (split.sah == (float)(INFINITY)) 631 { 632 for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) 633 { 634 const uint index = primref_index1[i]; 635 const uint count = sub_group_reduce_add(1); 636 extendBuildRecord(&left, &primref[index]); 637 AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); 638 l[subgroupLocalID] = index; 639 l += count; 640 } 641 642 for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) 643 { 644 const uint index = primref_index1[i]; 645 const uint count = sub_group_reduce_add(1); 646 extendBuildRecord(&right, &primref[index]); 647 AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); 648 r -= count; 649 r[subgroupLocalID] = index; 650 } 651 } 652 else 653 { 654 for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size) 655 { 656 const uint index = primref_index1[i]; 657 const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; 658 const uint isRight = 1 - isLeft; 659 const uint countLeft = sub_group_reduce_add(isLeft); 660 const uint countRight = sub_group_reduce_add(isRight); 661 const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); 662 const uint prefixRight = sub_group_scan_exclusive_add(isRight); 663 664 r -= countRight; 665 666 if (isLeft) 667 { 668 extendBuildRecord(&left, &primref[index]); 669 AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); 670 l[prefixLeft] = index; 671 } 672 else 673 { 674 extendBuildRecord(&right, &primref[index]); 675 AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); 676 r[prefixRight] = index; 677 } 678 l += countLeft; 679 } 680 } 681 682 left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); 683 right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); 684 leftAABB = AABB_sub_group_reduce(&leftAABB); 685 rightAABB = AABB_sub_group_reduce(&rightAABB); 686 687 if (subgroupLocalID == 0) 688 { 689 uint pos = l - primref_index0; // single first thread needs to compute "pos" 690 left.end = pos; 691 right.start = pos; 692 693 leftAABB.lower.w = AABB_halfArea(&leftAABB); 694 rightAABB.lower.w = AABB_halfArea(&rightAABB); 695 696 leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left)); 697 rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right)); 698 699 *outLeft = left; 700 *outRight = right; 701 *outGeometryBoundsLeft = leftAABB; 702 *outGeometryBoundsRight = rightAABB; 703 } 704 705 work_group_barrier(CLK_LOCAL_MEM_FENCE); 706 707#if ENABLE_CHECKS == 1 708 if (subgroupLocalID == 0) 709 { 710 if (AABB_verify(outLeft)) 711 { 712 printf("outLeft:\n"); 713 printBuildRecord(outLeft); 714 } 715 if (AABB_verify(outRight)) 716 { 717 printf("outRight:\n"); 718 printBuildRecord(outRight); 719 } 720 if (AABB_verify(outGeometryBoundsLeft)) 721 { 722 printf("outGeometryBoundsLeft:\n"); 723 AABB_print(outGeometryBoundsLeft); 724 } 725 if (AABB_verify(outGeometryBoundsRight)) 726 { 727 printf("outGeometryBoundsRight:\n"); 728 AABB_print(outGeometryBoundsRight); 729 } 730 731 for (uint i = outLeft->start; i < outLeft->end; i++) 732 { 733 const uint index = primref_index0[i]; 734 if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index])) 735 printf("check left %d \n", i); 736 if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) 737 printf("check prim ref bounds left %d \n", i); 738 } 739 for (uint i = outRight->start; i < outRight->end; i++) 740 { 741 const uint index = primref_index0[i]; 742 if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index])) 743 printf("check right %d \n", i); 744 if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) 745 printf("check prim ref bounds right %d \n", i); 746 } 747 } 748#endif 749} 750 751inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator, 752 const uint start, 753 const uint end, 754 global struct AABB *primref, 755 uint primID, 756 global char *bvh_mem, 757 unsigned leafSize) 758{ 759 const uint subgroupLocalID = get_sub_group_local_id(); 760 const uint subgroup_size = get_sub_group_size(); 761 const uint items = end - start; 762 763#if ENABLE_CHECKS == 1 764 if (items > BVH_LEAF_N_MAX) 765 printf("error items %d \n", items); 766#endif 767 768 // JDB TODO: Why was this code commented out?? 769 //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0; 770 //offset = sub_group_broadcast(offset,0); 771 772 //uint offset = globals->leaf_mem_allocator_start + start * leafSize; 773 uint offset = allocator->start + start * leafSize; 774 return offset; 775} 776 777inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode) 778{ 779 size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN); 780 uint offset_u = (uint)offset; 781#if ENABLE_CHECKS 782 if ((size_t)((offset_u << 6) >> 6) != offset) 783 { 784 printf("get_qnode_index_for_backptr - index out of reach"); 785 } 786#endif 787 return offset_u; 788} 789 790struct SerialBuildRecurseTemplateConst 791{ 792 unsigned leafSize; 793 unsigned leafType; 794 bool allocateBackpointers; 795}; 796 797// ==================================================================================== 798// ==================================================================================== 799// ==================================================================================== 800// ==================================================================================== 801// ==================================================================================== 802 803inline void parallel_find_split(global struct AABB *primref, 804 local struct BuildRecord *buildRecord, 805 local struct Split *bestSplit, 806 local struct BinInfo *binInfo, 807 global uint *primref_index0, 808 global uint *primref_index1) 809{ 810 const uint localID = get_local_id(0); 811 const uint local_size = get_local_size(0); 812 const uint subgroupID = get_sub_group_id(); 813 814 const uint startID = buildRecord->start; 815 const uint endID = buildRecord->end; 816 817 struct BinMapping binMapping; 818 initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS); 819 820 /* init bininfo */ 821 parallel_initBinInfo(binInfo); 822 823 work_group_barrier(CLK_LOCAL_MEM_FENCE); 824 825 for (uint t = startID + localID; t < endID; t += local_size) 826 { 827 const uint index = primref_index0[t]; 828 primref_index1[t] = index; 829 atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]); 830 } 831 832 work_group_barrier(CLK_LOCAL_MEM_FENCE); 833 834 /* find best dimension */ 835 836 if (subgroupID == 0) 837 { 838 *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID); 839 } 840 841 work_group_barrier(CLK_LOCAL_MEM_FENCE); 842} 843 844inline void parallel_find_split32(local uint *local_sync, 845 global struct AABB *primref, 846 local struct BuildRecord *buildRecord, 847 local struct Split *bestSplit, 848 local struct BinInfo2 *binInfo2, 849 global uint *primref_index0, 850 global uint *primref_index1) 851{ 852 853 const uint localID = get_local_id(0); 854 const uint local_size = get_local_size(0); 855 const uint subgroupID = get_sub_group_id(); 856 const uint numSubGroups = get_num_sub_groups(); 857 const uint subgroupLocalID = get_sub_group_local_id(); 858 859 const uint startID = buildRecord->start; 860 const uint endID = buildRecord->end; 861 862 struct BinMapping binMapping; 863 initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS); 864 865 /* init bininfo */ 866 parallel_initBinInfo2(binInfo2, 2 * BINS); 867 868 if (localID == 0) 869 *local_sync = 0; 870 871 work_group_barrier(CLK_LOCAL_MEM_FENCE); 872 873 for (uint t = startID + localID; t < endID; t += local_size) 874 { 875 const uint index = primref_index0[t]; 876 primref_index1[t] = index; 877 atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]); 878 } 879 880 /* find best split position using the last subgroup */ 881 sub_group_barrier(CLK_LOCAL_MEM_FENCE); 882 uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0; 883 syncID = sub_group_broadcast(syncID, 0); 884 885 if (syncID + 1 == numSubGroups) 886 { 887 *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID); 888 DBG(if (localID == 0) printSplit(bestSplit)); 889 } 890 891 work_group_barrier(CLK_LOCAL_MEM_FENCE); 892} 893 894inline void parallel_partition_index(local uint *local_sync, 895 global struct AABB *primref, 896 struct BinMapping *binMapping, 897 const uint begin, 898 const uint end, 899 struct Split *inSplit, 900 local struct BuildRecord *outLeft, 901 local struct BuildRecord *outRight, 902 local struct AABB *outGeometryBoundsLeft, 903 local struct AABB *outGeometryBoundsRight, 904 global uint *primref_index0, 905 global uint *primref_index1, 906 uint *atomicCountLeft, 907 uint *atomicCountRight) 908{ 909 const uint localID = get_local_id(0); 910 const uint local_size = get_local_size(0); 911 const uint subgroupID = get_sub_group_id(); 912 const uint numSubGroups = get_num_sub_groups(); 913 const uint subgroup_size = get_sub_group_size(); 914 const uint subgroupLocalID = get_sub_group_local_id(); 915 916 const uint size = end - begin; 917 struct Split split = *inSplit; 918 919 /* init bin bounds */ 920 if (localID == 0) 921 { 922 initBuildRecord(outLeft, begin, end); 923 initBuildRecord(outRight, begin, end); 924 AABB_init(outGeometryBoundsLeft); 925 AABB_init(outGeometryBoundsRight); 926 *atomicCountLeft = 0; 927 *atomicCountRight = 0; 928 *local_sync = 0; 929 } 930 931 work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ? 932 933 struct BuildRecord left; 934 struct BuildRecord right; 935 initBuildRecord(&left, begin, end); 936 initBuildRecord(&right, begin, end); 937 938 struct AABB leftAABB; 939 struct AABB rightAABB; 940 AABB_init(&leftAABB); 941 AABB_init(&rightAABB); 942 943 if (split.sah == (float)(INFINITY)) 944 { 945 if (subgroupID == 0) 946 { 947 for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) 948 { 949 const uint index = primref_index1[i]; 950 extendBuildRecord(&left, &primref[index]); 951 AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); 952 primref_index0[i] = index; 953 } 954 955 for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) 956 { 957 const uint index = primref_index1[i]; 958 extendBuildRecord(&right, &primref[index]); 959 AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); 960 primref_index0[i] = index; 961 } 962 963 left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); 964 right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); 965 leftAABB = AABB_sub_group_reduce(&leftAABB); 966 rightAABB = AABB_sub_group_reduce(&rightAABB); 967 968 if (localID == 0) 969 { 970 outLeft->centroidBounds = left.centroidBounds; 971 outRight->centroidBounds = right.centroidBounds; 972 973 *outGeometryBoundsLeft = leftAABB; 974 *outGeometryBoundsRight = rightAABB; 975 976 outLeft->end = split.pos; 977 outRight->start = split.pos; 978 979 outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); 980 outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); 981 outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); 982 outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); 983 } 984 } 985 } 986 else 987 { 988 989 const int startID = begin + ((subgroupID + 0) * size / numSubGroups); 990 const int endID = begin + ((subgroupID + 1) * size / numSubGroups); 991 992 for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) 993 { 994 const uint index = primref_index1[i]; 995 const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; 996 const uint isRight = 1 - isLeft; 997 const uint countLeft = sub_group_reduce_add(isLeft); 998 const uint countRight = sub_group_reduce_add(isRight); 999 const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); 1000 const uint prefixRight = sub_group_scan_exclusive_add(isRight); 1001 1002 uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; 1003 offsetLeft = sub_group_broadcast(offsetLeft, 0); 1004 uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0; 1005 offsetRight = sub_group_broadcast(offsetRight, 0); 1006 1007 if (isLeft) 1008 { 1009 extendBuildRecord(&left, &primref[index]); 1010 AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); 1011 primref_index0[begin + offsetLeft + prefixLeft] = index; 1012 } 1013 else 1014 { 1015 extendBuildRecord(&right, &primref[index]); 1016 AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); 1017 primref_index0[end - (offsetRight + countRight) + prefixRight] = index; 1018 } 1019 } 1020 left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); 1021 right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); 1022 leftAABB = AABB_sub_group_reduce(&leftAABB); 1023 rightAABB = AABB_sub_group_reduce(&rightAABB); 1024 1025 AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper); 1026 AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper); 1027 1028 AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); 1029 AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); 1030 1031 sub_group_barrier(CLK_LOCAL_MEM_FENCE); 1032 1033 if (subgroupLocalID == 0) 1034 { 1035 const uint sync = atomic_add(local_sync, 1); 1036 if (sync + 1 == numSubGroups) 1037 { 1038 uint pos = begin + *atomicCountLeft; // single thread of last subgroup needs to compute "pos" 1039 outLeft->end = pos; 1040 outRight->start = pos; 1041 1042 outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); 1043 outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); 1044 outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); 1045 outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); 1046 } 1047 } 1048 } 1049 1050 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1051 1052#if ENABLE_CHECKS == 1 1053 if (localID == 0) 1054 { 1055 if (outLeft->end <= begin) 1056 printf("pos begin error\n"); 1057 if (outLeft->end > end) 1058 printf("pos end error\n"); 1059 1060 for (uint i = outLeft->start; i < outLeft->end; i++) 1061 { 1062 const uint index = primref_index0[i]; 1063 //printf("left %d -> %d \n",i,index); 1064 if (!is_left(binMapping, inSplit, &primref[index])) 1065 printf("check left %d \n", i); 1066 if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) 1067 printf("check prim ref bounds left %d \n", i); 1068 } 1069 for (uint i = outRight->start; i < outRight->end; i++) 1070 { 1071 const uint index = primref_index0[i]; 1072 //printf("right %d -> %d \n",i,index); 1073 if (is_left(binMapping, inSplit, &primref[index])) 1074 printf("check right %d \n", i); 1075 if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) 1076 printf("check prim ref bounds right %d \n", i); 1077 } 1078 } 1079#endif 1080} 1081 1082 1083#define ENABLE_LOOP_BREADTH_FIRST 0 1084#if ENABLE_LOOP_BREADTH_FIRST 1085// TBD It might be that layout of this impact perf. 1086struct BreadthFirstLoopLocals 1087{ 1088 struct BuildRecord local_current; 1089#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 1090 struct BinInfo binInfo; 1091#else 1092 struct BinInfo2 binInfo; 1093#endif 1094 struct Split split; 1095 struct BuildRecord children[BVH_NODE_N + 1]; 1096 struct AABB childrenAABB[BVH_NODE_N + 1]; 1097 uint atomicCountLeft; 1098 uint atomicCountRight; 1099 uint local_sync; 1100 uint recordID; 1101 uint buildRecordIDs[BUILDRECORD_STACK_SIZE]; 1102 uint numBuildRecordIDs; 1103 bool exit; 1104}; 1105 1106 1107inline void parallel_build_breadth_first_loopT(global struct Globals *globals, 1108 global struct AABB *primref, 1109 global uint *primref_index, 1110 global char *bvh_mem, 1111 uint subtreeThreshold, 1112 local struct BreadthFirstLoopLocals *L, 1113 struct BreadthFirstTemplateConst T) 1114{ 1115 const uint global_size = get_global_size(0); 1116 const uint local_size = get_local_size(0); 1117 const uint localID = get_local_id(0); 1118 const uint taskID = get_group_id(0); 1119 const uint numTasks = get_num_groups(0); 1120 1121 const uint subgroupID = get_sub_group_id(); 1122 const uint subgroupLocalID = get_sub_group_local_id(); 1123 1124 /* double buffered primref index array */ 1125 global uint *primref_index0 = primref_index; 1126 global uint *primref_index1 = primref_index + globals->numPrimitives; 1127 1128 global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); 1129 1130#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 1131 const uint bins = BINS; 1132#else 1133 const uint bins = 2 * BINS; 1134#endif 1135 1136 if (localID == 0) 1137 { 1138 L->numBuildRecordIDs = 0; 1139 L->exit = false; 1140 } 1141 1142 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1143 1144 while (1) 1145 { 1146 if (localID == 0) 1147 { 1148 if (L->numBuildRecordIDs == 0) 1149 { 1150 L->recordID = generic_atomic_add(&globals->counter, 1); 1151 if (L->recordID >= globals->numBuildRecords) 1152 L->exit = true; 1153 } 1154 else 1155 { 1156 L->numBuildRecordIDs--; 1157 L->recordID = L->buildRecordIDs[L->numBuildRecordIDs]; 1158 } 1159 L->local_current = records[L->recordID]; 1160 } 1161 1162 work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); 1163 1164 /* no more buildrecords available ? */ 1165 1166 if (L->exit) 1167 break; 1168 1169 local struct BuildRecord *current = &L->local_current; 1170 const uint items = getNumPrims(current); 1171 const uint depth = getBuildRecursionDepth(current); 1172 1173 global unsigned int *num_records_output = &globals->numBuildRecords_extended; 1174 1175 struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current; 1176 1177 /* ignore small buildrecords */ 1178 if (items < max(subtreeThreshold, cfg_minLeafSize)) 1179 { 1180 // do nothing 1181 } 1182 else 1183 { 1184 /*! find best split */ 1185#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 1186 parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); 1187#else 1188 parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); 1189#endif 1190 uint numChildren = 2; 1191 1192 /*! find best split */ 1193 struct BinMapping binMapping; 1194 initBinMapping(&binMapping, ¤t->centroidBounds, bins); 1195 1196 parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); 1197 1198 while (numChildren < BVH_NODE_N6) 1199 { 1200 /*! find best child to split */ 1201 const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren); 1202 if (bestChild == -1) 1203 break; 1204 1205 /* perform best found split */ 1206 local struct BuildRecord *brecord = &L->children[bestChild]; 1207 local struct BuildRecord *lrecord = &L->children[numChildren + 0]; 1208 local struct BuildRecord *rrecord = &L->children[numChildren + 1]; 1209 1210#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 1211 parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); 1212#else 1213 parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); 1214#endif 1215 1216 initBinMapping(&binMapping, &brecord->centroidBounds, bins); 1217 1218 parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); 1219 1220 *brecord = *rrecord; 1221 L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1]; 1222 1223 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1224 1225 numChildren++; 1226 } 1227 1228 //sub_group_barrier(CLK_LOCAL_MEM_FENCE); 1229 1230 if (localID <= 16 && subgroupID == 0) 1231 { 1232 global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem; 1233 global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base); 1234 global uint *back_pointers = BVHBase_backPointers(bvh_base); 1235 uint qnode_index = 0; 1236 if (T.allocateBackpointers) 1237 { 1238 /* index of internal node, the domain of backpointers map*/ 1239 qnode_index = get_qnode_index_for_backptr(nodes_start, qnode); 1240 // the backpointer is already set, but we need to add/encode the num of children 1241 // todo don't like the need of data read (we should just add), maybe should pass grandpa pointer in record..., or use atomic... 1242 back_pointers[qnode_index] += (numChildren << 3); 1243 } 1244 1245 /* sort children based on rnage size */ 1246 const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren); 1247 //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1); 1248 const uint IDs = numPrimsIDs & 7; 1249 const uint pushIDs = convertToPushIndices8(IDs); 1250 1251 /* alloc #numChildren nodes at once */ 1252 const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); 1253 1254 /* update single relative node pointer and type */ 1255 const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6; 1256 const uint type = BVH_INTERNAL_NODE; 1257 1258 /* set parent pointer in child build records */ 1259 if (subgroupLocalID < numChildren) 1260 { 1261 setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1); 1262 global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN); 1263 L->children[subgroupLocalID].current = child_data_ptr; 1264 if (T.allocateBackpointers) 1265 { 1266 uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr); 1267 back_pointers[child_index] = qnode_index << 6; 1268 } 1269 } 1270 1271 /* write out qbvh node */ 1272 subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode); 1273 1274 /* write out child buildrecords to memory */ 1275 1276 uint global_records_offset = (subgroupLocalID == 0) ? atomic_add(num_records_output, numChildren - 1) : 0; 1277 global_records_offset = sub_group_broadcast(global_records_offset, 0); 1278 1279 if (localID == 0) 1280 { 1281 records[L->recordID] = L->children[0]; 1282 L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID; 1283 for (uint i = 1; i < numChildren; i++) 1284 { 1285 const uint ID = globals->numBuildRecords + global_records_offset + i - 1; 1286 records[ID] = L->children[i]; 1287 L->buildRecordIDs[L->numBuildRecordIDs++] = ID; 1288 } 1289 } 1290 } 1291 } 1292 work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); 1293 } 1294 1295 /* last active HW thread ? */ 1296 if (localID == 0) 1297 { 1298 const uint sync = atomic_add(&globals->sync, 1); 1299 if (sync + 1 == numTasks) 1300 { 1301 globals->sync = 0; 1302 /* set final number of buildrecords */ 1303 globals->numBuildRecords += globals->numBuildRecords_extended; 1304 globals->numBuildRecords_extended = 0; 1305 globals->counter = 0; 1306 } 1307 } 1308} 1309 1310__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) 1311__attribute__((intel_reqd_sub_group_size(16))) void kernel 1312parallel_build_breadth_first_loop(global struct Globals *globals, 1313 global struct AABB *primref, 1314 global uint *primref_index, 1315 global char *bvh_mem, 1316 uint subtreeThreshold) 1317{ 1318 local struct BreadthFirstLoopLocals L; 1319 static const struct BreadthFirstTemplateConst T = { 1320 false // bool allocateBackpointers; 1321 }; 1322 1323 parallel_build_breadth_first_loopT(globals, 1324 primref, 1325 primref_index, 1326 bvh_mem, 1327 subtreeThreshold, 1328 &L, 1329 T); 1330} 1331 1332__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) 1333__attribute__((intel_reqd_sub_group_size(16))) void kernel 1334parallel_build_breadth_first_loop_backpointers(global struct Globals *globals, 1335 global struct AABB *primref, 1336 global uint *primref_index, 1337 global char *bvh_mem, 1338 uint subtreeThreshold) 1339{ 1340 local struct BreadthFirstLoopLocals L; 1341 static const struct BreadthFirstTemplateConst T = { 1342 true // bool allocateBackpointers; 1343 }; 1344 1345 parallel_build_breadth_first_loopT(globals, 1346 primref, 1347 primref_index, 1348 bvh_mem, 1349 subtreeThreshold, 1350 &L, 1351 T); 1352} 1353// =================================================== 1354// =============== experimental code ================= 1355// =================================================== 1356#endif 1357 1358#define ENABLE_GLOBAL_SPLIT 0 1359#if ENABLE_GLOBAL_SPLIT 1360inline void parallel_partition_segment_index(local uint *local_sync, 1361 global struct AABB *primref, 1362 struct BinMapping *binMapping, 1363 const uint begin, 1364 const uint end, 1365 const uint global_begin, 1366 const uint global_end, 1367 struct Split *inSplit, 1368 local struct AABB *outLeft, 1369 local struct AABB *outRight, 1370 local struct AABB *outGeometryBoundsLeft, 1371 local struct AABB *outGeometryBoundsRight, 1372 global uint *primref_index0, 1373 global uint *primref_index1, 1374 uint *atomicCountLeft, 1375 uint *atomicCountRight) 1376{ 1377 const uint localID = get_local_id(0); 1378 const uint local_size = get_local_size(0); 1379 const uint subgroupID = get_sub_group_id(); 1380 const uint numSubGroups = get_num_sub_groups(); 1381 const uint subgroup_size = get_sub_group_size(); 1382 const uint subgroupLocalID = get_sub_group_local_id(); 1383 1384 const uint size = end - begin; 1385 struct Split split = *inSplit; 1386 1387 /* init bin bounds */ 1388 if (localID == 0) 1389 { 1390 AABB_init(outLeft); 1391 AABB_init(outRight); 1392 AABB_init(outGeometryBoundsLeft); 1393 AABB_init(outGeometryBoundsRight); 1394 *local_sync = 0; 1395 } 1396 1397 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1398 1399 struct AABB left; 1400 struct AABB right; 1401 AABB_init(&left); 1402 AABB_init(&right); 1403 1404 struct AABB leftAABB; 1405 struct AABB rightAABB; 1406 AABB_init(&leftAABB); 1407 AABB_init(&rightAABB); 1408 1409 const int startID = begin + ((subgroupID + 0) * size / numSubGroups); 1410 const int endID = begin + ((subgroupID + 1) * size / numSubGroups); 1411 1412 for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) 1413 { 1414 const uint index = primref_index1[i]; 1415 const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; 1416 const uint isRight = 1 - isLeft; 1417 const uint countLeft = sub_group_reduce_add(isLeft); 1418 const uint countRight = sub_group_reduce_add(isRight); 1419 const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); 1420 const uint prefixRight = sub_group_scan_exclusive_add(isRight); 1421 1422 uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; 1423 offsetLeft = sub_group_broadcast(offsetLeft, 0); 1424 uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0; 1425 offsetRight = sub_group_broadcast(offsetRight, 0); 1426 1427 if (isLeft) 1428 { 1429 AABB_extend_point(&left, AABB_centroid2(&primref[index])); 1430 AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); 1431 primref_index0[global_begin + offsetLeft + prefixLeft] = index; 1432 } 1433 else 1434 { 1435 AABB_extend_point(&right, AABB_centroid2(&primref[index])); 1436 AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); 1437 primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index; 1438 } 1439 } 1440 left = AABB_sub_group_reduce(&left); 1441 right = AABB_sub_group_reduce(&right); 1442 leftAABB = AABB_sub_group_reduce(&leftAABB); 1443 rightAABB = AABB_sub_group_reduce(&rightAABB); 1444 1445 AABB_local_atomic_merge(outLeft, left.lower, left.upper); 1446 AABB_local_atomic_merge(outRight, right.lower, right.upper); 1447 1448 AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); 1449 AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); 1450 1451 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1452} 1453 1454__attribute__((reqd_work_group_size(BINS * 2, 1, 1))) 1455__attribute__((intel_reqd_sub_group_size(16))) 1456void kernel global_init_split_iteration(global struct Globals *globals, 1457 global struct GlobalBuildRecord *global_record, 1458 global char *bvh_mem, 1459 const uint subTreeThreshold) 1460{ 1461 const uint localID = get_local_id(0); 1462 const uint taskID = get_group_id(0); 1463 const uint numTasks = get_num_groups(0); 1464 1465 global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); 1466 1467 /* for each build record with size > subTreeThreshold initialize a global build record */ 1468 1469 const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks; 1470 const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks; 1471 1472 for (uint i = startID; i < endID; i++) 1473 { 1474 global struct BuildRecord *buildRecord = &records[i]; 1475 DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start)); 1476 1477 if ((buildRecord->end - buildRecord->start) > subTreeThreshold) 1478 { 1479 uint ID = localID == 0 ? generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0; 1480 1481 ID = work_group_broadcast(ID, 0); 1482 global struct BinInfo2 *binInfo = &global_record[ID].binInfo; 1483 global struct BinMapping *binMapping = &global_record[ID].binMapping; 1484 initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS); 1485 parallel_initBinInfo2(binInfo, 2 * BINS); 1486 if (localID == 0) 1487 { 1488 global_record[ID].range.start = buildRecord->start; 1489 global_record[ID].range.end = buildRecord->end; 1490 global_record[ID].atomicCountLeft = 0; 1491 global_record[ID].atomicCountRight = 0; 1492 global_record[ID].buildRecordID = i; 1493 AABB_init(&global_record[ID].leftCentroid); 1494 AABB_init(&global_record[ID].rightCentroid); 1495 AABB_init(&global_record[ID].leftGeometry); 1496 AABB_init(&global_record[ID].rightGeometry); 1497 } 1498 } 1499 } 1500 DBG( 1501 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1502 if (localID == 0) 1503 printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords);); 1504} 1505 1506__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) 1507__attribute__((intel_reqd_sub_group_size(16))) 1508void kernel global_bin_iteration(global struct Globals *globals, 1509 global struct AABB *primref, 1510 global uint *primref_index, 1511 global char *bvh_mem, 1512 global struct GlobalBuildRecord *global_record) 1513{ 1514 const uint localID = get_local_id(0); 1515 const uint blockSize = get_local_size(0); 1516 const uint taskID = get_group_id(0); 1517 const uint numTasks = get_num_groups(0); 1518 1519 const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; 1520 1521 /* early out */ 1522 if (numGlobalBuildRecords == 0) 1523 return; 1524 1525 /* double buffered primref index array */ 1526 global uint *primref_index0 = primref_index; 1527 global uint *primref_index1 = primref_index + globals->numPrimitives; 1528 1529 uint numBlocks = 0; 1530 1531 /* get total number of blocks, size of block == WG size */ 1532 for (uint i = 0; i < numGlobalBuildRecords; i++) 1533 numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; 1534 1535 const uint startBlockID = (taskID + 0) * numBlocks / numTasks; 1536 const uint endBlockID = (taskID + 1) * numBlocks / numTasks; 1537 uint numBlockIDs = endBlockID - startBlockID; 1538 1539 uint splitRecordID = 0; 1540 uint offset_start = 0; 1541 uint offset_end = 0; 1542 uint cur_blocks = 0; 1543 1544 for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) 1545 { 1546 const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; 1547 const uint blocks = (sizeRecord + blockSize - 1) / blockSize; 1548 if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) 1549 { 1550 const uint preBlocks = startBlockID - blockCounter; 1551 cur_blocks = min(numBlockIDs, blocks - preBlocks); 1552 offset_start = preBlocks * blockSize; 1553 offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); 1554 break; 1555 } 1556 blockCounter += blocks; 1557 } 1558 1559 if (localID == 0) 1560 DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); 1561 1562 local struct BinInfo2 local_binInfo; 1563 parallel_initBinInfo2(&local_binInfo, 2 * BINS); 1564 struct BinMapping binMapping = global_record[splitRecordID].binMapping; 1565 1566 while (1) 1567 { 1568 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1569 1570 const uint startID = global_record[splitRecordID].range.start + offset_start; 1571 const uint endID = global_record[splitRecordID].range.start + offset_end; 1572 1573 if (localID == 0) 1574 DBG(printf("taskID %d startID %d endID %d \n", taskID, startID, endID)); 1575 1576 for (uint i = startID + localID; i < endID; i += blockSize) 1577 { 1578 const uint index = primref_index0[i]; 1579 primref_index1[i] = index; 1580 atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]); 1581 } 1582 1583 work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync 1584 atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS); 1585 work_group_barrier(CLK_LOCAL_MEM_FENCE); 1586 1587 numBlockIDs -= cur_blocks; 1588 if (numBlockIDs == 0) 1589 break; 1590 1591 splitRecordID++; 1592 parallel_initBinInfo2(&local_binInfo, 2 * BINS); 1593 binMapping = global_record[splitRecordID].binMapping; 1594 1595 const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; 1596 const uint blocks = (sizeRecord + blockSize - 1) / blockSize; 1597 cur_blocks = min(numBlockIDs, blocks); 1598 offset_start = 0; 1599 offset_end = min(cur_blocks * blockSize, sizeRecord); 1600 1601 if (localID == 0) 1602 DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); 1603 } 1604} 1605 1606__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) 1607__attribute__((intel_reqd_sub_group_size(16))) void kernel 1608global_compute_best_split_iteration(global struct Globals *globals, 1609 global char *bvh_mem, 1610 global struct GlobalBuildRecord *global_record) 1611{ 1612 const uint localID = get_local_id(0); 1613 const uint blockSize = get_local_size(0); 1614 const uint taskID = get_group_id(0); 1615 const uint numTasks = get_num_groups(0); 1616 1617 const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; 1618 1619 /* early out */ 1620 if (numGlobalBuildRecords == 0) 1621 return; 1622 1623 const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks; 1624 const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks; 1625 for (uint i = startRecordID; i < endRecordID; i++) 1626 { 1627 struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo, 1628 global_record[i].binMapping.scale, 1629 global_record[i].range.start, 1630 global_record[i].range.end); 1631 if (localID == 0) 1632 { 1633 global_record[i].split = split; 1634 global_record[i].atomicCountLeft = 0; 1635 global_record[i].atomicCountRight = 0; 1636 DBG(printSplit(&global_record[i].split)); 1637 } 1638 } 1639} 1640 1641__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) 1642__attribute__((intel_reqd_sub_group_size(16))) void kernel 1643global_partition_iteration(global struct Globals *globals, 1644 global struct AABB *primref, 1645 global uint *primref_index, 1646 global char *bvh_mem, 1647 global struct GlobalBuildRecord *global_record) 1648{ 1649 1650 const uint localID = get_local_id(0); 1651 const uint blockSize = get_local_size(0); 1652 const uint taskID = get_group_id(0); 1653 const uint numTasks = get_num_groups(0); 1654 1655 const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; 1656 1657 /* early out */ 1658 if (numGlobalBuildRecords == 0) 1659 return; 1660 1661 /* double buffered primref index array */ 1662 global uint *primref_index0 = primref_index; 1663 global uint *primref_index1 = primref_index + globals->numPrimitives; 1664 1665 uint numBlocks = 0; 1666 1667 /* get total number of blocks, size of block == WG size */ 1668 for (uint i = 0; i < numGlobalBuildRecords; i++) 1669 numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; 1670 1671 const uint startBlockID = (taskID + 0) * numBlocks / numTasks; 1672 const uint endBlockID = (taskID + 1) * numBlocks / numTasks; 1673 uint numBlockIDs = endBlockID - startBlockID; 1674 1675 uint splitRecordID = 0; 1676 uint offset_start = 0; 1677 uint offset_end = 0; 1678 uint cur_blocks = 0; 1679 1680 for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) 1681 { 1682 const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; 1683 const uint blocks = (sizeRecord + blockSize - 1) / blockSize; 1684 if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) 1685 { 1686 const uint preBlocks = startBlockID - blockCounter; 1687 cur_blocks = min(numBlockIDs, blocks - preBlocks); 1688 offset_start = preBlocks * blockSize; 1689 offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); 1690 break; 1691 } 1692 blockCounter += blocks; 1693 } 1694 1695 if (localID == 0) 1696 DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); 1697 1698 local struct AABB centroidAABB[2]; 1699 local struct AABB geometryAABB[2]; 1700 local uint local_sync; 1701 1702 while (1) 1703 { 1704 1705 const uint startID = global_record[splitRecordID].range.start + offset_start; 1706 const uint endID = global_record[splitRecordID].range.start + offset_end; 1707 1708 struct BinMapping binMapping = global_record[splitRecordID].binMapping; 1709 struct Split split = global_record[splitRecordID].split; 1710 1711 const uint global_start = global_record[splitRecordID].range.start; 1712 const uint global_end = global_record[splitRecordID].range.end; 1713 1714 if (localID == 0) 1715 DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); 1716 1717 parallel_partition_segment_index(&local_sync, primref, &binMapping, startID, endID, global_start, global_end, &split, ¢roidAABB[0], ¢roidAABB[1], &geometryAABB[0], &geometryAABB[1], primref_index0, primref_index1, &global_record[splitRecordID].atomicCountLeft, &global_record[splitRecordID].atomicCountRight); 1718 1719 /* update global structures */ 1720 if (localID == 0) 1721 { 1722 AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, ¢roidAABB[0]); 1723 AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, ¢roidAABB[1]); 1724 AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]); 1725 AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]); 1726 } 1727 1728 numBlockIDs -= cur_blocks; 1729 if (numBlockIDs == 0) 1730 break; 1731 1732 splitRecordID++; 1733 1734 const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; 1735 const uint blocks = (sizeRecord + blockSize - 1) / blockSize; 1736 cur_blocks = min(numBlockIDs, blocks); 1737 offset_start = 0; 1738 offset_end = min(cur_blocks * blockSize, sizeRecord); 1739 } 1740} 1741 1742inline void printBinaryNode(struct AABB *aabb) 1743{ 1744 printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w)); 1745} 1746 1747__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) 1748__attribute__((intel_reqd_sub_group_size(16))) 1749void kernel global_finalize_iteration(global struct Globals *globals, 1750 global struct GlobalBuildRecord *global_record, 1751 global char *bvh_mem, 1752 global struct AABB *binary_nodes) 1753{ 1754 const uint localID = get_local_id(0); 1755 const uint localSize = get_local_size(0); 1756 const uint groupID = get_group_id(0); 1757 const uint numGroups = get_num_groups(0); 1758 1759 global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); 1760 1761 for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize) 1762 { 1763 const uint buildRecordID = global_record[i].buildRecordID; 1764 const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w); 1765 /* left child buildrecord */ 1766 const uint leftID = buildRecordID; 1767 records[leftID].start = global_record[i].range.start; 1768 records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft; 1769 records[leftID].centroidBounds = global_record[i].leftCentroid; 1770 /* right child buildrecord */ 1771 const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1); 1772 records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft; 1773 records[rightID].end = global_record[i].range.end; 1774 records[rightID].centroidBounds = global_record[i].rightCentroid; 1775 /* two binary nodes */ 1776 const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2); 1777 binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0); 1778 binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1); 1779 binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry; 1780 binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry; 1781 binary_nodes[binaryChildID + 0].lower.w = as_float(leftID); 1782 binary_nodes[binaryChildID + 0].upper.w = as_float(-1); 1783 binary_nodes[binaryChildID + 1].lower.w = as_float(rightID); 1784 binary_nodes[binaryChildID + 1].upper.w = as_float(-1); 1785 records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0); 1786 records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1); 1787 } 1788 1789 sub_group_barrier(CLK_LOCAL_MEM_FENCE); 1790 1791 if (localID == 0) 1792 { 1793 const uint sync = atomic_add(&globals->sync, 1); 1794 if (sync + 1 == numGroups) 1795 { 1796 globals->sync = 0; 1797 DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords)); 1798 DBG( 1799 for (uint i = 0; i < globals->numBuildRecords; i++) { 1800 printf("i %d \n", i); 1801 printBuildRecord(&records[i]); 1802 } printf("Binary Tree \n"); 1803 for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) { 1804 printf("i %d \n", i); 1805 printBinaryNode(&binary_nodes[i]); 1806 } 1807 1808 ); 1809 globals->numGlobalBuildRecords = 0; 1810 } 1811 } 1812} 1813 1814__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals, 1815 global struct GlobalBuildRecord *global_record, 1816 global char *bvh_mem, 1817 global struct AABB *binary_nodes) 1818{ 1819#define MAX_TOP_LEVEL_STACK_DEPTH 32 1820 struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH]; 1821 global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH]; 1822 struct AABB childrenAABB[BVH_NODE_N6]; 1823 float childrenHalfArea[BVH_NODE_N6]; 1824 1825 /* build records */ 1826 global struct BuildRecord *record = getBuildRecords(bvh_mem, globals); 1827 1828 struct BVHBase *base = (struct BVHBase *)bvh_mem; 1829 struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); 1830 1831 uint stack_index = 1; 1832 stack[0] = binary_nodes[0]; 1833 stackParentPtrs[0] = (global uchar *)qnode_root; 1834 1835 while (stack_index != 0) 1836 { 1837 stack_index--; 1838 1839 childrenAABB[0] = stack[stack_index]; 1840 struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index]; 1841 childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); 1842 1843 /* buildrecord leaf => set parent pointer and continue*/ 1844 DBG( 1845 printf("stack_index %d \n", stack_index); 1846 printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w));); 1847 1848 if (as_uint(childrenAABB[0].upper.w) == -1) 1849 { 1850 const uint buildRecordID = as_uint(childrenAABB[0].lower.w); 1851 DBG( 1852 printf("leaf buildRecordID %d \n", buildRecordID); 1853 printBuildRecord(&record[buildRecordID]);) 1854 1855 record[buildRecordID].current = (global uchar *)qnode; 1856 continue; 1857 } 1858 1859 childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); 1860 1861 uint numChildren = 1; 1862 while (numChildren < BVH_NODE_N6) 1863 { 1864 // FIXME 1865 1866 /*! find best child to split */ 1867 float bestArea = -(float)INFINITY; 1868 int bestChild = -1; 1869 for (int i = 0; i < numChildren; i++) 1870 { 1871 /* ignore leaves as they cannot get split */ 1872 if (as_uint(childrenAABB[i].upper.w) == -1) 1873 continue; 1874 1875 /* find child with largest surface area */ 1876 if (childrenHalfArea[i] > bestArea) 1877 { 1878 bestChild = i; 1879 bestArea = childrenAABB[i].lower.w; 1880 } 1881 } 1882 if (bestChild == -1) 1883 break; 1884 const uint leftID = as_uint(childrenAABB[bestChild].lower.w); 1885 const uint rightID = as_uint(childrenAABB[bestChild].upper.w); 1886 childrenAABB[bestChild] = binary_nodes[leftID]; 1887 childrenAABB[numChildren] = binary_nodes[rightID]; 1888 childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]); 1889 childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]); 1890 numChildren++; 1891 } 1892 1893 const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); 1894 1895 /* update single relative node pointer */ 1896 const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6; 1897 const uint type = BVH_INTERNAL_NODE; 1898 1899 setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode); 1900 1901 DBG( 1902 printQBVHNodeN(qnode); 1903 printf("numChildren %d \n", numChildren); 1904 for (uint i = 0; i < numChildren; i++) 1905 AABB_print(&childrenAABB[i]);); 1906 1907 /* update parent pointer of build records of all children */ 1908 for (uint ID = 0; ID < numChildren; ID++) 1909 { 1910 stack[stack_index] = childrenAABB[ID]; 1911 stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN); 1912 stack_index++; 1913 } 1914 } 1915} 1916 1917#endif 1918