//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "common.h"
#include "instance.h"


// primref_to_quads:
// One work-item per primitive reference. Looks up the primref selected by
// primref_index[i], re-fetches its triangle pair from the application geometry
// descriptors, packs the pair into one 64-byte Quad leaf in BVH memory, and,
// when allow_update != 0, additionally persists the (byte-scaled) vertex
// indices so a later update/refit pass can re-read the vertices.
// A single thread (i == 0) advances the quad-leaf allocation cursor.
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
primref_to_quads(global struct Globals *globals,
                 global struct AABB *primref,                   // primref array; meta-data (geomID/primIDs/flags) packed inside via PRIMREF_* accessors
                 global char *primref_index,                    // index buffer mapping leaf slot -> primref slot
                 global char *bvh_mem,                          // base of the BVH allocation being built
                 global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, // per-geometry descriptors (vertex/index buffers)
                 const uint stride,                             // byte stride between primref_index entries
                 const uint offset,                             // byte offset of the uint index within an entry
                 const uint allow_update)                       // non-zero: BVH is updatable, keep vertex indices
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    // Quad leaves live at a 64-byte-block offset recorded in the BVH header.
    global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
    uint quadIndicesStart = bvh->quadIndicesDataStart;

    const uint numPrimitives = globals->numPrimitives;
    uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0);
    if (i < numPrimitives)
    {
        global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; // shadows the outer 'bvh' (same value)

        // Indirect lookup: leaf slot i -> primref slot.
        const uint primrefID = *(uint *)(primref_index + i * stride + offset);

        // Meta-data packed into the primref during build setup.
        const uint geomID = PRIMREF_geomID(&primref[primrefID]);
        const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
        const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
        const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);

        const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
        const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);

        // Merge the two triangles into a quad: 'a' holds the (up to 4) unique
        // vertex indices, 'lb' the 2nd triangle's per-vertex remap indices.
        const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);

        uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride;

        const uint4 indices = q.a;

        const uint mask = 0xff; // FIXME: hardcoded mask
        float3 vtx0, vtx1, vtx2, vtx3;
        GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);

        // 2-bit remap indices of the second triangle's three vertices.
        uint j0 = q.lb.x;
        uint j1 = q.lb.y;
        uint j2 = q.lb.z;
        uint shaderIndex = (mask << 24) | geomID;    // ray mask in bits 31..24, shader index (== geomID here) below
        uint geomIndex = geomID | (geomFlags << 30); // geometry flags in the top 2 bits
        uint primIndex0 = primID0;
        const uint delta = primID1 - primID0;        // primID1 stored as a delta relative to primID0
        const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
        uint primIndex1Delta = delta | (j << 16) | (1 << 22); // NOTE(review): bit 22 looks like a 'last quad in leaf' flag - confirm against HW leaf layout

        // 64-byte leaf: 16B header + 4 packed vertex positions (12 floats).
        uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta);
        float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x);
        float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y);
        float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z);

        // Write-back cached stores; the leaf is not re-read by this kernel.
        global uint4* dst = (global uint4*)&quads[i];
        store_uint4_L1WB_L3WB(dst, 0, pack0);
        store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1));
        store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2));
        store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3));

        if(allow_update)
        {
            // Updatable BVHs keep a 32-byte side record per quad (at 64B-block
            // offset quadIndicesDataStart): the leaf header again, plus the 4
            // vertex indices pre-multiplied by the vertex stride (byte offsets).
            global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i));

            uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w );

            store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 );
            store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, pack_indices * vertex_stride);
        }

        // Exactly one thread publishes the new quad-leaf allocation top.
        if (i == 0)
            bvh->quadLeafCur += numPrimitives ;
    }


#if 0   /* earlier loop-based implementation, kept disabled for reference */
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );

    const uint numPrimitives = globals->numPrimitives;
    const uint startID = get_group_id( 0 ) * get_local_size( 0 );
    const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);

    for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
    {
        const uint primrefID = *(uint *)(primref_index + i * stride + offset);

        const uint geomID = PRIMREF_geomID(&primref[primrefID]);
        const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
        const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
        const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);

        const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
        const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);

        const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);

        const uint4 indices = q.a;
        const uint mask = 0xff; // FIXME: hardcoded mask
        float3 vtx0, vtx1, vtx2, vtx3;
        GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);

        setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags );
    }

    if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
        bvh->quadLeafCur += numPrimitives ;
#endif
}

// create_procedural_leaf:
// Emits ProceduralLeaf nodes for up to 8 procedural primitives that share one
// parent QBVH node. Runs on the first 8 lanes of the sub-group (one lane per
// child slot). Primitives with different geomIDs cannot share a leaf, so the
// lane range is split at geomID boundaries and one leaf is allocated per
// contiguous run of equal geomIDs; the parent's childData is patched so each
// child points at its leaf block and primitive slot.
GRL_INLINE void create_procedural_leaf(global struct Globals *globals,
                                       global struct AABB *primref,
                                       local uint *primrefids,   // primref IDs staged in SLM by the caller
                                       uint numProcedurals,      // number of valid entries (<= 8)
                                       struct QBVHNodeN *qnode,  // parent internal node to fix up
                                       global char *bvh_mem,
                                       global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    if (get_local_id(0) >= 8)
        return;

    global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem;

    /* first read geomID of all primitives */
    uint primrefID = -1;
    uint geomID = -1;     // lanes beyond numProcedurals keep the 0xFFFFFFFF sentinel
    uint geomFlags = 0;
    if (get_local_id(0) < numProcedurals)
    {
        primrefID = primrefids[get_local_id(0)];
        geomID = PRIMREF_geomID(&primref[primrefID]);
        geomFlags = PRIMREF_geomFlags( &primref[primrefID] );
    }

    // cannot sort by geomID as bounds in parent node are then wrong
    //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID);
    //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID);
    //geomID = geomID_primrefID >> 32;
    //primrefID = geomID_primrefID;

    /* We have to split at geomID boundaries into multiple leaves. This
     * block calculates the lane where a leaf starts and ends.
     * (shuffle_up/down shift in 0xFFFFFFFF at the edges, so lane 0 is always
     * a leaf start and the last valid lane always a leaf end.) */
    const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u);
    const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u);
    const uint leaf_start = geomIDprev != geomID;
    const uint leaf_end = geomIDnext != geomID;
    const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u); // does the NEXT lane begin a new leaf?

    /* This computes which leaf a lane processes. E.g. from geomID =
     * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */
    //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive?

    /* This computes the n'th primitive a lane processes inside its
     * leaf. For the example above we compute leaf_prim =
     * [0,1,0,1,2,0]. */
    const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? get_local_id(0) : 0);

    /* from here on we allocate data and write to memory, thus only
     * lanes that process a primitive should continue. */
    if (get_local_id(0) >= numProcedurals)
        return;

    /* Here we allocate a single memory block for each required
     * ProceduralLeaf node. We do this from a single lane to ensure
     * the allocation is contiguous. */
    uint leaf_base_offset = 0;
    uint n_leafs = sub_group_reduce_add(leaf_start);
    if (get_local_id(0) == 0)
        leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs );
    leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0);

    /* Compute the leaf offset for each lane. */
    uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1;

    // leaf_offset indexes ProceduralLeaf-sized slots from the BVH base.
    struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset;

    /* write the procedural leaf headers (one header per leaf, written by the
     * last lane of each leaf since that lane knows the primitive count) */
    if (leaf_end)
    {
        pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID
        pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function
        // On a leaf-end lane leaf_prim is the last in-leaf index, so
        // leaf_prim + 1 is the primitive count (low 4 bits of DW1).
        pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!!
    }
    /* write the procedural leaf primIDs */
    pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]);

    /* update leaf node offset inside parent node */
    if (get_local_id(0) == 0)
    {
        QBVH6Node_set_offset(qnode, pleaf);
        QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL);
    }

    /* Let parent node children point to proper procedural leaf block
     * and primitive. */
    qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2);
}

// primref_to_procedurals:
// Builds procedural leaves for all primitives. Each primref_index entry
// carries, besides the primref ID (at 'offset'), a back-pointer (at
// 'offset1') identifying the parent QBVH node; consecutive entries with equal
// back-pointers form one leaf set. Each work-group first snaps its
// [startID, endID) range forward to leaf-set boundaries (so no set is split
// across groups), then emits one leaf set per loop iteration.
GRL_ANNOTATE_IGC_DO_NOT_SPILL
GRL_ANNOTATE_BIG_REG_REQ
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
primref_to_procedurals(global struct Globals *globals,
                       global struct AABB *primref,
                       global char *primref_index,
                       global char *bvh_mem,
                       global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
                       const uint stride,
                       const uint offset)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;

    const uint numPrimitives = globals->numPrimitives;
    uint startID = get_group_id( 0 ) * get_local_size( 0 );
    uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);

    // Back-pointer location: a second array appended after all index entries,
    // or - for 8-byte entries - the second uint within each entry.
    uint offset1 = stride * globals->numPrimitives;
    if (stride == 8)
        offset1 = 4;

    uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1);
    /* start at leaf start */
    while (startID < numPrimitives)
    {
        const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1);
        if (back_pointer != prev_start_back_pointer)
            break;
        startID++;
    }

    uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1);
    /* end at next leaf start */
    while (endID < numPrimitives)
    {
        const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1);
        if (back_pointer != prev_end_back_pointer)
            break;
        endID++;
    }

    local uint procedurals[16];

    // All lanes advance uniformly by the size of the leaf set just emitted,
    // so 'lid < endID + get_local_id(0)' is equivalent to
    // '(lid - get_local_id(0)) < endID' - i.e. loop until lane 0 passes endID.
    for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);)
    {
        /* load leaf start points and back_pointer */
        const uint primrefID = *(uint *)(primref_index + lid * stride + offset);
        uint back_pointer = *(uint *)(primref_index + lid * stride + offset1);
        uint prev_back_pointer = get_local_id(0) == 0 ? -1 : *(uint *)(primref_index + (lid-1) * stride + offset1);

        const uint leaf_start = back_pointer != prev_back_pointer;
        uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0);

        /* compute number of primitives inside the leaf starting at lid */
        const uint leaf_id = sub_group_scan_inclusive_add(leaf_start);
        uint numPrimitives = 0; // intentionally shadows the kernel-level count: prims in the current leaf set
        if (back_pointer == leaf_start_back_pointer && lid < endID)
            numPrimitives = sub_group_reduce_add(1); // counts the lanes active in this divergent branch
        numPrimitives = sub_group_broadcast(numPrimitives, 0);

        // Stage the primref IDs in SLM for create_procedural_leaf.
        procedurals[get_local_id(0)] = primrefID;

        // back_pointer indexes QBVHNodeN-sized slots from the BVH base.
        struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer;

        create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc);

        lid += numPrimitives;
    }
}

// Writes one hardware instance leaf. In a top-level BVH the instance leaves
// occupy the region BVHBase_quadLeaves() points at.
GRL_INLINE void create_HW_instance_leaf(
    global struct BVHBase* bvh,
    global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, // DXR instance descriptor to convert
    uint dstLeafId,                                             // destination leaf slot
    uint instanceIndex,                                         // index of the instance in the input array
    uint rootNodeByteOffset,                                    // byte offset of the referenced BLAS root node
    uint instanceMask)                                          // 8-bit instance inclusion mask
{
    /* convert DXR instance to instance leaf node */
    global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh);
    HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask);
}



// create_HW_instance_nodes:
// One work-item per instance primref. Resolves the primref -> instance index
// indirection and writes the HW instance leaf from an ARRAY of instance
// descriptors. Thread 0 bumps instanceLeafEnd by 2 * num_prims
// (NOTE(review): instance leaves appear to span two 64B slots each - confirm
// against the HwInstanceLeaf layout).
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel create_HW_instance_nodes(
    global const struct Globals *globals,
    global char *primref_index,     // index buffer mapping leaf slot -> primref slot
    global struct AABB *primref,
    global struct BVHBase *bvh,
    global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances, // array of instance descs
    uint32_t stride,                // byte stride between primref_index entries
    uint32_t offset)                // byte offset of the uint index within an entry
{
    uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
    uint num_prims = globals->numPrimitives;
    if (dstLeafId >= num_prims)
        return;
    if( dstLeafId == 0 )
        bvh->instanceLeafEnd += 2*num_prims;

    /* get instance ID */
    const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
    const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
    const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
    const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
    create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
}

// create_HW_instance_nodes_pointers:
// Same as create_HW_instance_nodes, but the input is an array of POINTERS to
// instance descriptors (DXR "array of pointers" build mode) instead of a
// contiguous descriptor array.
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel create_HW_instance_nodes_pointers(
    global const struct Globals *globals,
    global char *primref_index,   // index buffer mapping leaf slot -> primref slot
    global struct AABB *primref,
    global struct BVHBase *bvh,
    global void *instances_in,    // GRL_RAYTRACING_INSTANCE_DESC* array, passed untyped
    uint32_t stride,              // byte stride between primref_index entries
    uint32_t offset)              // byte offset of the uint index within an entry
{
    uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
    uint num_prims = globals->numPrimitives;
    if (dstLeafId >= num_prims)
        return;
    if (dstLeafId == 0)
        bvh->instanceLeafEnd += 2 * num_prims; // see note on create_HW_instance_nodes

    global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
        (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;

    /* get instance ID */
    const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
    const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
    const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
    const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
    create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
}