//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "d3d12.h"
#include "common.h"
#include "mem_utils.h"
#include "misc_shared.h"

#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT))

GRL_INLINE uint GroupCountForCopySize(uint size)
{
    return (size >> 8) + 4;
}

GRL_INLINE uint GroupCountForCopy(BVHBase* base)
{
    return GroupCountForCopySize(base->Meta.allocationSize);
}

GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances)
{
    for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0))
    {
        for (uint row = 0; row < 3; row++)
        {
            for (uint column = 0; column < 4; column++)
            {
                D3D12_set_transform(&descs[instanceIndex], row, column,
                                    InstanceDesc_get_transform(&instances[instanceIndex], row, column));
            }
        }

        D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex]));
        D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex]));
        D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex],
                                                      InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex]));
        D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex]));
        D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex]));
    }
}

GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart)
{
    if (get_local_id(0) == 0)
    {
        uint64_t previousGeoDataBufferEnd = dataBufferStart;
        for (uint64_t geoIndex = 0; geoIndex < numGeos; geoIndex += 1)
        {
            D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type));
            D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags));

            if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
            {
                // Every triangle is stored separately
                uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount;
                D3D12_set_triangles_Transform(&descs[geoIndex], 0);
                D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE);
                D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT);
                D3D12_set_triangles_IndexCount(&descs[geoIndex], 0);
                D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3);
                D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
                D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
                D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float));
                previousGeoDataBufferEnd += vertexBufferSize;
            }
            else
            {
                D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount);
                D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
                D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB));
                previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount;
            }
        }
    }
}
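
// Decode output layout note (derived from createGeoDescs above): each geometry's data
// buffer is appended after the descriptor array, starting at dataBufferStart. Triangle
// geos get 9 floats (36 bytes) per primitive, since every triangle is written out as
// three independent vertices; procedural geos get one D3D12_RAYTRACING_AABB per
// primitive. copyIndiciesAndVerticies below fills the triangle buffers from quad leaves.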
GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad)
{
    float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc);

    uint64_t firstTriangleIndex = quad->primIndex0;
    uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 1 : 2;

    vertices[firstTriangleIndex * 9]     = quad->v[0][0];
    vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1];
    vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2];
    vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0];
    vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1];
    vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2];
    vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0];
    vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1];
    vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2];

    if (numTriangles == 2)
    {
        uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad);

        uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
        for (size_t i = 0; i < 3; i++)
        {
            uint32_t idx = packed_indices & 3;
            packed_indices >>= 2;
            for (size_t j = 0; j < 3; j++)
                vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j];
        }
    }
}

GRL_INLINE void storeProceduralDesc(
    struct AABB procAABB,
    uint32_t primId,
    D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc)
{
    D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc);
    D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB);
}
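
// Procedural leaves do not store their AABBs directly, so copyDataFromLProcedurals
// reconstructs them from the quantized child bounds kept in the parent internal node.
// Assuming bitShiftLdexp(x, e) behaves like ldexp (i.e. x * 2^e), each bound is
// recovered per axis as
//     lower = node.lower + quantized_lower * 2^(exp - 8)
// and then grown conservatively with conservativeAABB().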
GRL_INLINE void copyDataFromLProcedurals(
    BVHBase* base,
    D3D12_RAYTRACING_GEOMETRY_DESC* descs)
{
    unsigned numProcedurals = BVHBase_GetNumProcedurals(base);
    InternalNode* innerNodes = BVHBase_GetInternalNodes(base);
    unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base);

    if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals
    {
        // iterate over all inner nodes to find those with procedural children; their AABBs have to be taken from the node
        for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0))
        {
            InternalNode* innerNode = innerNodes + nodeI;

            if (innerNode->nodeType == NODE_TYPE_PROCEDURAL)
            {
                float* origin = innerNode->lower;

                global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode);

                for (uint k = 0; k < 6; k++)
                {
                    if (InternalNode_IsChildValid(innerNode, k))
                    {
                        struct AABB3f qbounds = {
                            (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]),
                            (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) };

                        struct AABB dequantizedAABB;

                        dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8);
                        dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8);
                        dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8);
                        dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8);
                        dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8);
                        dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8);

                        dequantizedAABB = conservativeAABB(&dequantizedAABB);

                        /* extract geomID and primID from leaf */
                        const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k);
                        const uint geomID = ProceduralLeaf_geomIndex(leaf);
                        const uint primID = ProceduralLeaf_primIndex(leaf, startPrim);
                        // FIXME: have to iterate over all primitives of leaf!

                        storeProceduralDesc(dequantizedAABB, primID, descs + geomID);
                    }

                    /* advance leaf pointer to next child */
                    leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k);
                }
            }
            else if (innerNode->nodeType == NODE_TYPE_MIXED)
            {
                ERROR();
            }
            else
            {
                /* do nothing for other internal node types, they can't have a procedural child (directly) */;
            }
        }
    }
}

GRL_INLINE void copyDataFromQuadLeaves(BVHBase* base, D3D12_RAYTRACING_GEOMETRY_DESC* descs)
{
    QuadLeaf* quads = BVHBase_GetQuadLeaves(base);
    uint64_t numQuads = BVHBase_GetNumQuads(base);

    for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0))
    {
        uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc);
        copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]);
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel clone_indirect(global char* dest, global char* src)
{
    BVHBase* base = (BVHBase*)src;
    uint64_t bvhSize = base->Meta.allocationSize;
    uint numGroups = GroupCountForCopy(base);
    CopyMemory(dest, src, bvhSize, numGroups);
}

GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt)
{
    global BVHBase* baseSrc = (global BVHBase*)src;
    global BVHBase* baseDest = (global BVHBase*)dest;

    uint32_t offset = sizeof(BVHBase);
    uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc);
    uint32_t nodeSize = numNodes * sizeof(InternalNode);
    offset += nodeSize;

    int quadChildFix = baseSrc->quadLeafStart;
    int procChildFix = baseSrc->proceduralDataStart;
    int instChildFix = baseSrc->instanceLeafStart;

    // serialization already copies part of the bvh base, so skip that part here
    CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt);

    baseDest->Meta.allocationSize = compactedSize;

    if (baseSrc->Meta.instanceCount)
    {
        const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf);
        CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt);
        const uint instanceLeafStart = (uint)(offset / 64);
        baseDest->instanceLeafStart = instanceLeafStart;
        instChildFix -= instanceLeafStart;
        offset += instLeafsSize;
        baseDest->instanceLeafEnd = (uint)(offset / 64);
    }

    if (baseSrc->Meta.geoCount)
    {
        const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf);
        if (quadLeafsSize)
        {
            CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt);
            const uint quadLeafStart = (uint)(offset / 64);
            baseDest->quadLeafStart = quadLeafStart;
            quadChildFix -= quadLeafStart;
            offset += quadLeafsSize;
            baseDest->quadLeafCur = (uint)(offset / 64);
        }

        const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf);
        if (procLeafsSize)
        {
            CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt);
            const uint proceduralDataStart = (uint)(offset / 64);
            baseDest->proceduralDataStart = proceduralDataStart;
            procChildFix -= proceduralDataStart;
            offset += procLeafsSize;
            baseDest->proceduralDataCur = (uint)(offset / 64);
        }
    }

    // copy nodes with fixed child offsets
    global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase));
    global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc);

    // used in mixed case
    char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc);
    char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc);

    uint localId = get_sub_group_local_id();
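
    // Child offsets in InternalNode are relative and expressed in 64-byte blocks, so once
    // the leaf arrays have been compacted towards the front each node's childOffset has to
    // shift by (old leaf start - new leaf start); that delta is what quadChildFix,
    // procChildFix and instChildFix hold at this point. Below, each node is read as one
    // cacheline across the subgroup and only the lane holding the childOffset dword
    // patches its value before the node is written out.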
    for (uint i = get_group_id(0); i < numNodes; i += groupCnt)
    {
        uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]);
        char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0];

        if (localId * 4 == offsetof(InternalNode, childOffset))
        {
            int childOffset = as_int(nodePart);
            if (nodeType == NODE_TYPE_MIXED)
            {
                char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset;
                if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd)
                    nodePart = as_int(childOffset - instChildFix);
            }
            else if (nodeType == NODE_TYPE_INSTANCE)
                nodePart = as_int(childOffset - instChildFix);
            else if (nodeType == NODE_TYPE_QUAD)
                nodePart = as_int(childOffset - quadChildFix);
            else if (nodeType == NODE_TYPE_PROCEDURAL)
                nodePart = as_int(childOffset - procChildFix);
        }

        nodeDest[i * 16 + localId] = nodePart;
    }

    if (baseSrc->Meta.instanceCount)
    {
        const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc);
        CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt);
        baseDest->Meta.instanceDescsStart = offset;
        offset += instanceDescSize;
    }

    if (baseSrc->Meta.geoCount)
    {
        const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData);
        CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt);
        baseDest->Meta.geoDescsStart = offset;
        offset += (geoMetaSize + 63) & ~63; // align to 64
    }

    uint backPointerDataStart = offset / 64;
    uint refitTreeletsDataStart = backPointerDataStart;
    uint refitStartPointDataStart = backPointerDataStart;
    uint dataEnd = backPointerDataStart;

    uint fatLeafTableStart = dataEnd;
    uint fatLeafCount = baseSrc->fatLeafCount;
    uint innerTableStart = dataEnd;
    uint innerCount = baseSrc->innerCount;
    uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate;
    uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate;
    uint quadIndicesDataStart = dataEnd;

    if (BVHBase_HasBackPointers(baseSrc))
    {
#if 0
        //
        const uint oldbackpontersDataStart = baseSrc->backPointerDataStart;
        const uint shift = oldbackpontersDataStart - backPointerDataStart;
        const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63;

        CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt);

        refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift;
        refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift;
        dataEnd = baseSrc->BVHDataEnd - shift;
#else
        // compacting version
        const uint backpointersSize = ((numNodes * sizeof(uint)) + 63) & ~63;
        CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt);
        offset += backpointersSize;

        refitTreeletsDataStart = offset / 64;
        refitStartPointDataStart = offset / 64;

        // TODO: remove treelets from .... everywhere
        const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc);
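
        // Treelet descriptors are copied one by one below rather than bulk-copied, because a
        // trivial treelet (single start point) whose childrenOffsetOfTheNode points past the
        // internal nodes, i.e. into the quad leaves, needs the same quadChildFix adjustment
        // as the nodes above.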
        if (treeletExecutedCnt)
        {
            const uint treeletCnt = treeletExecutedCnt > 1 ? treeletExecutedCnt + 1 : 1;

            refitTreeletsDataStart = offset / 64;
            const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63;

            RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset);
            RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc);

            uint numThreads = groupCnt * get_local_size(0);
            uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0);

            for (uint i = globalID; i < treeletCnt; i += numThreads)
            {
                RefitTreelet dsc = srcTreelets[i];
                RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc;
                if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes)
                {
                    trivial_dsc->childrenOffsetOfTheNode -= quadChildFix;
                }
                destTreelets[i] = dsc;
            }

            offset += treeletsSize;
            refitStartPointDataStart = offset / 64;

            const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63;
            CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt);
            offset += startPointsSize;
            dataEnd = offset / 64;
        }

        uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63);
        fatLeafTableStart = offset / 64;
        if (fatleafEntriesSize)
        {
            CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt);
        }
        offset += fatleafEntriesSize;

        // New atomic update
        if (baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart)
        {
            uint numQuads = BVHBase_GetNumQuads(baseSrc);
            uint quadTableMainBufferSize = (numQuads + 255) & ~255;
            uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255;
            uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);

            if (quadTableEntriesSize)
            {
                CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt);
            }
            offset += quadTableEntriesSize;

            uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63);
            quadIndicesDataStart = offset / 64;
            if (quadIndicesDataSize)
            {
                CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt);
            }
            offset += quadIndicesDataSize;
        }

        uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63);
        innerTableStart = offset / 64;
        if (innerEntriesSize)
        {
            CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt);
        }
        offset += innerEntriesSize;

        dataEnd = offset / 64;
#endif
    }

    baseDest->backPointerDataStart = backPointerDataStart;
    baseDest->refitTreeletsDataStart = refitTreeletsDataStart;
    baseDest->refitStartPointDataStart = refitStartPointDataStart;
    baseDest->fatLeafTableStart = fatLeafTableStart;
    baseDest->fatLeafCount = fatLeafCount;
    baseDest->innerTableStart = innerTableStart;
    baseDest->innerCount = innerCount;
    baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate;
    baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate;
    baseDest->quadIndicesDataStart = quadIndicesDataStart;

    baseDest->BVHDataEnd = dataEnd;
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel compact(global char* dest, global char* src, uint groupCnt)
{
    uint64_t compactedSize = compute_compacted_size((BVHBase*)src);
    compactT(dest, src, compactedSize, 0, groupCnt);
}

// set up the serialization header across all lanes; each lane gets one dword of the header, plus the 64-bit remainder data
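// Header layout sketch (as assumed by prepare_header below): the 64-byte output cacheline
// holds 16 dwords. Lanes 0-3 carry the 16-byte driverID, lanes 4-7 the BVH_MAGIC string,
// lanes 8-9 SerializedSizeInBytesIncludingHeader, lanes 10-11 DeserializedSizeInBytes,
// lanes 12-13 InstanceHandleCount, and lanes 14-15 the 8-byte "reminder" payload patched
// in from the data that follows the 56-byte header (see TRICK A/B in serializeT).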
GRL_INLINE unsigned prepare_header(
    uint64_t headerSize,
    uint64_t instancePtrSize,
    uint64_t numInstances,
    uint64_t bvhSize,
    uint8_t* driverID,
    uint64_t reminder)
{
    unsigned loc_id = get_sub_group_local_id();

    uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize;
    uint64_t DeserializedSizeInBytes = bvhSize;
    uint64_t InstanceHandleCount = numInstances;

    char bvh_magic_str[] = BVH_MAGIC_MACRO;
    uint* bvh_magic_uint = (uint*)bvh_magic_str;

    unsigned headerTempLanePiece;

    if (loc_id < 4)        { headerTempLanePiece = *((unsigned*)&driverID[4 * loc_id]); }
    else if (loc_id == 4)  { headerTempLanePiece = bvh_magic_uint[0]; }
    else if (loc_id == 5)  { headerTempLanePiece = bvh_magic_uint[1]; }
    else if (loc_id == 6)  { headerTempLanePiece = bvh_magic_uint[2]; }
    else if (loc_id == 7)  { headerTempLanePiece = bvh_magic_uint[3]; }
    else if (loc_id == 8)  { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; }
    else if (loc_id == 9)  { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); }
    else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; }
    else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); }
    else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; }
    else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); }
    else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; }
    else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); }

    return headerTempLanePiece;
}

GRL_INLINE void serializeT(
    global byte_align64B* dest,
    global byte_align64B* src,
    global uint8_t* driverID,
    uint groups_count)
{
    SerializationHeader* header = (SerializationHeader*)dest;
    BVHBase* base = (BVHBase*)src;

    const uint headerSize = sizeof(SerializationHeader);
    const uint numInstances = base->Meta.instanceCount;
    const uint instancePtrSize = sizeof(gpuva_t);
    const uint compactedSize = compute_compacted_size(base);

    uint local_id = get_sub_group_local_id();

    // this is not 64byte aligned :(
    const uint offsetToBvh = headerSize + instancePtrSize * numInstances;

    global InstanceDesc* src_instances = 0;
    if (numInstances)
    {
        src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart);
    }
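
    // TRICK A/B, sketched: the serialization header is 56 bytes, so a plain 64-byte subgroup
    // write of the header would leave its last 8 bytes undefined. Those 8 bytes are instead
    // filled with whatever legitimately follows the header in the output: the first instance
    // pointer when instances exist (TRICK A), otherwise the first 8 bytes of the BVH itself
    // (TRICK B). The copies below account for the bytes already written this way.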
    // effectively this part should end up as one 64B-aligned 64B write
    if (get_group_id(0) == groups_count - 1)
    {
        Block64B headerPlus;

        // we patch the missing piece with the instance or bvh beginning (TRICK A and B)
        // we assume the header is 56B
        global uint64_t* srcPiece = (numInstances != 0) ?
            &src_instances[0].AccelerationStructureGPUVA :
            (global uint64_t*)src;

        unsigned headerTemp;
        headerTemp = prepare_header(
            headerSize,
            instancePtrSize,
            numInstances,
            compactedSize,
            driverID,
            *srcPiece);

        CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp);
    }

    if (numInstances > 0)
    {
        uint instancesOffset = headerSize;
        uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6;
        uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3;
        unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances);

        global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset);

        // the first instance pointer(s) have already been written as part of the header cacheline (see TRICK A),
        // so only instance pointers starting at aligned memory remain
        uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt;
        dst_instances += unaligned_prefixing_instance_cnt;
        src_instances += unaligned_prefixing_instance_cnt;

        if (numAlignedInstances)
        {
            // each 8 instances form a cacheline
            uint numCachelines = numAlignedInstances >> 3; // qwords -> 64Bs
            // qwords beyond the last full multiple of 8
            uint startReminder = numAlignedInstances & ~((1 << 3) - 1);
            uint numreminder = numAlignedInstances & ((1 << 3) - 1);

            uint task_id = get_group_id(0);
            while (task_id < numCachelines)
            {
                uint src_id = task_id * 8 + (local_id >> 1);
                uint* src_uncorected = (uint*)&src_instances[src_id].AccelerationStructureGPUVA;
                uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected;
                uint data = *src;
                global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id));
                CacheLineSubgroupWrite(dst, data);
                task_id += groups_count;
            }
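
            // The tail: the last, partially filled cacheline is still written as a full 64B
            // block. Lanes below numreminder carry real instance pointers; the remaining
            // lanes are padded with the first qwords of the source BVH (TRICK B), which is
            // exactly the unaligned prefix that compactT is later told to skip via skipCopy.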
            if (task_id == numCachelines && local_id < 8 && numreminder > 0)
            {
                // this should write a full cacheline
                uint index = startReminder + local_id;
                // data is taken from the instances for lanes (local_id < numreminder);
                // for the remaining lanes, copy the srcbvh beginning as uint64_t (TRICK B)
                global uint64_t* srcData = (local_id < numreminder) ?
                    &src_instances[index].AccelerationStructureGPUVA :
                    ((global uint64_t*)src) + (local_id - numreminder);

                dst_instances[index] = *srcData;
            }
        }
    }

    // the parts above already copied the unaligned beginning of the bvh into dst (see TRICK B)
    uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u))) & (64u - 1u);

    compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count);
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel serialize_indirect(
    global char* dest,
    global char* src,
    global uint8_t* driverID)
{
    BVHBase* base = (BVHBase*)src;
    uint groups_count = GroupCountForCopy(base);
    serializeT(dest, src, driverID, groups_count);
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel serialize_for_input_dump_indirect(
    global struct OutputBatchPtrs* batchPtrs,
    global dword* dstOffset,
    global char* src,
    global uint8_t* driverID)
{
    BVHBase* base = (BVHBase*)src;
    uint groups_count = GroupCountForCopy(base);
    global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset);
    dest += (sizeof(OutputData) + 127) & ~127;
    serializeT(dest, src, driverID, groups_count);
}

GRL_INLINE void deserializeT(
    global char* dest,
    global char* src,
    unsigned groupCnt)
{
    SerializationHeader* header = (SerializationHeader*)src;

    const uint64_t headerSize = sizeof(struct SerializationHeader);
    const uint64_t instancePtrSize = sizeof(gpuva_t);
    const uint64_t numInstances = header->InstanceHandleCount;
    const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances;
    const uint64_t bvhSize = header->DeserializedSizeInBytes;

    if (numInstances)
    {
        const bool instances_mixed_with_inner_nodes = false;

        if (instances_mixed_with_inner_nodes)
        {
            // not implemented!
            // copy each node with 64-byte granularity and, if the node is an instance, patch it mid-copy
        }
        else
        {
            BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh);

            // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than
            // numInstances (count of pointers and descriptors).
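
            // Deserialization strategy for a TLAS (as laid out below): bulk-copy everything
            // except the HW instance leaves and the InstanceDesc array, then rewrite both of
            // those using the new child-BVH GPU addresses stored in the serialized pointer
            // table, keeping each leaf's root-node offset relative to its child BVH.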
            uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6;
            uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1;

            //
            // instances are in separate memory intervals
            // copy all the other data the simple way
            //
            uint nodesEnd = srcBvhBase->Meta.instanceDescsStart;

            // copy before instance leaves
            CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt);

            uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6;
            uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart;
            uint sizePostInstances = instanceDescStart - offsetPostInstances;

            // copy after instance leaves, before instance descs
            CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt);

            uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc);
            uint sizePostInstanceDescs = bvhSize - instanceDescEnd;

            // copy after instance descs
            CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt);

            global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize);
            global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart);
            global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart);

            // copy and patch instance descriptors
            for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt)
            {
                InstanceDesc desc = srcDesc[instanceIndex];
                uint64_t newInstancePtr = newInstancePtrs[instanceIndex];
                desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with the new ptr
                dstDesc[instanceIndex] = desc;
            }

            // copy and patch hw instance leaves
            global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances);
            global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances);

            for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt)
            {
                // pull the instance from srcBVH
                HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex];
                uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf);
                uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex];
                uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf);
                HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr);

                uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf);
                if (startNode != 0)
                {
                    uint64_t rootNodeOffset = startNode - originalBvhPtr;
                    HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset);
                }

                dstInstleafs[hwLeafIndex] = tmpInstleaf;
            }
        }
    }
    else
    {
        CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt);
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel deserialize_indirect(global char* dest, global char* src)
{
    SerializationHeader* header = (SerializationHeader*)src;
    const uint64_t bvhSize = header->DeserializedSizeInBytes;
    unsigned groupCnt = GroupCountForCopySize(bvhSize);
    deserializeT(dest, src, groupCnt);
}
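
// dxr_decode emits a D3D12-style description of the BVH contents into dest: a DecodeHeader,
// followed by either D3D12_RAYTRACING_INSTANCE_DESCs (top level) or
// D3D12_RAYTRACING_GEOMETRY_DESCs (bottom level) whose vertex/AABB buffers are appended
// after the descriptor array (see createGeoDescs above).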
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel dxr_decode(global char* dest, global char* src)
{
    DecodeHeader* header = (DecodeHeader*)dest;
    BVHBase* base = (BVHBase*)src;

    uint32_t numGeos = base->Meta.geoCount;
    uint32_t numInstances = base->Meta.instanceCount;

    if (numInstances > 0)
    {
        header->Type = TOP_LEVEL;
        header->NumDesc = numInstances;

        D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader));
        copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart), instanceDesc, numInstances);
    }
    else if (numGeos > 0)
    {
        header->Type = BOTTOM_LEVEL;
        header->NumDesc = numGeos;

        D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader));
        uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos;

        createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), geomDescs, numGeos, data);

        work_group_barrier(CLK_GLOBAL_MEM_FENCE);

        copyDataFromQuadLeaves(base, geomDescs);
        copyDataFromLProcedurals(base, geomDescs);
    }
    else
    {
        header->Type = BOTTOM_LEVEL;
        header->NumDesc = 0;
    }
}