//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "common.h"
#include "d3d12.h"
#include "mem_utils.h"
#include "misc_shared.h"

/// Align value to 128
///
/// @param value value to align
/// @return aligned value
GRL_INLINE ulong AlignTo128(ulong value)
{
    return ((value + 127) / 128) * 128;
}

GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs)
{
    return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
}

/// Finds max used byte in vertex buffer
///
/// @param indexBuffPtr pointer to index buffer
/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
/// @param IndexCount number of indices in index buffer
/// @param IndexFormat index format
/// @param VertexCount number of vertices in vertex buffer
/// @param VertexBufferByteStride vertex buffer byte stride
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel find_max_used_byte_in_buff(
    global void* indexBuffPtr,
    global uint* vertexBufferUsedByteEnd,
    dword IndexCount,
    dword IndexFormat,
    dword VertexCount,
    qword VertexBufferByteStride)
{
    local uint sgMax[16];

    uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);

    if (IndexFormat != INDEX_FORMAT_NONE)
    {
        uint endByte = 0;
        if (glob_id < IndexCount)
        {
            if (IndexFormat == INDEX_FORMAT_R16_UINT)
            {
                global ushort* indexBuffPtrShort = (global ushort*)indexBuffPtr;
                endByte = indexBuffPtrShort[glob_id];
            }
            else
            {
                global uint* indexBuffPtrUint = (global uint*)indexBuffPtr;
                endByte = indexBuffPtrUint[glob_id];
            }
        }

        // reduce the max referenced vertex index within each sub-group,
        // then across the whole work-group via local memory
        endByte = sub_group_reduce_max(endByte);
        if (get_sub_group_local_id() == 0)
        {
            sgMax[get_sub_group_id()] = endByte;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        if (get_sub_group_id() == 0)
        {
            endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
            if (get_sub_group_local_id() == 0)
            {
                endByte = min(endByte, VertexCount);
                // indices are 0-based: the highest referenced vertex occupies one more entry
                if (endByte < VertexCount && IndexCount != 0)
                    ++endByte;
                endByte *= (dword)VertexBufferByteStride;
                atomic_max(vertexBufferUsedByteEnd, endByte);
            }
        }
    }
    else if (glob_id == 0)
    {
        uint endByte = VertexCount * VertexBufferByteStride;
        atomic_max(vertexBufferUsedByteEnd, endByte);
    }
}

/// Allocates linear offsets for vertex buffers
///
/// @param batchPtrs batch pointers struct
/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
/// @param vertexBufferOffset pointer to offsets to vertex buffers
/// @param numVertexBuffers number of vertex buffers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_linear_offsets_for_vertex_buffers(
    global InputBatchPtrs* batchPtrs,
    global uint* vertexBufferUsedByteEnd,
    global uint* vertexBufferOffset,
    dword numVertexBuffers)
{
    uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();

    if (glob_id < numVertexBuffers)
    {
        uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
        uint position = atomic_add_global(&batchPtrs->vertexBuffersSize, numBytes);
        vertexBufferOffset[glob_id] = position;
    }
}
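// Illustrative walkthrough (example values, not part of the dump format):
// with three vertex buffers whose used-byte ends are 100, 384 and 1,
// AlignTo128 rounds their sizes to 128, 384 and 128 bytes. The
// atomic_add_global above then hands out disjoint linear offsets, e.g.:
//
//   buffer 0: offset   0, size 128
//   buffer 1: offset 128, size 384
//   buffer 2: offset 512, size 128
//
// vertexBuffersSize ends up as 640 and later becomes the vertex section size
// of the batch. Offsets may be assigned in any order across invocations; only
// disjointness and 128-byte alignment are guaranteed.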
/// Sets the dst data space for input dump of this batch
///
/// @param inputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param nonVertexSize size of non vertex data
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_inputs(
    global DebugBufferHeader* inputDumpMainBuffer,
    global InputBatchPtrs* batchPtrs,
    uint nonVertexSize,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint vertexBufferSize = batchPtrs->vertexBuffersSize;
        uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;

        // a batch (plus the trailing end-of-buffer marker) must not exceed half
        // of the ring buffer, otherwise it could collide with its own tail
        if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
        {
            inputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->globalDumpBuffer = 0;
            batchPtrs->nonVertexDataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

        dword prevHead = inputDumpMainBuffer->gpuHead;
        dword newHead;
        bool circled;

        do
        {
            circled = false;
            newHead = prevHead + sizeOfThisBatch;
            dword bufferBegin = prevHead;

            // not enough room before the end of the buffer: wrap to headStart
            if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
            {
                circled = true;
                newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
                bufferBegin = inputDumpMainBuffer->headStart;
            }

            dword bufferEnd = newHead + sizeof(InputBatch);

            uint tail;
            uint tail2 = 7;
            bool wait;
            do
            {
                wait = true;
                tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);
                // dead code, workaround so IGC won't move tail load out of loop
                if (tail > inputDumpMainBuffer->totalSize)
                {
                    store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
                    tail2 = tail;
                }

                if (prevHead >= tail)
                {
                    // collision example:
                    // ----------T=======H------------
                    // -------B=====E-----------------
                    //
                    if ((bufferEnd < tail) || (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
                else
                {
                    // collision example:
                    // ==========H-------T============
                    // B==============E---------------
                    // caution: we will never have H circled completely so that H == T
                    if ((bufferEnd < tail) && (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
            } while (wait);
        } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            // mark the unused space at the end of the buffer so the reader skips it
            global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
            prevHead = inputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
        batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
        batchPtrs->totalSize = sizeOfThisBatch;

        global InputBatch* batchOp = (global InputBatch*)thisBatchDump;
        batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->vertexBufferDataSize = vertexBufferSize;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
        batchOp->batchId = *batchIdPtr;
    }
}
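// Reservation protocol sketch. The GPU reserves [prevHead, newHead) with the
// CAS on gpuHead above, spinning while the region would overlap the unread
// area between tail and head. The reader side is an assumption about how the
// CPU consumer behaves; a matching host loop could look like this (host-side
// names are hypothetical, only gpuHead/cpuHead/tail/headStart mirror fields
// actually used in this file, and "OpHeader" stands for the struct behind
// header.opHeader):
//
//   uint32_t tail = header->tail;
//   while (tail != header->cpuHead)          // data published by finish_batch_dump_*
//   {
//       OpHeader* op = (OpHeader*)(base + tail);
//       if (op->operationType == INPUT_DUMP_OP_END_BUFFER)
//           tail = header->headStart;        // skip the wrap marker
//       else
//           tail += op->endOfData;           // consume one op/batch
//       header->tail = tail;                 // releases the space to the GPU
//   }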
/// Sets the dst data space for output dump of this batch
///
/// @param outputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_outputs(
    global DebugBufferHeader* outputDumpMainBuffer,
    global OutputBatchPtrs* batchPtrs,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;

        if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
        {
            outputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->dataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

        dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
        dword newHead;
        bool circled;

        do
        {
            //mem_fence_gpu_invalidate();
            //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
            circled = false;
            newHead = prevHead + sizeOfThisBatch;
            dword bufferBegin = prevHead;

            if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
            {
                circled = true;
                newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
                bufferBegin = outputDumpMainBuffer->headStart;
            }

            dword bufferEnd = newHead + sizeof(OutputBatch);

            uint tail;
            uint tail2 = 7;
            bool wait;
            do
            {
                wait = true;
                tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
                // dead code, workaround so IGC won't move tail load out of loop
                if (tail > outputDumpMainBuffer->totalSize)
                {
                    store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
                    tail2 = tail;
                }

                if (prevHead >= tail)
                {
                    // collision example:
                    // ----------T=======H------------
                    // -------B=====E-----------------
                    //
                    if ((bufferEnd < tail) || (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
                else
                {
                    // collision example:
                    // ==========H-------T============
                    // B==============E---------------
                    // caution: we will never have H circled completely so that H == T
                    if ((bufferEnd < tail) && (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
            } while (wait);
        } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
            prevHead = outputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
        batchPtrs->totalSize = sizeOfThisBatch;

        global OutputBatch* batchOp = (global OutputBatch*)thisBatchDump;
        batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
        batchOp->batchId = *batchIdPtr;
    }
}

/// Calculates sum of output sizes
///
/// @param pbi pointer to post build infos
/// @param destOffsets per-output offsets in dest buffer
/// @param numOutputs number of outputs
/// @param batchPtrs batch pointers struct
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel calc_outputs_data_size(
    global PostbuildInfoSerializationDesc* pbi,
    global dword* destOffsets,
    qword numOutputs,
    global OutputBatchPtrs* batchPtrs)
{
    uint offset = 0;
    for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
    {
        uint size = 0;
        if (i < numOutputs)
        {
            size = AlignTo128(pbi[i].SerializedSizeInBytes);
            size += AlignTo128(sizeof(OutputData));
            destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
        }
        offset += sub_group_reduce_add(size);
    }

    if (get_sub_group_local_id() == 0)
        batchPtrs->dataSize = offset;
}
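// Worked example (illustrative values): with three outputs whose
// SerializedSizeInBytes are 100, 256 and 300, each lane pads its item to
// 128 + A, 256 + A and 384 + A bytes, where A = AlignTo128(sizeof(OutputData)).
// Assuming A == 128, the per-lane sizes are 256, 384 and 512, so the exclusive
// scan yields destOffsets = { 0, 256, 640 } and the reduce adds 1152 to
// offset, which becomes batchPtrs->dataSize. All offsets stay 128-byte
// aligned, matching the AlignTo128 granularity used throughout this file.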
/// Adds output data operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param destOffset pointer to offset in dest buffer
/// @param src pointer to source bvh
/// @param pbi pointer to post build info
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_output_data_op(
    global OutputBatchPtrs* batchPtrs,
    global dword* destOffset,
    qword src,
    global PostbuildInfoSerializationDesc* pbi)
{
    if (batchPtrs->dataStart == 0)
        return;

    global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
    out->header.operationType = OUTPUT_DUMP_OP_DATA;
    out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
    out->srcBvhPtr = src;
}

/// Writes indices and transform or procedurals data
///
/// @param batchPtrs batch pointers struct
/// @param srcDesc description of source geometry
/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
/// @param pVertexBufferSize pointer to used size of the vertex buffer
/// @param dstDescOffset offset to dest geo desc
/// @param dstDataOffset offset to dest geo data
/// @param numThreads number of threads
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_geo_data(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
    global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
    global uint* pVertexBufferSize,
    qword dstDescOffset,
    qword dstDataOffset,
    dword numThreads)
{
    if (batchPtrs->dumpDst == 0)
        return;

    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();

    GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;

    global char* dstDataPtr = (global char*)(batchPtrs->nonVertexDataStart + dstDataOffset);
    global char* srcDataPtr;
    uint bytesToCopy = 0;

    if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
    {
        uint sizeOfMatrix = 0;
        if (geoDescToStore.Desc.Triangles.pTransformBuffer)
        {
            sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
            // copy the 3x4 transform matrix with the first 12 lanes
            if (glob_id < 12)
            {
                global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
                global float* matrixDst = (global float*)dstDataPtr;
                matrixDst[glob_id] = matrixSrc[glob_id];
                if (glob_id == 0)
                {
                    geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
                }
            }
        }
        dstDataPtr += sizeOfMatrix;

        srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;
        bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);

        if (bytesToCopy && (glob_id == 0))
        {
            qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
            // for this we remember offset relative to global debug buffer
            geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
        }
        else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE &&
                 geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
        {
            if (geoDescToStore.Desc.Triangles.pVertexBuffer)
            {
                qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
                // for this we remember offset relative to global debug buffer
                geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            }
        }
        else if (glob_id == 0)
        {
            geoDescToStore.Desc.Triangles.IndexCount = 0;
            geoDescToStore.Desc.Triangles.VertexCount = 0;
            geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
            geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
        }
    }
    else
    {
        srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
        bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
        if (glob_id == 0)
        {
            geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
        }
    }

    if (bytesToCopy)
    {
        CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
    }

    if (glob_id == 0)
    {
        global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr =
            (global GRL_RAYTRACING_GEOMETRY_DESC*)(batchPtrs->nonVertexDataStart + dstDescOffset);
        *dstDescPtr = geoDescToStore;
    }
}
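// Note on pointer rebasing (illustrative): every GPUVA stored into the dump
// above (pTransformBuffer, pVertexBuffer, pIndexBuffer, pAABBs_GPUVA) is
// rewritten as an offset relative to batchPtrs->globalDumpBuffer. A tool
// parsing the dump can turn an offset back into a pointer inside its own
// mapping, e.g. (host-side names are hypothetical):
//
//   const char* base  = mappedDumpBuffer;
//   const void* verts = base + desc.Triangles.pVertexBuffer;
//
// This keeps the dump position-independent: it stays valid no matter where
// the debug buffer is mapped when it is read back.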
/// Adds build operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param buildOpOffset offset in dst buffer
/// @param srcBvh address of src bvh (in case of update)
/// @param dstBvhAddr address of dest bvh buffer
/// @param offsetToEnd offset to end of this operation
/// @param flags build flags
/// @param numGeometries number of geometries in build
/// @param numInstances number of instances in build
/// @param instArrayOfPtrs whether instances are passed as an array of pointers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_input_build_op(
    global InputBatchPtrs* batchPtrs,
    qword buildOpOffset,
    qword srcBvh,
    qword dstBvhAddr,
    dword offsetToEnd,
    dword flags,
    dword numGeometries,
    dword numInstances,
    dword instArrayOfPtrs)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0)
        return;

    global InputBuild* buildOp = (global InputBuild*)(batchPtrs->nonVertexDataStart + buildOpOffset);
    buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
    buildOp->header.endOfData = offsetToEnd;
    buildOp->dstBvhPtr = dstBvhAddr;
    buildOp->srcBvhPtr = srcBvh;
    buildOp->flags = flags;
    buildOp->numGeos = numGeometries;
    buildOp->numInstances = numInstances;
    buildOp->instArrayOfPtrs = instArrayOfPtrs;
}

/// Copies instance descriptors
///
/// @param batchPtrs batch pointers struct
/// @param instanceDescArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_descriptors_array(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0)
        return;

    global GRL_RAYTRACING_INSTANCE_DESC* dst =
        (global GRL_RAYTRACING_INSTANCE_DESC*)(batchPtrs->nonVertexDataStart + offset);

    if (glob_id < numInstances)
    {
        dst[glob_id] = instanceDescArr[glob_id];
    }
}
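// Layout written by the array-of-pointers variant below (illustrative):
//
//   offset + 0:
//       qword gpuva[numInstances]                        // original instance pointers, kept for debugging
//   offset + AlignTo128(numInstances * sizeof(qword)):
//       GRL_RAYTRACING_INSTANCE_DESC desc[numInstances]  // dereferenced copies
//
// Keeping the source GPUVAs next to the dereferenced descriptors lets a
// reader correlate each copied descriptor with the pointer it came from.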
/// Copies instance descriptors, array of pointers version
///
/// @param batchPtrs batch pointers struct
/// @param pInstanceDescPtrsArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_descriptors_array_of_ptrs(
    global InputBatchPtrs* batchPtrs,
    global qword* pInstanceDescPtrsArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0)
        return;

    // save gpuva of instance descs for debug
    global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);
    global GRL_RAYTRACING_INSTANCE_DESC* dst =
        (global GRL_RAYTRACING_INSTANCE_DESC*)(batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
    global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr =
        (global GRL_RAYTRACING_INSTANCE_DESC**)pInstanceDescPtrsArr;

    if (glob_id < numInstances)
    {
        gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
        dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
    }
}

/// Adds copy operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param offset offset in dst buffer
/// @param src copy source pointer
/// @param dst copy destination pointer
/// @param copyOpType copy type
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel insert_copy_op(
    global InputBatchPtrs* batchPtrs,
    qword offset,
    global void* src,
    global void* dst,
    uint copyOpType)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0)
        return;

    global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);
    copyOp->header.operationType = copyOpType;
    copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
    copyOp->srcBvhPtr = (qword)src;
    copyOp->dstBvhPtr = (qword)dst;
}

/// Copies vertex buffer
///
/// @param batchPtrs batch pointers struct
/// @param src input buffer
/// @param offset ptr to offset in dst buffer
/// @param size ptr to number of bytes to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_vertex_data(
    global InputBatchPtrs* batchPtrs,
    global const char* src,
    global const uint* offset,
    global const uint* size)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global char* dst = (global char*)(GetVertexBuffersStart(batchPtrs) + *offset);
    // one group per 64 bytes of data, plus one for the remainder
    uint numGroups = (*size >> 6) + 1;
    CopyMemory(dst, src, *size, numGroups);
}

/// Generates unique batch id
///
/// @param batchIds array of unique batch ids
/// @param index index of batch id to generate
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel generate_unique_batch_id(global unsigned long* batchIds, unsigned int index)
{
    // on a little-endian layout the upper dword holds a monotonically
    // increasing counter and the lower dword is OR-ed with the batch index,
    // so ids stay unique across reuses of the same slot
    global unsigned int* counterPtrs = (global unsigned int*)batchIds;
    atomic_add(&counterPtrs[index * 2 + 1], 1);
    batchIds[index] |= (unsigned long)index;
}

/// Sets batch as ready to read and moves cpuHead forward, inputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_inputs(
    global InputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;
    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
    dword seven = 7;

    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        // dead code - workaround so IGC won't move currentHead load out of loop
        if (currentHead > dumpMainBuffer->totalSize)
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            // all earlier batches are published; flush this batch and publish it
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            // this batch sits right after a wrap: wait until cpuHead points at
            // the END_BUFFER marker left at the wrap point, then publish at headStart
            global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}
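// Publication-order example (illustrative): suppose batch A occupies
// [headStart, headStart + a) and batch B occupies [headStart + a,
// headStart + a + b), and B's kernels finish first. B spins in the loop above
// because cpuHead != B's offset; once A runs the loop, cpuHead advances to
// headStart + a, and B's next iteration publishes B as well. cpuHead thus
// moves strictly in allocation order, which is what lets the consumer treat
// [tail, cpuHead) as fully written data.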
/// Sets batch as ready to read and moves cpuHead forward, outputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_outputs(
    global OutputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;
    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
    dword seven = 7;

    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        // dead code - workaround so IGC won't move currentHead load out of loop
        if (currentHead > dumpMainBuffer->totalSize)
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}