/* Copyright © 2022 Bas Nieuwenhuizen
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require

layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

#include "anv_build_interface.h"

layout(push_constant) uniform CONSTS {
   copy_args args;
};

// Layout of serialized data
/**************************************|
| vk_accel_struct_serialization_header |
|--------------------------------------|
| For a TLAS, all handles to the BLAS  |
| within this TLAS.                    |
| For a BLAS, nothing.                 |
|--------------------------------------|
| Driver-specific part.                |
| For Intel, this starts with          |
| anv_accel_struct_header as drawn     |
| in anv_bvh.h                         |
|**************************************/

/*
 * Explanation of BLAS handles:
 * According to the spec of vkCmdCopyAccelerationStructureToMemoryKHR,
 * for a TLAS, the handles of all BLAS/instances within this TLAS are
 * tightly stored after vk_accel_struct_serialization_header, making this
 * serialized-memory a semi-opaque object. The application might be able
 * to swap/replace these handles with other handles. In fact this is what
 * dEQP-VK.ray_tracing_pipeline.acceleration_structures.header_bottom_address.*
 * is doing.
 *
 * Therefore, if the application updates the handles, we need to replace
 * the old handles in anv_instance_leaf with the new ones.
To access anv_instance_leaf without traversing the TLAS, pointers to these
 * anv_instance_leaf are stored right after anv_accel_struct_header,
 * allowing us to know where they are in the TLAS instantly.
 *
 * Although, the fact that the application can swap/replace new handles
 * of BLAS without rebuilding the TLAS sounds a bit odd.
 */

/*
 * Shader entry point.
 *
 * Depending on args.mode this copies, serializes or deserializes the
 * acceleration structure found at args.src_addr into args.dst_addr. The data
 * is moved 8 bytes at a time, with the words spread across every invocation
 * of the dispatch. For the serialize and deserialize modes, the invocation
 * that copies an entry of the instance-leaf address table also handles the
 * BLAS handle that belongs to that entry.
 */
void
main(void)
{
   /* 128 is the workgroup width declared by local_size_x, so
    * invocation_count is the number of invocations in the whole dispatch and
    * stride is the number of bytes they cover together in one step.
    */
   uint32_t invocation = gl_GlobalInvocationID.x;
   uint32_t invocation_count = gl_NumWorkGroups.x * 128;
   uint32_t stride = invocation_count * 8;

   /* Mask selecting the low 48 bits of a 64-bit value. */
   const uint64_t ptr_mask = 0x0000fffffffffffful;

   uint64_t src_bvh = args.src_addr;
   uint64_t dst_bvh = args.dst_addr;

   /* Serialized input begins with the serialization header followed by one
    * 64-bit handle per instance; step over both to reach the
    * anv_accel_struct_header.
    */
   if (args.mode == ANV_COPY_MODE_DESERIALIZE) {
      REF(vk_accel_struct_serialization_header) src_ser_header =
         REF(vk_accel_struct_serialization_header)(args.src_addr);
      src_bvh += SIZEOF(vk_accel_struct_serialization_header) +
                 DEREF(src_ser_header).instance_count * SIZEOF(uint64_t);
   }

   REF(anv_accel_struct_header) bvh_header = REF(anv_accel_struct_header)(src_bvh);

   /* The addresses of the instance leaves are stored right after the bvh
    * header, occupying the byte range [leaf_table_begin, leaf_table_end).
    * The range stays empty (end == 0) when there are no instances.
    */
   uint64_t leaf_table_begin = SIZEOF(anv_accel_struct_header);
   uint64_t leaf_table_end = DEREF(bvh_header).instance_count * SIZEOF(uint64_t);
   if (leaf_table_end != 0) {
      leaf_table_end += leaf_table_begin;
   }

   /* Array of BLAS handles that follows the serialization header. It is read
    * from the source by default and redirected to the destination below when
    * serializing.
    */
   uint64_t handle_array = args.src_addr + SIZEOF(vk_accel_struct_serialization_header);

   if (args.mode == ANV_COPY_MODE_SERIALIZE) {
      /* Leave room in the output for the serialization header and handles. */
      dst_bvh += SIZEOF(vk_accel_struct_serialization_header) +
                 DEREF(REF(anv_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);

      /* Only the first invocation fills in the serialization header. */
      if (invocation == 0) {
         REF(vk_accel_struct_serialization_header) out_header =
            REF(vk_accel_struct_serialization_header)(args.dst_addr);

         DEREF(out_header).serialization_size = DEREF(bvh_header).serialization_size;
         DEREF(out_header).deserialization_size = DEREF(bvh_header).compacted_size;
         DEREF(out_header).instance_count = DEREF(bvh_header).instance_count;

         for (uint32_t i = 0; i < VK_UUID_SIZE; i++) {
            DEREF(out_header).driver_uuid[i] = args.driver_uuid[i];
            DEREF(out_header).accel_struct_compat[i] = args.accel_struct_compat[i];
         }
      }

      handle_array = args.dst_addr + SIZEOF(vk_accel_struct_serialization_header);
   } else if (args.mode == ANV_COPY_MODE_COPY) {
      /* A plain copy never touches handles: make the table range empty. */
      leaf_table_end = 0;
   }

   uint64_t bvh_size = DEREF(bvh_header).compacted_size;
   for (uint64_t pos = invocation * 8; pos < bvh_size; pos += stride) {
      /* Move one 8-byte word. */
      DEREF(REF(uint64_t)(dst_bvh + pos)) = DEREF(REF(uint64_t)(src_bvh + pos));

      /* The handle fix-up is done by the same invocation that copied the
       * table entry so that no synchronization is needed. Everything outside
       * the instance-leaf address table is a plain copy.
       */
      bool in_leaf_table = pos >= leaf_table_begin && pos < leaf_table_end;
      if (!in_leaf_table || (pos - leaf_table_begin) % SIZEOF(uint64_t) != 0) {
         continue;
      }

      uint64_t slot = (pos - leaf_table_begin) / SIZEOF(uint64_t);

      if (args.mode == ANV_COPY_MODE_SERIALIZE) {
         /* Follow the table entry to its anv_instance_leaf and publish the
          * BLAS pointer found there in the handle array after ser_header.
          */
         uint64_t leaf_addr = DEREF(REF(uint64_t)(src_bvh + pos));
         REF(anv_instance_leaf) leaf = REF(anv_instance_leaf)(leaf_addr);

         uint64_t blas_handle = DEREF(leaf).part1.bvh_ptr & ptr_mask;
         DEREF(INDEX(uint64_t, handle_array, slot)) = blas_handle;
      } else { /* ANV_COPY_MODE_DESERIALIZE */
         /* Follow the freshly copied table entry to its anv_instance_leaf and
          * overwrite bvh_ptr with the handle stored after ser_header.
          */
         uint64_t leaf_addr = DEREF(REF(uint64_t)(dst_bvh + pos));
         REF(anv_instance_leaf) leaf = REF(anv_instance_leaf)(leaf_addr);

         uint64_t blas_handle = DEREF(INDEX(uint64_t, handle_array, slot));
         DEREF(leaf).part1.bvh_ptr = (blas_handle & ptr_mask);

         /* The start node pointer becomes blas_handle +
          * ANV_RT_BVH_HEADER_SIZE. Only the low 48 bits of the combined field
          * are replaced; the bits above them are kept as they were.
          */
         uint64_t start_node = blas_handle + ANV_RT_BVH_HEADER_SIZE;
         DEREF(leaf).part0.start_node_ptr_and_inst_flags =
            (DEREF(leaf).part0.start_node_ptr_and_inst_flags & ~ptr_mask) |
            (start_node & ptr_mask);
      }
   }
}