/*
 * Copyright © 2022 Friedrich Vock
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "bvh/bvh.h"
#include "util/half_float.h"
#include "amd_family.h"
#include "radv_private.h"
#include "vk_acceleration_structure.h"
#include "vk_common_entrypoints.h"

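/* Spells "AMD_RDF " in little-endian ASCII. */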
#define RRA_MAGIC 0x204644525F444D41

struct rra_file_header {
   uint64_t magic;
   uint32_t version;
   uint32_t unused;
   uint64_t chunk_descriptions_offset;
   uint64_t chunk_descriptions_size;
};

static_assert(sizeof(struct rra_file_header) == 32, "rra_file_header does not match RRA spec");

enum rra_chunk_version {
   RADV_RRA_ASIC_API_INFO_CHUNK_VERSION = 0x1,
   RADV_RRA_RAY_HISTORY_CHUNK_VERSION = 0x2,
   RADV_RRA_ACCEL_STRUCT_CHUNK_VERSION = 0xF0005,
};

enum rra_file_api {
   RADV_RRA_API_DX9,
   RADV_RRA_API_DX11,
   RADV_RRA_API_DX12,
   RADV_RRA_API_VULKAN,
   RADV_RRA_API_OPENGL,
   RADV_RRA_API_OPENCL,
   RADV_RRA_API_MANTLE,
   RADV_RRA_API_GENERIC,
};

struct rra_file_chunk_description {
   char name[16];
   uint32_t is_zstd_compressed;
   enum rra_chunk_version version;
   uint64_t header_offset;
   uint64_t header_size;
   uint64_t data_offset;
   uint64_t data_size;
   uint64_t unused;
};

static_assert(sizeof(struct rra_file_chunk_description) == 64, "rra_file_chunk_description does not match RRA spec");

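/*
 * Convert a node id (the node's virtual address divided by 8, with the node
 * type in the low 3 bits) back into an address: shifting left by 19 and
 * arithmetically right by 16 multiplies by 8 while sign-extending the 48-bit
 * address. For example, node id 0x20 decodes to address 0x100.
 */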
static uint64_t
node_to_addr(uint64_t node)
{
   node &= ~7ull;
   node <<= 19;
   return ((int64_t)node) >> 16;
}

static void
rra_dump_header(FILE *output, uint64_t chunk_descriptions_offset, uint64_t chunk_descriptions_size)
{
   struct rra_file_header header = {
      .magic = RRA_MAGIC,
      .version = 3,
      .chunk_descriptions_offset = chunk_descriptions_offset,
      .chunk_descriptions_size = chunk_descriptions_size,
   };
   fwrite(&header, sizeof(header), 1, output);
}

static void
rra_dump_chunk_description(uint64_t offset, uint64_t header_size, uint64_t data_size, const char *name,
                           enum rra_chunk_version version, FILE *output)
{
   struct rra_file_chunk_description chunk = {
      .version = version,
      .header_offset = offset,
      .header_size = header_size,
      .data_offset = offset + header_size,
      .data_size = data_size,
   };
   memcpy(chunk.name, name, strnlen(name, sizeof(chunk.name)));
   fwrite(&chunk, sizeof(struct rra_file_chunk_description), 1, output);
}

enum rra_memory_type {
   RRA_MEMORY_TYPE_UNKNOWN,
   RRA_MEMORY_TYPE_DDR,
   RRA_MEMORY_TYPE_DDR2,
   RRA_MEMORY_TYPE_DDR3,
   RRA_MEMORY_TYPE_DDR4,
   RRA_MEMORY_TYPE_DDR5,
   RRA_MEMORY_TYPE_GDDR3,
   RRA_MEMORY_TYPE_GDDR4,
   RRA_MEMORY_TYPE_GDDR5,
   RRA_MEMORY_TYPE_GDDR6,
   RRA_MEMORY_TYPE_HBM,
   RRA_MEMORY_TYPE_HBM2,
   RRA_MEMORY_TYPE_HBM3,
   RRA_MEMORY_TYPE_LPDDR4,
   RRA_MEMORY_TYPE_LPDDR5,
};

#define RRA_FILE_DEVICE_NAME_MAX_SIZE 256

struct rra_asic_info {
   uint64_t min_shader_clk_freq;
   uint64_t min_mem_clk_freq;
   char unused[8];
   uint64_t max_shader_clk_freq;
   uint64_t max_mem_clk_freq;
   uint32_t device_id;
   uint32_t rev_id;
   char unused2[80];
   uint64_t vram_size;
   uint32_t bus_width;
   char unused3[12];
   char device_name[RRA_FILE_DEVICE_NAME_MAX_SIZE];
   char unused4[16];
   uint32_t mem_ops_per_clk;
   uint32_t mem_type;
   char unused5[135];
   bool valid;
};

static_assert(sizeof(struct rra_asic_info) == 568, "rra_asic_info does not match RRA spec");

static uint32_t
amdgpu_vram_type_to_rra(uint32_t type)
{
   switch (type) {
   case AMD_VRAM_TYPE_UNKNOWN:
      return RRA_MEMORY_TYPE_UNKNOWN;
   case AMD_VRAM_TYPE_DDR2:
      return RRA_MEMORY_TYPE_DDR2;
   case AMD_VRAM_TYPE_DDR3:
      return RRA_MEMORY_TYPE_DDR3;
   case AMD_VRAM_TYPE_DDR4:
      return RRA_MEMORY_TYPE_DDR4;
   case AMD_VRAM_TYPE_DDR5:
      return RRA_MEMORY_TYPE_DDR5;
   case AMD_VRAM_TYPE_HBM:
      return RRA_MEMORY_TYPE_HBM;
   case AMD_VRAM_TYPE_GDDR3:
      return RRA_MEMORY_TYPE_GDDR3;
   case AMD_VRAM_TYPE_GDDR4:
      return RRA_MEMORY_TYPE_GDDR4;
   case AMD_VRAM_TYPE_GDDR5:
      return RRA_MEMORY_TYPE_GDDR5;
   case AMD_VRAM_TYPE_GDDR6:
      return RRA_MEMORY_TYPE_GDDR6;
   case AMD_VRAM_TYPE_LPDDR4:
      return RRA_MEMORY_TYPE_LPDDR4;
   case AMD_VRAM_TYPE_LPDDR5:
      return RRA_MEMORY_TYPE_LPDDR5;
   default:
      unreachable("invalid vram type");
   }
}

static void
rra_dump_asic_info(const struct radeon_info *rad_info, FILE *output)
{
   struct rra_asic_info asic_info = {
      /* All frequencies are in Hz. Use 64-bit multiplies to avoid overflowing
       * 32 bits for clocks above ~4.2 GHz. */
      .min_shader_clk_freq = 0,
      .max_shader_clk_freq = rad_info->max_gpu_freq_mhz * 1000000ull,
      .min_mem_clk_freq = 0,
      .max_mem_clk_freq = rad_info->memory_freq_mhz * 1000000ull,

      .vram_size = (uint64_t)rad_info->vram_size_kb * 1024,

      .mem_type = amdgpu_vram_type_to_rra(rad_info->vram_type),
      .mem_ops_per_clk = ac_memory_ops_per_clock(rad_info->vram_type),
      .bus_width = rad_info->memory_bus_width,

      .device_id = rad_info->pci.dev,
      .rev_id = rad_info->pci_rev_id,
   };

   strncpy(asic_info.device_name, rad_info->marketing_name ? rad_info->marketing_name : rad_info->name,
           RRA_FILE_DEVICE_NAME_MAX_SIZE - 1);

   fwrite(&asic_info, sizeof(struct rra_asic_info), 1, output);
}

enum rra_bvh_type {
   RRA_BVH_TYPE_TLAS,
   RRA_BVH_TYPE_BLAS,
};

struct rra_accel_struct_chunk_header {
   /*
    * Declaring this as uint64_t would make the compiler insert padding to
    * satisfy alignment restrictions.
    */
   uint32_t virtual_address[2];
   uint32_t metadata_offset;
   uint32_t metadata_size;
   uint32_t header_offset;
   uint32_t header_size;
   enum rra_bvh_type bvh_type;
};

static_assert(sizeof(struct rra_accel_struct_chunk_header) == 28,
              "rra_accel_struct_chunk_header does not match RRA spec");

struct rra_accel_struct_post_build_info {
   uint32_t bvh_type : 1;
   uint32_t reserved1 : 5;
   uint32_t tri_compression_mode : 2;
   uint32_t fp16_interior_mode : 2;
   uint32_t reserved2 : 6;
   uint32_t build_flags : 16;
};

static_assert(sizeof(struct rra_accel_struct_post_build_info) == 4,
              "rra_accel_struct_post_build_info does not match RRA spec");

struct rra_accel_struct_header {
   struct rra_accel_struct_post_build_info post_build_info;
   /*
    * Size of the internal acceleration structure metadata in the
    * proprietary drivers. Seems to always be 128.
    */
   uint32_t metadata_size;
   uint32_t file_size;
   uint32_t primitive_count;
   uint32_t active_primitive_count;
   uint32_t unused1;
   uint32_t geometry_description_count;
   VkGeometryTypeKHR geometry_type;
   uint32_t internal_nodes_offset;
   uint32_t leaf_nodes_offset;
   uint32_t geometry_infos_offset;
   uint32_t leaf_ids_offset;
   uint32_t interior_fp32_node_count;
   uint32_t interior_fp16_node_count;
   uint32_t leaf_node_count;
   uint32_t rt_driver_interface_version;
   uint64_t unused2;
   uint32_t half_fp32_node_count;
   char unused3[44];
};

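/* The root node is placed at the first 64-byte-aligned offset after the header. */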
#define RRA_ROOT_NODE_OFFSET align(sizeof(struct rra_accel_struct_header), 64)

static_assert(sizeof(struct rra_accel_struct_header) == 120, "rra_accel_struct_header does not match RRA spec");

struct rra_accel_struct_metadata {
   uint64_t virtual_address;
   uint32_t byte_size;
   char unused[116];
};

static_assert(sizeof(struct rra_accel_struct_metadata) == 128, "rra_accel_struct_metadata does not match RRA spec");

struct rra_geometry_info {
   uint32_t primitive_count : 29;
   uint32_t flags : 3;
   uint32_t unknown;
   uint32_t leaf_node_list_offset;
};

static_assert(sizeof(struct rra_geometry_info) == 12, "rra_geometry_info does not match RRA spec");

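/*
 * On-disk layout of a "RawAccelStruct" chunk (see the fwrite sequence in
 * rra_dump_acceleration_structure): chunk header, metadata, parent id table,
 * acceleration structure header, internal nodes, leaf nodes, per-geometry
 * infos (BLAS only) and the leaf node id list.
 */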
static struct rra_accel_struct_header
rra_fill_accel_struct_header_common(struct radv_accel_struct_header *header, size_t parent_id_table_size,
                                    size_t leaf_node_data_size, size_t internal_node_data_size,
                                    uint64_t primitive_count)
{
   struct rra_accel_struct_header result = {
      .post_build_info =
         {
            .build_flags = header->build_flags,
            /* Seems to be no compression */
            .tri_compression_mode = 0,
         },
      .primitive_count = primitive_count,
      /* TODO: calculate active primitives */
      .active_primitive_count = primitive_count,
      .geometry_description_count = header->geometry_count,
      .interior_fp32_node_count = internal_node_data_size / sizeof(struct radv_bvh_box32_node),
      .leaf_node_count = primitive_count,
   };

   result.metadata_size = sizeof(struct rra_accel_struct_metadata) + parent_id_table_size;
   result.file_size =
      result.metadata_size + sizeof(struct rra_accel_struct_header) + internal_node_data_size + leaf_node_data_size;

   result.internal_nodes_offset = sizeof(struct rra_accel_struct_metadata);
   result.leaf_nodes_offset = result.internal_nodes_offset + internal_node_data_size;
   result.geometry_infos_offset = result.leaf_nodes_offset + leaf_node_data_size;
   result.leaf_ids_offset = result.geometry_infos_offset;
   if (!header->instance_count)
      result.leaf_ids_offset += header->geometry_count * sizeof(struct rra_geometry_info);

   return result;
}

struct rra_box32_node {
   uint32_t children[4];
   float coords[4][2][3];
   uint32_t reserved[4];
};

struct rra_box16_node {
   uint32_t children[4];
   float16_t coords[4][2][3];
};

/*
 * RRA files contain this struct in place of hardware
 * instance nodes. They're named "instance desc" internally.
 */
struct rra_instance_node {
   float wto_matrix[12];
   uint32_t custom_instance_id : 24;
   uint32_t mask : 8;
   uint32_t sbt_offset : 24;
   uint32_t instance_flags : 8;
   uint64_t blas_va : 54;
   uint64_t hw_instance_flags : 10;
   uint32_t instance_id;
   uint32_t unused1;
   uint32_t blas_metadata_size;
   uint32_t unused2;
   float otw_matrix[12];
};

static_assert(sizeof(struct rra_instance_node) == 128, "rra_instance_node does not match RRA spec!");

/*
 * Format RRA uses for aabb nodes
 */
struct rra_aabb_node {
   float aabb[2][3];
   uint32_t unused1[6];
   uint32_t geometry_id : 28;
   uint32_t flags : 4;
   uint32_t primitive_id;
   uint32_t unused[2];
};

static_assert(sizeof(struct rra_aabb_node) == 64, "rra_aabb_node does not match RRA spec!");

struct rra_triangle_node {
   float coords[3][3];
   uint32_t reserved[3];
   uint32_t geometry_id : 28;
   uint32_t flags : 4;
   uint32_t triangle_id;
   uint32_t reserved2;
   uint32_t id;
};

static_assert(sizeof(struct rra_triangle_node) == 64, "rra_triangle_node does not match RRA spec!");

static void
rra_dump_tlas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size, size_t leaf_node_data_size,
                     size_t internal_node_data_size, uint64_t primitive_count, FILE *output)
{
   struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common(
      header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count);
   file_header.post_build_info.bvh_type = RRA_BVH_TYPE_TLAS;
   file_header.geometry_type = VK_GEOMETRY_TYPE_INSTANCES_KHR;

   fwrite(&file_header, sizeof(struct rra_accel_struct_header), 1, output);
}

static void
rra_dump_blas_header(struct radv_accel_struct_header *header, size_t parent_id_table_size,
                     struct radv_accel_struct_geometry_info *geometry_infos, size_t leaf_node_data_size,
                     size_t internal_node_data_size, uint64_t primitive_count, FILE *output)
{
   struct rra_accel_struct_header file_header = rra_fill_accel_struct_header_common(
      header, parent_id_table_size, leaf_node_data_size, internal_node_data_size, primitive_count);
   file_header.post_build_info.bvh_type = RRA_BVH_TYPE_BLAS;
   file_header.geometry_type = header->geometry_count ? geometry_infos->type : VK_GEOMETRY_TYPE_TRIANGLES_KHR;

   fwrite(&file_header, sizeof(struct rra_accel_struct_header), 1, output);
}

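/*
 * The parent id table holds one id per 64-byte node slot and is filled
 * back-to-front: the root node's entry is the last one in the table.
 */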
static uint32_t
rra_parent_table_index_from_offset(uint32_t offset, uint32_t parent_table_size)
{
   uint32_t max_parent_table_index = parent_table_size / sizeof(uint32_t) - 1;
   return max_parent_table_index - (offset - RRA_ROOT_NODE_OFFSET) / 64;
}

struct rra_validation_context {
   bool failed;
   char location[31];
};

static void PRINTFLIKE(2, 3) rra_validation_fail(struct rra_validation_context *ctx, const char *message, ...)
{
   if (!ctx->failed) {
      fprintf(stderr, "radv: rra: Validation failed at %s:\n", ctx->location);
      ctx->failed = true;
   }

   fprintf(stderr, " ");

   va_list list;
   va_start(list, message);
   vfprintf(stderr, message, list);
   va_end(list);

   fprintf(stderr, "\n");
}

static bool
rra_validate_header(struct radv_rra_accel_struct_data *accel_struct, const struct radv_accel_struct_header *header)
{
   struct rra_validation_context ctx = {
      .location = "header",
   };

   if (accel_struct->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && header->instance_count > 0)
      rra_validation_fail(&ctx, "BLAS contains instances");

   if (header->bvh_offset >= accel_struct->size)
      rra_validation_fail(&ctx, "Invalid BVH offset %u", header->bvh_offset);

   if (header->instance_count * sizeof(struct radv_bvh_instance_node) >= accel_struct->size)
      rra_validation_fail(&ctx, "Too many instances");

   return ctx.failed;
}

static bool
is_internal_node(uint32_t type)
{
   return type == radv_bvh_node_box16 || type == radv_bvh_node_box32;
}

static const char *node_type_names[8] = {
   [radv_bvh_node_triangle + 0] = "triangle0",
   [radv_bvh_node_triangle + 1] = "triangle1",
   [radv_bvh_node_triangle + 2] = "triangle2",
   [radv_bvh_node_triangle + 3] = "triangle3",
   [radv_bvh_node_box16] = "box16",
   [radv_bvh_node_box32] = "box32",
   [radv_bvh_node_instance] = "instance",
   [radv_bvh_node_aabb] = "aabb",
};

static bool
rra_validate_node(struct hash_table_u64 *accel_struct_vas, uint8_t *data, void *node, uint32_t geometry_count,
                  uint32_t size, bool is_bottom_level)
{
   struct rra_validation_context ctx = {0};

   uint32_t cur_offset = (uint8_t *)node - data;
   snprintf(ctx.location, sizeof(ctx.location), "internal node (offset=%u)", cur_offset);

   /* The child ids are located at offset=0 for both box16 and box32 nodes. */
   uint32_t *children = node;
   for (uint32_t i = 0; i < 4; ++i) {
      if (children[i] == 0xFFFFFFFF)
         continue;

      uint32_t type = children[i] & 7;
      uint32_t offset = (children[i] & (~7u)) << 3;

      if (!is_internal_node(type) && is_bottom_level == (type == radv_bvh_node_instance))
         rra_validation_fail(&ctx,
                             is_bottom_level ? "%s node in BLAS (child index %u)" : "%s node in TLAS (child index %u)",
                             node_type_names[type], i);

      if (offset > size) {
         rra_validation_fail(&ctx, "Invalid child offset (child index %u)", i);
         continue;
      }

      struct rra_validation_context child_ctx = {0};
      snprintf(child_ctx.location, sizeof(child_ctx.location), "%s node (offset=%u)", node_type_names[type], offset);

      if (is_internal_node(type)) {
         ctx.failed |= rra_validate_node(accel_struct_vas, data, data + offset, geometry_count, size, is_bottom_level);
      } else if (type == radv_bvh_node_instance) {
         struct radv_bvh_instance_node *src = (struct radv_bvh_instance_node *)(data + offset);
         uint64_t blas_va = node_to_addr(src->bvh_ptr) - src->bvh_offset;
         if (!_mesa_hash_table_u64_search(accel_struct_vas, blas_va))
            rra_validation_fail(&child_ctx, "Invalid instance node pointer 0x%llx (offset: 0x%x)",
                                (unsigned long long)src->bvh_ptr, src->bvh_offset);
      } else if (type == radv_bvh_node_aabb) {
         struct radv_bvh_aabb_node *src = (struct radv_bvh_aabb_node *)(data + offset);
         if ((src->geometry_id_and_flags & 0xFFFFFFF) >= geometry_count)
            rra_validation_fail(&child_ctx, "geometry_id >= geometry_count");
      } else {
         struct radv_bvh_triangle_node *src = (struct radv_bvh_triangle_node *)(data + offset);
         if ((src->geometry_id_and_flags & 0xFFFFFFF) >= geometry_count)
            rra_validation_fail(&child_ctx, "geometry_id >= geometry_count");
      }

      ctx.failed |= child_ctx.failed;
   }
   return ctx.failed;
}

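/*
 * State shared by the BVH transcoding functions. Internal nodes are written
 * starting at dst_internal_offset and leaf nodes starting at dst_leaf_offset;
 * leaf_indices tracks, for each geometry, the next free slot in leaf_node_ids.
 */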
struct rra_transcoding_context {
   const uint8_t *src;
   uint8_t *dst;
   uint32_t dst_leaf_offset;
   uint32_t dst_internal_offset;
   uint32_t *parent_id_table;
   uint32_t parent_id_table_size;
   uint32_t *leaf_node_ids;
   uint32_t *leaf_indices;
};

static void
rra_transcode_triangle_node(struct rra_transcoding_context *ctx, const struct radv_bvh_triangle_node *src)
{
   struct rra_triangle_node *dst = (struct rra_triangle_node *)(ctx->dst + ctx->dst_leaf_offset);
   ctx->dst_leaf_offset += sizeof(struct rra_triangle_node);

   for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 3; ++j)
         dst->coords[i][j] = src->coords[i][j];
   dst->triangle_id = src->triangle_id;
   dst->geometry_id = src->geometry_id_and_flags & 0xfffffff;
   dst->flags = src->geometry_id_and_flags >> 28;
   dst->id = src->id;
}

static void
rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, radv_aabb bounds)
{
   struct rra_aabb_node *dst = (struct rra_aabb_node *)(ctx->dst + ctx->dst_leaf_offset);
   ctx->dst_leaf_offset += sizeof(struct rra_aabb_node);

   dst->aabb[0][0] = bounds.min.x;
   dst->aabb[0][1] = bounds.min.y;
   dst->aabb[0][2] = bounds.min.z;
   dst->aabb[1][0] = bounds.max.x;
   dst->aabb[1][1] = bounds.max.y;
   dst->aabb[1][2] = bounds.max.z;

   dst->geometry_id = src->geometry_id_and_flags & 0xfffffff;
   dst->flags = src->geometry_id_and_flags >> 28;
   dst->primitive_id = src->primitive_id;
}

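/*
 * RRA stores BLAS addresses divided by 8 to fit the 54-bit blas_va field,
 * and expects them to point past the metadata, at the acceleration structure
 * header.
 */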
static void
rra_transcode_instance_node(struct rra_transcoding_context *ctx, const struct radv_bvh_instance_node *src)
{
   uint64_t blas_va = node_to_addr(src->bvh_ptr) - src->bvh_offset;

   struct rra_instance_node *dst = (struct rra_instance_node *)(ctx->dst + ctx->dst_leaf_offset);
   ctx->dst_leaf_offset += sizeof(struct rra_instance_node);

   dst->custom_instance_id = src->custom_instance_and_mask & 0xffffff;
   dst->mask = src->custom_instance_and_mask >> 24;
   dst->sbt_offset = src->sbt_offset_and_flags & 0xffffff;
   dst->instance_flags = src->sbt_offset_and_flags >> 24;
   dst->blas_va = (blas_va + sizeof(struct rra_accel_struct_metadata)) >> 3;
   dst->instance_id = src->instance_id;
   dst->blas_metadata_size = sizeof(struct rra_accel_struct_metadata);

   memcpy(dst->wto_matrix, src->wto_matrix.values, sizeof(dst->wto_matrix));
   memcpy(dst->otw_matrix, src->otw_matrix.values, sizeof(dst->otw_matrix));
}

static uint32_t rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
                                   radv_aabb bounds);

static void
rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box16_node *src)
{
   uint32_t dst_offset = ctx->dst_internal_offset;
   ctx->dst_internal_offset += sizeof(struct rra_box16_node);
   struct rra_box16_node *dst = (struct rra_box16_node *)(ctx->dst + dst_offset);

   memcpy(dst->coords, src->coords, sizeof(dst->coords));

   for (uint32_t i = 0; i < 4; ++i) {
      if (src->children[i] == 0xffffffff) {
         dst->children[i] = 0xffffffff;
         continue;
      }

      radv_aabb bounds = {
         .min =
            {
               _mesa_half_to_float(src->coords[i][0][0]),
               _mesa_half_to_float(src->coords[i][0][1]),
               _mesa_half_to_float(src->coords[i][0][2]),
            },
         .max =
            {
               _mesa_half_to_float(src->coords[i][1][0]),
               _mesa_half_to_float(src->coords[i][1][1]),
               _mesa_half_to_float(src->coords[i][1][2]),
            },
      };

      dst->children[i] = rra_transcode_node(ctx, radv_bvh_node_box16 | (dst_offset >> 3), src->children[i], bounds);
   }
}

static void
rra_transcode_box32_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box32_node *src)
{
   uint32_t dst_offset = ctx->dst_internal_offset;
   ctx->dst_internal_offset += sizeof(struct rra_box32_node);
   struct rra_box32_node *dst = (struct rra_box32_node *)(ctx->dst + dst_offset);

   memcpy(dst->coords, src->coords, sizeof(dst->coords));

   for (uint32_t i = 0; i < 4; ++i) {
      if (isnan(src->coords[i].min.x)) {
         dst->children[i] = 0xffffffff;
         continue;
      }

      dst->children[i] =
         rra_transcode_node(ctx, radv_bvh_node_box32 | (dst_offset >> 3), src->children[i], src->coords[i]);
   }
}

static uint32_t
get_geometry_id(const void *node, uint32_t node_type)
{
   if (node_type == radv_bvh_node_triangle) {
      const struct radv_bvh_triangle_node *triangle = node;
      return triangle->geometry_id_and_flags & 0xFFFFFFF;
   }

   if (node_type == radv_bvh_node_aabb) {
      const struct radv_bvh_aabb_node *aabb = node;
      return aabb->geometry_id_and_flags & 0xFFFFFFF;
   }

   return 0;
}

static uint32_t
rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, radv_aabb bounds)
{
   uint32_t node_type = src_id & 7;
   uint32_t src_offset = (src_id & (~7u)) << 3;

   uint32_t dst_offset;

   const void *src_child_node = ctx->src + src_offset;
   if (is_internal_node(node_type)) {
      dst_offset = ctx->dst_internal_offset;
      if (node_type == radv_bvh_node_box32)
         rra_transcode_box32_node(ctx, src_child_node);
      else
         rra_transcode_box16_node(ctx, src_child_node);
   } else {
      dst_offset = ctx->dst_leaf_offset;

      if (node_type == radv_bvh_node_triangle)
         rra_transcode_triangle_node(ctx, src_child_node);
      else if (node_type == radv_bvh_node_aabb)
         rra_transcode_aabb_node(ctx, src_child_node, bounds);
      else if (node_type == radv_bvh_node_instance)
         rra_transcode_instance_node(ctx, src_child_node);
   }

   uint32_t parent_id_index = rra_parent_table_index_from_offset(dst_offset, ctx->parent_id_table_size);
   ctx->parent_id_table[parent_id_index] = parent_id;

   uint32_t dst_id = node_type | (dst_offset >> 3);
   if (!is_internal_node(node_type))
      ctx->leaf_node_ids[ctx->leaf_indices[get_geometry_id(src_child_node, node_type)]++] = dst_id;

   return dst_id;
}

struct rra_bvh_info {
   uint32_t leaf_nodes_size;
   uint32_t internal_nodes_size;
   struct rra_geometry_info *geometry_infos;
};

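/*
 * First pass over the source BVH: sum up the transcoded sizes of internal and
 * leaf nodes and count primitives per geometry, so buffers can be allocated
 * up front before rra_transcode_node() writes the actual nodes.
 */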
static void
rra_gather_bvh_info(const uint8_t *bvh, uint32_t node_id, struct rra_bvh_info *dst)
{
   uint32_t node_type = node_id & 7;

   switch (node_type) {
   case radv_bvh_node_box16:
      dst->internal_nodes_size += sizeof(struct rra_box16_node);
      break;
   case radv_bvh_node_box32:
      dst->internal_nodes_size += sizeof(struct rra_box32_node);
      break;
   case radv_bvh_node_instance:
      dst->leaf_nodes_size += sizeof(struct rra_instance_node);
      break;
   case radv_bvh_node_triangle:
      dst->leaf_nodes_size += sizeof(struct rra_triangle_node);
      break;
   case radv_bvh_node_aabb:
      dst->leaf_nodes_size += sizeof(struct rra_aabb_node);
      break;
   default:
      break;
   }

   const void *node = bvh + ((node_id & (~7u)) << 3);
   if (is_internal_node(node_type)) {
      /* The child ids are located at offset=0 for both box16 and box32 nodes. */
      const uint32_t *children = node;
      for (uint32_t i = 0; i < 4; i++)
         if (children[i] != 0xffffffff)
            rra_gather_bvh_info(bvh, children[i], dst);
   } else {
      dst->geometry_infos[get_geometry_id(node, node_type)].primitive_count++;
   }
}

static VkResult
rra_dump_acceleration_structure(struct radv_rra_accel_struct_data *accel_struct, uint8_t *data,
                                struct hash_table_u64 *accel_struct_vas, bool should_validate, FILE *output)
{
   struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;

   bool is_tlas = header->instance_count > 0;

   uint64_t geometry_infos_offset = sizeof(struct radv_accel_struct_header);

   /* convert root node id to offset */
   uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;

   if (should_validate) {
      if (rra_validate_header(accel_struct, header)) {
         return VK_ERROR_VALIDATION_FAILED_EXT;
      }
      if (rra_validate_node(accel_struct_vas, data + header->bvh_offset, data + header->bvh_offset + src_root_offset,
                            header->geometry_count, accel_struct->size, !is_tlas)) {
         return VK_ERROR_VALIDATION_FAILED_EXT;
      }
   }

   VkResult result = VK_SUCCESS;

   struct rra_geometry_info *rra_geometry_infos = NULL;
   uint32_t *leaf_indices = NULL;
   uint32_t *node_parent_table = NULL;
   uint32_t *leaf_node_ids = NULL;
   uint8_t *dst_structure_data = NULL;

   rra_geometry_infos = calloc(header->geometry_count, sizeof(struct rra_geometry_info));
   if (!rra_geometry_infos) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto exit;
   }

   struct rra_bvh_info bvh_info = {
      .geometry_infos = rra_geometry_infos,
   };
   rra_gather_bvh_info(data + header->bvh_offset, RADV_BVH_ROOT_NODE, &bvh_info);

   leaf_indices = calloc(header->geometry_count, sizeof(uint32_t));
   if (!leaf_indices) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto exit;
   }

   uint64_t primitive_count = 0;

   struct radv_accel_struct_geometry_info *geometry_infos =
      (struct radv_accel_struct_geometry_info *)(data + geometry_infos_offset);

   for (uint32_t i = 0; i < header->geometry_count; ++i) {
      rra_geometry_infos[i].flags = geometry_infos[i].flags;
      rra_geometry_infos[i].leaf_node_list_offset = primitive_count * sizeof(uint32_t);
      leaf_indices[i] = primitive_count;
      primitive_count += rra_geometry_infos[i].primitive_count;
   }

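   /* One parent id is stored per 64-byte node slot. */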
   uint32_t node_parent_table_size =
      ((bvh_info.leaf_nodes_size + bvh_info.internal_nodes_size) / 64) * sizeof(uint32_t);

   node_parent_table = calloc(node_parent_table_size, 1);
   if (!node_parent_table) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto exit;
   }

   leaf_node_ids = calloc(primitive_count, sizeof(uint32_t));
   if (!leaf_node_ids) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto exit;
   }
   dst_structure_data = calloc(RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size, 1);
   if (!dst_structure_data) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto exit;
   }

   struct rra_transcoding_context ctx = {
      .src = data + header->bvh_offset,
      .dst = dst_structure_data,
      .dst_leaf_offset = RRA_ROOT_NODE_OFFSET + bvh_info.internal_nodes_size,
      .dst_internal_offset = RRA_ROOT_NODE_OFFSET,
      .parent_id_table = node_parent_table,
      .parent_id_table_size = node_parent_table_size,
      .leaf_node_ids = leaf_node_ids,
      .leaf_indices = leaf_indices,
   };

   rra_transcode_node(&ctx, 0xFFFFFFFF, RADV_BVH_ROOT_NODE, header->aabb);

   struct rra_accel_struct_chunk_header chunk_header = {
      .metadata_offset = 0,
      /*
       * RRA loads the part of the metadata that is used into a struct.
       * If the size is larger than just the "used" part, the loading
       * operation overwrites internal pointers with data from the file,
       * likely causing a crash.
       */
      .metadata_size = offsetof(struct rra_accel_struct_metadata, unused),
      .header_offset = sizeof(struct rra_accel_struct_metadata) + node_parent_table_size,
      .header_size = sizeof(struct rra_accel_struct_header),
      .bvh_type = is_tlas ? RRA_BVH_TYPE_TLAS : RRA_BVH_TYPE_BLAS,
   };

   /*
    * When associating TLASes with BLASes, acceleration structure VAs are
    * looked up in a hashmap. But due to the way BLAS VAs are stored for
    * each instance in the RRA file format (divided by 8, and limited to 54 bits),
    * the top bits are masked away.
    * In order to make sure BLASes can be found in the hashmap, we have
    * to replicate that mask here.
    */
   uint64_t va = accel_struct->va & 0x1FFFFFFFFFFFFFF;
   memcpy(chunk_header.virtual_address, &va, sizeof(uint64_t));

   struct rra_accel_struct_metadata rra_metadata = {
      .virtual_address = va,
      .byte_size = bvh_info.leaf_nodes_size + bvh_info.internal_nodes_size + sizeof(struct rra_accel_struct_header),
   };

   fwrite(&chunk_header, sizeof(struct rra_accel_struct_chunk_header), 1, output);
   fwrite(&rra_metadata, sizeof(struct rra_accel_struct_metadata), 1, output);

   /* Write node parent id data */
   fwrite(node_parent_table, 1, node_parent_table_size, output);

   if (is_tlas)
      rra_dump_tlas_header(header, node_parent_table_size, bvh_info.leaf_nodes_size, bvh_info.internal_nodes_size,
                           primitive_count, output);
   else
      rra_dump_blas_header(header, node_parent_table_size, geometry_infos, bvh_info.leaf_nodes_size,
                           bvh_info.internal_nodes_size, primitive_count, output);

   /* Write acceleration structure data */
   fwrite(dst_structure_data + RRA_ROOT_NODE_OFFSET, 1, bvh_info.internal_nodes_size + bvh_info.leaf_nodes_size,
          output);

   if (!is_tlas)
      fwrite(rra_geometry_infos, sizeof(struct rra_geometry_info), header->geometry_count, output);

   /* Write leaf node ids */
   uint32_t leaf_node_list_size = primitive_count * sizeof(uint32_t);
   fwrite(leaf_node_ids, 1, leaf_node_list_size, output);

exit:
   free(rra_geometry_infos);
   free(leaf_indices);
   free(dst_structure_data);
   free(node_parent_table);
   free(leaf_node_ids);

   return result;
}

VkResult
radv_rra_trace_init(struct radv_device *device)
{
   device->rra_trace.validate_as = debug_get_bool_option("RADV_RRA_TRACE_VALIDATE", false);
   device->rra_trace.copy_after_build = debug_get_bool_option("RADV_RRA_TRACE_COPY_AFTER_BUILD", false);
   device->rra_trace.accel_structs = _mesa_pointer_hash_table_create(NULL);
   device->rra_trace.accel_struct_vas = _mesa_hash_table_u64_create(NULL);
   simple_mtx_init(&device->rra_trace.data_mtx, mtx_plain);

   device->rra_trace.copy_memory_index = radv_find_memory_index(
      device->physical_device,
      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);

   util_dynarray_init(&device->rra_trace.ray_history, NULL);

   device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
   if (device->rra_trace.ray_history_buffer_size <
       sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
      return VK_SUCCESS;

   device->rra_trace.ray_history_resolution_scale = debug_get_num_option("RADV_RRA_TRACE_RESOLUTION_SCALE", 1);
   device->rra_trace.ray_history_resolution_scale = MAX2(device->rra_trace.ray_history_resolution_scale, 1);

   VkBufferCreateInfo buffer_create_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .pNext =
         &(VkBufferUsageFlags2CreateInfoKHR){
            .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
            .usage = VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR | VK_BUFFER_USAGE_2_SHADER_DEVICE_ADDRESS_BIT_KHR,
         },
      .size = device->rra_trace.ray_history_buffer_size,
   };

   VkDevice _device = radv_device_to_handle(device);
   VkResult result = radv_CreateBuffer(_device, &buffer_create_info, NULL, &device->rra_trace.ray_history_buffer);
   if (result != VK_SUCCESS)
      return result;

   VkMemoryRequirements requirements;
   vk_common_GetBufferMemoryRequirements(_device, device->rra_trace.ray_history_buffer, &requirements);

   VkMemoryAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
      .allocationSize = requirements.size,
      .memoryTypeIndex = radv_find_memory_index(device->physical_device, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                                                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT),
   };

   result = radv_AllocateMemory(_device, &alloc_info, NULL, &device->rra_trace.ray_history_memory);
   if (result != VK_SUCCESS)
      return result;

   result = vk_common_MapMemory(_device, device->rra_trace.ray_history_memory, 0, VK_WHOLE_SIZE, 0,
                                (void **)&device->rra_trace.ray_history_data);
   if (result != VK_SUCCESS)
      return result;

   result = vk_common_BindBufferMemory(_device, device->rra_trace.ray_history_buffer,
                                       device->rra_trace.ray_history_memory, 0);

   VkBufferDeviceAddressInfo addr_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
      .buffer = device->rra_trace.ray_history_buffer,
   };
   device->rra_trace.ray_history_addr = radv_GetBufferDeviceAddress(_device, &addr_info);

   struct radv_ray_history_header *ray_history_header = device->rra_trace.ray_history_data;
   memset(ray_history_header, 0, sizeof(struct radv_ray_history_header));
   ray_history_header->offset = 1;

   return result;
}

void
radv_rra_trace_clear_ray_history(VkDevice _device, struct radv_rra_trace_data *data)
{
   util_dynarray_foreach (&data->ray_history, struct radv_rra_ray_history_data *, _entry) {
      struct radv_rra_ray_history_data *entry = *_entry;
      free(entry);
   }
   util_dynarray_clear(&data->ray_history);
}

void
radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
{
   radv_DestroyBuffer(vk_device, data->ray_history_buffer, NULL);

   if (data->ray_history_memory)
      vk_common_UnmapMemory(vk_device, data->ray_history_memory);

   radv_FreeMemory(vk_device, data->ray_history_memory, NULL);

   radv_rra_trace_clear_ray_history(vk_device, data);
   util_dynarray_fini(&data->ray_history);

   if (data->accel_structs)
      hash_table_foreach (data->accel_structs, entry)
         radv_destroy_rra_accel_struct_data(vk_device, entry->data);

   simple_mtx_destroy(&data->data_mtx);
   _mesa_hash_table_destroy(data->accel_structs, NULL);
   _mesa_hash_table_u64_destroy(data->accel_struct_vas);
}

void
radv_destroy_rra_accel_struct_data(VkDevice device, struct radv_rra_accel_struct_data *data)
{
   radv_DestroyEvent(device, data->build_event, NULL);
   radv_DestroyBuffer(device, data->buffer, NULL);
   radv_FreeMemory(device, data->memory, NULL);
   free(data);
}

static int
accel_struct_entry_cmp(const void *a, const void *b)
{
   struct hash_entry *entry_a = *(struct hash_entry *const *)a;
   struct hash_entry *entry_b = *(struct hash_entry *const *)b;
   const struct radv_rra_accel_struct_data *s_a = entry_a->data;
   const struct radv_rra_accel_struct_data *s_b = entry_b->data;

   return s_a->va > s_b->va ? 1 : s_a->va < s_b->va ? -1 : 0;
}

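/*
 * Helper for reading back acceleration structure contents: unless the data
 * was already copied to host-visible memory at build time
 * (RADV_RRA_TRACE_COPY_AFTER_BUILD), it is copied on demand into a single
 * staging buffer sized for the largest acceleration structure.
 */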
struct rra_copy_context {
   VkDevice device;
   VkQueue queue;

   VkCommandPool pool;
   VkCommandBuffer cmd_buffer;
   uint32_t family_index;

   VkDeviceMemory memory;
   VkBuffer buffer;
   void *mapped_data;

   struct hash_entry **entries;

   uint32_t min_size;
};

static VkResult
rra_copy_context_init(struct rra_copy_context *ctx)
{
   RADV_FROM_HANDLE(radv_device, device, ctx->device);
   if (device->rra_trace.copy_after_build)
      return VK_SUCCESS;

   uint32_t max_size = ctx->min_size;
   uint32_t accel_struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
   for (unsigned i = 0; i < accel_struct_count; i++) {
      struct radv_rra_accel_struct_data *data = ctx->entries[i]->data;
      max_size = MAX2(max_size, data->size);
   }

   VkCommandPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = ctx->family_index,
   };

   VkResult result = vk_common_CreateCommandPool(ctx->device, &pool_info, NULL, &ctx->pool);
   if (result != VK_SUCCESS)
      return result;

   VkCommandBufferAllocateInfo cmdbuf_alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = ctx->pool,
      .commandBufferCount = 1,
   };

   result = vk_common_AllocateCommandBuffers(ctx->device, &cmdbuf_alloc_info, &ctx->cmd_buffer);
   if (result != VK_SUCCESS)
      goto fail_pool;

   VkBufferCreateInfo buffer_create_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .pNext =
         &(VkBufferUsageFlags2CreateInfoKHR){
            .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
            .usage = VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR,
         },
      .size = max_size,
   };

   result = radv_CreateBuffer(ctx->device, &buffer_create_info, NULL, &ctx->buffer);
   if (result != VK_SUCCESS)
      goto fail_pool;

   VkMemoryRequirements requirements;
   vk_common_GetBufferMemoryRequirements(ctx->device, ctx->buffer, &requirements);

   VkMemoryAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
      .allocationSize = requirements.size,
      .memoryTypeIndex = device->rra_trace.copy_memory_index,
   };

   result = radv_AllocateMemory(ctx->device, &alloc_info, NULL, &ctx->memory);
   if (result != VK_SUCCESS)
      goto fail_buffer;

   result = vk_common_MapMemory(ctx->device, ctx->memory, 0, VK_WHOLE_SIZE, 0, (void **)&ctx->mapped_data);
   if (result != VK_SUCCESS)
      goto fail_memory;

   result = vk_common_BindBufferMemory(ctx->device, ctx->buffer, ctx->memory, 0);
   if (result != VK_SUCCESS)
      goto fail_memory;

   return result;
fail_memory:
   radv_FreeMemory(ctx->device, ctx->memory, NULL);
fail_buffer:
   radv_DestroyBuffer(ctx->device, ctx->buffer, NULL);
fail_pool:
   vk_common_DestroyCommandPool(ctx->device, ctx->pool, NULL);
   return result;
}

static void
rra_copy_context_finish(struct rra_copy_context *ctx)
{
   RADV_FROM_HANDLE(radv_device, device, ctx->device);
   if (device->rra_trace.copy_after_build)
      return;

   vk_common_DestroyCommandPool(ctx->device, ctx->pool, NULL);
   radv_DestroyBuffer(ctx->device, ctx->buffer, NULL);
   vk_common_UnmapMemory(ctx->device, ctx->memory);
   radv_FreeMemory(ctx->device, ctx->memory, NULL);
}

static void *
rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
{
   struct radv_rra_accel_struct_data *data = ctx->entries[i]->data;
   if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
      return NULL;

   if (data->memory) {
      void *mapped_data;
      vk_common_MapMemory(ctx->device, data->memory, 0, VK_WHOLE_SIZE, 0, &mapped_data);
      return mapped_data;
   }

   const struct vk_acceleration_structure *accel_struct = ctx->entries[i]->key;
   VkResult result;

   VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
   };
   result = radv_BeginCommandBuffer(ctx->cmd_buffer, &begin_info);
   if (result != VK_SUCCESS)
      return NULL;

   VkBufferCopy2 copy = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
      .srcOffset = accel_struct->offset,
      .size = accel_struct->size,
   };

   VkCopyBufferInfo2 copy_info = {
      .sType = VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2,
      .srcBuffer = accel_struct->buffer,
      .dstBuffer = ctx->buffer,
      .regionCount = 1,
      .pRegions = &copy,
   };

   radv_CmdCopyBuffer2(ctx->cmd_buffer, &copy_info);

   result = radv_EndCommandBuffer(ctx->cmd_buffer);
   if (result != VK_SUCCESS)
      return NULL;

   VkSubmitInfo submit_info = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &ctx->cmd_buffer,
   };

   result = vk_common_QueueSubmit(ctx->queue, 1, &submit_info, VK_NULL_HANDLE);
   if (result != VK_SUCCESS)
      return NULL;

   result = vk_common_QueueWaitIdle(ctx->queue);
   if (result != VK_SUCCESS)
      return NULL;

   return ctx->mapped_data;
}

static void
rra_unmap_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
{
   struct radv_rra_accel_struct_data *data = ctx->entries[i]->data;

   if (data->memory)
      vk_common_UnmapMemory(ctx->device, data->memory);
}

enum rra_ray_history_token_type {
   rra_ray_history_token_begin,
   rra_ray_history_token_tlas,
   rra_ray_history_token_blas,
   rra_ray_history_token_end,
   rra_ray_history_token_call,
   rra_ray_history_token_timestamp,
   rra_ray_history_token_ahit_status,
   rra_ray_history_token_call2,
   rra_ray_history_token_isec_status,
   rra_ray_history_token_end2,
   rra_ray_history_token_begin2,
   rra_ray_history_token_normal = 0xFFFF,
};

struct rra_ray_history_id_token {
   uint32_t id : 30;
   uint32_t reserved : 1;
   uint32_t has_control : 1;
};
static_assert(sizeof(struct rra_ray_history_id_token) == 4, "rra_ray_history_id_token does not match RRA expectations");

struct rra_ray_history_control_token {
   uint32_t type : 16;
   uint32_t length : 8;
   uint32_t data : 8;
};
static_assert(sizeof(struct rra_ray_history_control_token) == 4,
              "rra_ray_history_control_token does not match RRA expectations");

struct rra_ray_history_begin_token {
   uint32_t wave_id;
   uint32_t launch_ids[3];
   uint32_t accel_struct_lo;
   uint32_t accel_struct_hi;
   uint32_t ray_flags;
   uint32_t cull_mask : 8;
   uint32_t stb_offset : 4;
   uint32_t stb_stride : 4;
   uint32_t miss_index : 16;
   float origin[3];
   float tmin;
   float direction[3];
   float tmax;
};
static_assert(sizeof(struct rra_ray_history_begin_token) == 64,
              "rra_ray_history_begin_token does not match RRA expectations");

struct rra_ray_history_begin2_token {
   struct rra_ray_history_begin_token base;
   uint32_t call_instruction_id;
   uint32_t unique_wave_id;
   uint32_t parent_unique_wave_id;
};
static_assert(sizeof(struct rra_ray_history_begin2_token) == 76,
              "rra_ray_history_begin2_token does not match RRA expectations");

struct rra_ray_history_end_token {
   uint32_t primitive_index;
   uint32_t geometry_index;
};
static_assert(sizeof(struct rra_ray_history_end_token) == 8,
              "rra_ray_history_end_token does not match RRA expectations");

struct rra_ray_history_end2_token {
   struct rra_ray_history_end_token base;
   uint32_t instance_index : 24;
   uint32_t hit_kind : 8;
   uint32_t iteration_count;
   uint32_t candidate_instance_count;
   float t;
};
static_assert(sizeof(struct rra_ray_history_end2_token) == 24,
              "rra_ray_history_end2_token does not match RRA expectations");

struct rra_ray_history_tlas_token {
   uint64_t addr;
};
static_assert(sizeof(struct rra_ray_history_tlas_token) == 8,
              "rra_ray_history_tlas_token does not match RRA expectations");

struct rra_ray_history_blas_token {
   uint64_t addr;
};
static_assert(sizeof(struct rra_ray_history_blas_token) == 8,
              "rra_ray_history_blas_token does not match RRA expectations");

struct rra_ray_history_call_token {
   uint32_t addr[2];
};
static_assert(sizeof(struct rra_ray_history_call_token) == 8,
              "rra_ray_history_call_token does not match RRA expectations");

struct rra_ray_history_call2_token {
   struct rra_ray_history_call_token base;
   uint32_t sbt_index;
};
static_assert(sizeof(struct rra_ray_history_call2_token) == 12,
              "rra_ray_history_call2_token does not match RRA expectations");

struct rra_ray_history_isec_token {
   float t;
   uint32_t hit_kind;
};
static_assert(sizeof(struct rra_ray_history_isec_token) == 8,
              "rra_ray_history_isec_token does not match RRA expectations");

struct rra_ray_history_timestamp_token {
   uint64_t gpu_timestamp;
};
static_assert(sizeof(struct rra_ray_history_timestamp_token) == 8,
              "rra_ray_history_timestamp_token does not match RRA expectations");

VkResult
radv_rra_dump_trace(VkQueue vk_queue, char *filename)
{
   RADV_FROM_HANDLE(radv_queue, queue, vk_queue);
   struct radv_device *device = queue->device;
   VkDevice vk_device = radv_device_to_handle(device);

   VkResult result = vk_common_DeviceWaitIdle(vk_device);
   if (result != VK_SUCCESS)
      return result;

   uint64_t *accel_struct_offsets = NULL;
   uint64_t *ray_history_offsets = NULL;
   struct hash_entry **hash_entries = NULL;
   FILE *file = NULL;

   uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
   accel_struct_offsets = calloc(struct_count, sizeof(uint64_t));
   if (!accel_struct_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t dispatch_count =
      util_dynarray_num_elements(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *);
   ray_history_offsets = calloc(dispatch_count, sizeof(uint64_t));
   if (!ray_history_offsets) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   hash_entries = malloc(sizeof(*hash_entries) * struct_count);
   if (!hash_entries) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   file = fopen(filename, "w");
   if (!file) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   /*
    * The header contents can only be determined after all acceleration
    * structures have been dumped. An empty struct is written instead
    * to keep offsets intact.
    */
   struct rra_file_header header = {0};
   fwrite(&header, sizeof(struct rra_file_header), 1, file);

   uint64_t api_info_offset = (uint64_t)ftell(file);
   uint64_t api = RADV_RRA_API_VULKAN;
   fwrite(&api, sizeof(uint64_t), 1, file);

   uint64_t asic_info_offset = (uint64_t)ftell(file);
   rra_dump_asic_info(&device->physical_device->rad_info, file);

   uint64_t written_accel_struct_count = 0;

   struct hash_entry *last_entry = NULL;
   for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i)
      hash_entries[i] = last_entry;

   qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp);

   struct rra_copy_context copy_ctx = {
      .device = vk_device,
      .queue = vk_queue,
      .entries = hash_entries,
      .family_index = queue->vk.queue_family_index,
      .min_size = device->rra_trace.ray_history_buffer_size,
   };

   result = rra_copy_context_init(&copy_ctx);
   if (result != VK_SUCCESS)
      goto cleanup;

   for (unsigned i = 0; i < struct_count; i++) {
      struct radv_rra_accel_struct_data *data = hash_entries[i]->data;
      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
      if (!mapped_data)
         continue;

      accel_struct_offsets[written_accel_struct_count] = (uint64_t)ftell(file);
      result = rra_dump_acceleration_structure(data, mapped_data, device->rra_trace.accel_struct_vas,
                                               device->rra_trace.validate_as, file);

      rra_unmap_accel_struct_data(&copy_ctx, i);

      if (result == VK_SUCCESS)
         written_accel_struct_count++;
   }

   uint64_t ray_history_offset = (uint64_t)ftell(file);

   uint32_t ray_history_index = 0xFFFFFFFF;
   struct radv_rra_ray_history_data *ray_history = NULL;

   uint8_t *history = device->rra_trace.ray_history_data;
   struct radv_ray_history_header *history_header = (void *)history;

   uint32_t history_buffer_size_mb = device->rra_trace.ray_history_buffer_size / 1024 / 1024;
   uint32_t history_size_mb = history_header->offset / 1024 / 1024;
   if (history_header->offset > device->rra_trace.ray_history_buffer_size) {
      fprintf(stderr, "radv: rra: The ray history buffer size (%u MB) is too small. %u MB is required.\n",
              history_buffer_size_mb, history_size_mb);
   } else {
      fprintf(stderr, "radv: rra: Ray history buffer size = %u MB, ray history size = %u MB.\n", history_buffer_size_mb,
              history_size_mb);
   }

   uint32_t history_size = MIN2(history_header->offset, device->rra_trace.ray_history_buffer_size);

   uint32_t token_size;
   for (uint32_t offset = sizeof(struct radv_ray_history_header); offset < history_size; offset += token_size) {
      struct radv_packed_end_trace_token *src = (void *)(history + offset);
      token_size = src->header.hit ? sizeof(struct radv_packed_end_trace_token)
                                   : offsetof(struct radv_packed_end_trace_token, primitive_id);

      if (src->dispatch_index != ray_history_index) {
         ray_history_index = src->dispatch_index;
         assert(ray_history_index < dispatch_count);
         ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *,
                                              ray_history_index);

         assert(!ray_history_offsets[ray_history_index]);
         ray_history_offsets[ray_history_index] = (uint64_t)ftell(file);
         fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
      }

      uint32_t *dispatch_size = ray_history->metadata.dispatch_size.size;

      uint32_t x = src->header.launch_index % dispatch_size[0];
      uint32_t y = (src->header.launch_index / dispatch_size[0]) % dispatch_size[1];
      uint32_t z = src->header.launch_index / (dispatch_size[0] * dispatch_size[1]);

      struct rra_ray_history_id_token begin_id = {
         .id = src->header.launch_index,
         .has_control = true,
      };
      struct rra_ray_history_control_token begin_control = {
         .type = rra_ray_history_token_begin,
         .length = sizeof(struct rra_ray_history_begin_token) / 4,
      };
      struct rra_ray_history_begin_token begin = {
         .wave_id = src->header.launch_index / 32,
         .launch_ids = {x, y, z},
         .accel_struct_lo = src->accel_struct_lo,
         .accel_struct_hi = src->accel_struct_hi & 0x1FFFFFF,
         .ray_flags = src->flags,
         .cull_mask = src->cull_mask,
         .stb_offset = src->sbt_offset,
         .stb_stride = src->sbt_stride,
         .miss_index = src->miss_index,
         .origin[0] = src->origin[0],
         .origin[1] = src->origin[1],
         .origin[2] = src->origin[2],
         .tmin = src->tmin,
         .direction[0] = src->direction[0],
         .direction[1] = src->direction[1],
         .direction[2] = src->direction[2],
         .tmax = src->tmax,
      };
      fwrite(&begin_id, sizeof(begin_id), 1, file);
      fwrite(&begin_control, sizeof(begin_control), 1, file);
      fwrite(&begin, sizeof(begin), 1, file);

      for (uint32_t i = 0; i < src->ahit_count; i++) {
         struct rra_ray_history_id_token ahit_status_id = {
            .id = src->header.launch_index,
            .has_control = true,
         };
         struct rra_ray_history_control_token ahit_status_control = {
            .type = rra_ray_history_token_ahit_status,
            .data = i == src->ahit_count - 1 ? 2 : 0,
         };
         fwrite(&ahit_status_id, sizeof(ahit_status_id), 1, file);
         fwrite(&ahit_status_control, sizeof(ahit_status_control), 1, file);
      }

      for (uint32_t i = 0; i < src->isec_count; i++) {
         struct rra_ray_history_id_token isec_status_id = {
            .id = src->header.launch_index,
            .has_control = true,
         };
         struct rra_ray_history_control_token isec_status_control = {
            .type = rra_ray_history_token_isec_status,
            .data = i == src->isec_count - 1 ? 2 : 0,
         };
         fwrite(&isec_status_id, sizeof(isec_status_id), 1, file);
         fwrite(&isec_status_control, sizeof(isec_status_control), 1, file);
      }

      struct rra_ray_history_id_token end_id = {
         .id = src->header.launch_index,
         .has_control = true,
      };
      struct rra_ray_history_control_token end_control = {
         .type = rra_ray_history_token_end2,
         .length = sizeof(struct rra_ray_history_end2_token) / 4,
      };
      struct rra_ray_history_end2_token end = {
         .base.primitive_index = 0xFFFFFFFF,
         .base.geometry_index = 0xFFFFFFFF,
         .iteration_count = src->iteration_count,
         .candidate_instance_count = src->instance_count,
      };

      if (src->header.hit) {
         end.base.primitive_index = src->primitive_id;
         end.base.geometry_index = src->geometry_id;
         end.instance_index = src->instance_id;
         end.hit_kind = src->hit_kind;
         end.t = src->t;
      }

      fwrite(&end_id, sizeof(end_id), 1, file);
      fwrite(&end_control, sizeof(end_control), 1, file);
      fwrite(&end, sizeof(end), 1, file);
   }

   for (uint32_t i = 0; i < dispatch_count; i++) {
      if (ray_history_offsets[i])
         continue;

      ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *, i);
      ray_history_offsets[i] = (uint64_t)ftell(file);
      fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
   }

   history_header->offset = 1;

   rra_copy_context_finish(&copy_ctx);

   uint64_t chunk_info_offset = (uint64_t)ftell(file);
   rra_dump_chunk_description(api_info_offset, 0, 8, "ApiInfo", RADV_RRA_ASIC_API_INFO_CHUNK_VERSION, file);
   rra_dump_chunk_description(asic_info_offset, 0, sizeof(struct rra_asic_info), "AsicInfo",
                              RADV_RRA_ASIC_API_INFO_CHUNK_VERSION, file);

   for (uint32_t i = 0; i < dispatch_count; i++) {
      uint64_t tokens_size;
      if (i == dispatch_count - 1)
         tokens_size = (uint64_t)(chunk_info_offset - ray_history_offsets[i]);
      else
         tokens_size = (uint64_t)(ray_history_offsets[i + 1] - ray_history_offsets[i]);
      tokens_size -= sizeof(struct radv_rra_ray_history_metadata);

      rra_dump_chunk_description(ray_history_offsets[i], 0, sizeof(struct radv_rra_ray_history_metadata),
                                 "HistoryMetadata", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
      rra_dump_chunk_description(ray_history_offsets[i] + sizeof(struct radv_rra_ray_history_metadata), 0, tokens_size,
                                 "HistoryTokensRaw", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
   }

   for (uint32_t i = 0; i < written_accel_struct_count; ++i) {
      uint64_t accel_struct_size;
      if (i == written_accel_struct_count - 1)
         accel_struct_size = (uint64_t)(ray_history_offset - accel_struct_offsets[i]);
      else
         accel_struct_size = (uint64_t)(accel_struct_offsets[i + 1] - accel_struct_offsets[i]);

      rra_dump_chunk_description(accel_struct_offsets[i], sizeof(struct rra_accel_struct_chunk_header),
                                 accel_struct_size, "RawAccelStruct", RADV_RRA_ACCEL_STRUCT_CHUNK_VERSION, file);
   }

   uint64_t file_end = (uint64_t)ftell(file);

   /* All info is available, dump header now */
   fseek(file, 0, SEEK_SET);
   rra_dump_header(file, chunk_info_offset, file_end - chunk_info_offset);

   result = VK_SUCCESS;
cleanup:
   if (file)
      fclose(file);

   free(hash_entries);
   free(ray_history_offsets);
   free(accel_struct_offsets);
   return result;
}