1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * SPDX-License-Identifier: MIT
9 */
10
11 #include <fcntl.h>
12 #include <stdbool.h>
13 #include <string.h>
14
15 #ifdef __FreeBSD__
16 #include <sys/types.h>
17 #endif
18 #ifdef MAJOR_IN_MKDEV
19 #include <sys/mkdev.h>
20 #endif
21 #ifdef MAJOR_IN_SYSMACROS
22 #include <sys/sysmacros.h>
23 #endif
24
25 #ifdef __linux__
26 #include <sys/inotify.h>
27 #endif
28
29 #include "meta/radv_meta.h"
30 #include "util/disk_cache.h"
31 #include "util/u_debug.h"
32 #include "radv_cs.h"
33 #include "radv_debug.h"
34 #include "radv_entrypoints.h"
35 #include "radv_formats.h"
36 #include "radv_physical_device.h"
37 #include "radv_printf.h"
38 #include "radv_rmv.h"
39 #include "radv_shader.h"
40 #include "radv_spm.h"
41 #include "radv_sqtt.h"
42 #include "vk_common_entrypoints.h"
43 #include "vk_pipeline_cache.h"
44 #include "vk_semaphore.h"
45 #include "vk_util.h"
46 #ifdef _WIN32
47 typedef void *drmDevicePtr;
48 #include <io.h>
49 #else
50 #include <amdgpu.h>
51 #include <xf86drm.h>
52 #include "drm-uapi/amdgpu_drm.h"
53 #include "winsys/amdgpu/radv_amdgpu_winsys_public.h"
54 #endif
55 #include "util/build_id.h"
56 #include "util/driconf.h"
57 #include "util/mesa-sha1.h"
58 #include "util/os_time.h"
59 #include "util/timespec.h"
60 #include "util/u_atomic.h"
61 #include "util/u_process.h"
62 #include "vulkan/vk_icd.h"
63 #include "winsys/null/radv_null_winsys_public.h"
64 #include "git_sha1.h"
65 #include "sid.h"
66 #include "vk_common_entrypoints.h"
67 #include "vk_format.h"
68 #include "vk_sync.h"
69 #include "vk_sync_dummy.h"
70
71 #if AMD_LLVM_AVAILABLE
72 #include "ac_llvm_util.h"
73 #endif
74
75 #include "ac_descriptors.h"
76 #include "ac_formats.h"
77
78 static bool
79 radv_spm_trace_enabled(const struct radv_instance *instance)
80 {
81 return (instance->vk.trace_mode & RADV_TRACE_MODE_RGP) &&
82 debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", true);
83 }
84
85 static bool
86 radv_trap_handler_enabled()
87 {
88 return !!getenv("RADV_TRAP_HANDLER");
89 }
90
91 VKAPI_ATTR VkResult VKAPI_CALL
92 radv_GetMemoryHostPointerPropertiesEXT(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType,
93 const void *pHostPointer,
94 VkMemoryHostPointerPropertiesEXT *pMemoryHostPointerProperties)
95 {
96 VK_FROM_HANDLE(radv_device, device, _device);
97 const struct radv_physical_device *pdev = radv_device_physical(device);
98
99 switch (handleType) {
100 case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: {
101 uint32_t memoryTypeBits = 0;
102 for (int i = 0; i < pdev->memory_properties.memoryTypeCount; i++) {
103 if (pdev->memory_domains[i] == RADEON_DOMAIN_GTT && !(pdev->memory_flags[i] & RADEON_FLAG_GTT_WC)) {
104 memoryTypeBits = (1 << i);
105 break;
106 }
107 }
108 pMemoryHostPointerProperties->memoryTypeBits = memoryTypeBits;
109 return VK_SUCCESS;
110 }
111 default:
112 return VK_ERROR_INVALID_EXTERNAL_HANDLE;
113 }
114 }
115
116 static VkResult
117 radv_device_init_border_color(struct radv_device *device)
118 {
119 VkResult result;
120
121 result = radv_bo_create(device, NULL, RADV_BORDER_COLOR_BUFFER_SIZE, 4096, RADEON_DOMAIN_VRAM,
122 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_READ_ONLY | RADEON_FLAG_NO_INTERPROCESS_SHARING,
123 RADV_BO_PRIORITY_SHADER, 0, true, &device->border_color_data.bo);
124
125 if (result != VK_SUCCESS)
126 return vk_error(device, result);
127
128 radv_rmv_log_border_color_palette_create(device, device->border_color_data.bo);
129
130 result = device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, true);
131 if (result != VK_SUCCESS)
132 return vk_error(device, result);
133
134 device->border_color_data.colors_gpu_ptr = radv_buffer_map(device->ws, device->border_color_data.bo);
135 if (!device->border_color_data.colors_gpu_ptr)
136 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
137 mtx_init(&device->border_color_data.mutex, mtx_plain);
138
139 return VK_SUCCESS;
140 }
141
142 static void
143 radv_device_finish_border_color(struct radv_device *device)
144 {
145 if (device->border_color_data.bo) {
146 radv_rmv_log_border_color_palette_destroy(device, device->border_color_data.bo);
147 device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, false);
148 radv_bo_destroy(device, NULL, device->border_color_data.bo);
149
150 mtx_destroy(&device->border_color_data.mutex);
151 }
152 }
153
154 static struct radv_shader_part *
155 _radv_create_vs_prolog(struct radv_device *device, const void *_key)
156 {
157 struct radv_vs_prolog_key *key = (struct radv_vs_prolog_key *)_key;
158 return radv_create_vs_prolog(device, key);
159 }
160
161 static uint32_t
162 radv_hash_vs_prolog(const void *key_)
163 {
164 const struct radv_vs_prolog_key *key = key_;
165 return _mesa_hash_data(key, sizeof(*key));
166 }
167
168 static bool
169 radv_cmp_vs_prolog(const void *a_, const void *b_)
170 {
171 const struct radv_vs_prolog_key *a = a_;
172 const struct radv_vs_prolog_key *b = b_;
173
174 return memcmp(a, b, sizeof(*a)) == 0;
175 }
176
177 static struct radv_shader_part_cache_ops vs_prolog_ops = {
178 .create = _radv_create_vs_prolog,
179 .hash = radv_hash_vs_prolog,
180 .equals = radv_cmp_vs_prolog,
181 };
182
183 static VkResult
184 radv_device_init_vs_prologs(struct radv_device *device)
185 {
186 const struct radv_physical_device *pdev = radv_device_physical(device);
187 const struct radv_instance *instance = radv_physical_device_instance(pdev);
188
189 if (!radv_shader_part_cache_init(&device->vs_prologs, &vs_prolog_ops))
190 return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
191
192 /* don't pre-compile prologs if we want to print them */
193 if (instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
194 return VK_SUCCESS;
195
196 struct radv_vs_prolog_key key;
197 memset(&key, 0, sizeof(key));
198 key.as_ls = false;
199 key.is_ngg = pdev->use_ngg;
200 key.next_stage = MESA_SHADER_VERTEX;
201 key.wave32 = pdev->ge_wave_size == 32;
202
203 for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
204 key.instance_rate_inputs = 0;
205 key.num_attributes = i;
206
207 device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
208 if (!device->simple_vs_prologs[i - 1])
209 return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
210 }
211
212 unsigned idx = 0;
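/* Illustrative note: this enumerates one prolog for every contiguous run of instance-rate
 * attributes. For num_attributes == 2, for example, the generated masks are 0b01, 0b10 and
 * then 0b11, and the assert below checks that radv_instance_rate_prolog_index() maps each
 * (num_attributes, mask) pair back to the same slot in instance_rate_vs_prologs.
 */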
213 for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
214 for (unsigned count = 1; count <= num_attributes; count++) {
215 for (unsigned start = 0; start <= (num_attributes - count); start++) {
216 key.instance_rate_inputs = u_bit_consecutive(start, count);
217 key.num_attributes = num_attributes;
218
219 struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key);
220 if (!prolog)
221 return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
222
223 assert(idx == radv_instance_rate_prolog_index(num_attributes, key.instance_rate_inputs));
224 device->instance_rate_vs_prologs[idx++] = prolog;
225 }
226 }
227 }
228 assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));
229
230 return VK_SUCCESS;
231 }
232
233 static void
234 radv_device_finish_vs_prologs(struct radv_device *device)
235 {
236 if (device->vs_prologs.ops)
237 radv_shader_part_cache_finish(device, &device->vs_prologs);
238
239 for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++) {
240 if (!device->simple_vs_prologs[i])
241 continue;
242
243 radv_shader_part_unref(device, device->simple_vs_prologs[i]);
244 }
245
246 for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++) {
247 if (!device->instance_rate_vs_prologs[i])
248 continue;
249
250 radv_shader_part_unref(device, device->instance_rate_vs_prologs[i]);
251 }
252 }
253
254 static struct radv_shader_part *
255 _radv_create_ps_epilog(struct radv_device *device, const void *_key)
256 {
257 struct radv_ps_epilog_key *key = (struct radv_ps_epilog_key *)_key;
258 return radv_create_ps_epilog(device, key, NULL);
259 }
260
261 static uint32_t
262 radv_hash_ps_epilog(const void *key_)
263 {
264 const struct radv_ps_epilog_key *key = key_;
265 return _mesa_hash_data(key, sizeof(*key));
266 }
267
268 static bool
269 radv_cmp_ps_epilog(const void *a_, const void *b_)
270 {
271 const struct radv_ps_epilog_key *a = a_;
272 const struct radv_ps_epilog_key *b = b_;
273
274 return memcmp(a, b, sizeof(*a)) == 0;
275 }
276
277 static struct radv_shader_part_cache_ops ps_epilog_ops = {
278 .create = _radv_create_ps_epilog,
279 .hash = radv_hash_ps_epilog,
280 .equals = radv_cmp_ps_epilog,
281 };
282
283 VkResult
284 radv_device_init_vrs_state(struct radv_device *device)
285 {
286 VkDeviceMemory mem;
287 VkBuffer buffer;
288 VkResult result;
289 VkImage image;
290
291 VkImageCreateInfo image_create_info = {
292 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
293 .imageType = VK_IMAGE_TYPE_2D,
294 .format = VK_FORMAT_D16_UNORM,
295 .extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT, 1},
296 .mipLevels = 1,
297 .arrayLayers = 1,
298 .samples = VK_SAMPLE_COUNT_1_BIT,
299 .tiling = VK_IMAGE_TILING_OPTIMAL,
300 .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
301 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
302 .queueFamilyIndexCount = 0,
303 .pQueueFamilyIndices = NULL,
304 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
305 };
306
307 result =
308 radv_image_create(radv_device_to_handle(device), &(struct radv_image_create_info){.vk_info = &image_create_info},
309 &device->meta_state.alloc, &image, true);
310 if (result != VK_SUCCESS)
311 return result;
312
313 VkBufferCreateInfo buffer_create_info = {
314 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
315 .pNext =
316 &(VkBufferUsageFlags2CreateInfo){
317 .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO,
318 .usage = VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT,
319 },
320 .size = radv_image_from_handle(image)->planes[0].surface.meta_size,
321 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
322 };
323
324 result = radv_create_buffer(device, &buffer_create_info, &device->meta_state.alloc, &buffer, true);
325 if (result != VK_SUCCESS)
326 goto fail_create;
327
328 VkBufferMemoryRequirementsInfo2 info = {
329 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
330 .buffer = buffer,
331 };
332 VkMemoryRequirements2 mem_req = {
333 .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
334 };
335 vk_common_GetBufferMemoryRequirements2(radv_device_to_handle(device), &info, &mem_req);
336
337 VkMemoryAllocateInfo alloc_info = {
338 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
339 .allocationSize = mem_req.memoryRequirements.size,
340 };
341
342 result = radv_alloc_memory(device, &alloc_info, &device->meta_state.alloc, &mem, true);
343 if (result != VK_SUCCESS)
344 goto fail_alloc;
345
346 VkBindBufferMemoryInfo bind_info = {.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
347 .buffer = buffer,
348 .memory = mem,
349 .memoryOffset = 0};
350
351 result = radv_BindBufferMemory2(radv_device_to_handle(device), 1, &bind_info);
352 if (result != VK_SUCCESS)
353 goto fail_bind;
354
355 device->vrs.image = radv_image_from_handle(image);
356 device->vrs.buffer = radv_buffer_from_handle(buffer);
357 device->vrs.mem = radv_device_memory_from_handle(mem);
358
359 return VK_SUCCESS;
360
361 fail_bind:
362 radv_FreeMemory(radv_device_to_handle(device), mem, &device->meta_state.alloc);
363 fail_alloc:
364 radv_DestroyBuffer(radv_device_to_handle(device), buffer, &device->meta_state.alloc);
365 fail_create:
366 radv_DestroyImage(radv_device_to_handle(device), image, &device->meta_state.alloc);
367
368 return result;
369 }
370
371 static void
372 radv_device_finish_vrs_image(struct radv_device *device)
373 {
374 if (!device->vrs.image)
375 return;
376
377 radv_FreeMemory(radv_device_to_handle(device), radv_device_memory_to_handle(device->vrs.mem),
378 &device->meta_state.alloc);
379 radv_DestroyBuffer(radv_device_to_handle(device), radv_buffer_to_handle(device->vrs.buffer),
380 &device->meta_state.alloc);
381 radv_DestroyImage(radv_device_to_handle(device), radv_image_to_handle(device->vrs.image), &device->meta_state.alloc);
382 }
383
384 static enum radv_force_vrs
385 radv_parse_vrs_rates(const char *str)
386 {
387 if (!strcmp(str, "2x2")) {
388 return RADV_FORCE_VRS_2x2;
389 } else if (!strcmp(str, "2x1")) {
390 return RADV_FORCE_VRS_2x1;
391 } else if (!strcmp(str, "1x2")) {
392 return RADV_FORCE_VRS_1x2;
393 } else if (!strcmp(str, "1x1")) {
394 return RADV_FORCE_VRS_1x1;
395 }
396
397 fprintf(stderr, "radv: Invalid VRS rates specified (valid values are 2x2, 2x1, 1x2 and 1x1)\n");
398 return RADV_FORCE_VRS_1x1;
399 }
400
401 static const char *
402 radv_get_force_vrs_config_file(void)
403 {
404 return getenv("RADV_FORCE_VRS_CONFIG_FILE");
405 }
406
407 static enum radv_force_vrs
408 radv_parse_force_vrs_config_file(const char *config_file)
409 {
410 enum radv_force_vrs force_vrs = RADV_FORCE_VRS_1x1;
411 char buf[4];
412 FILE *f;
413
414 f = fopen(config_file, "r");
415 if (!f) {
416 fprintf(stderr, "radv: Can't open file: '%s'.\n", config_file);
417 return force_vrs;
418 }
419
420 if (fread(buf, sizeof(buf), 1, f) == 1) {
421 buf[3] = '\0';
422 force_vrs = radv_parse_vrs_rates(buf);
423 }
424
425 fclose(f);
426 return force_vrs;
427 }
428
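/* Usage sketch (illustrative, not from the sources): RADV_FORCE_VRS_CONFIG_FILE should point at
 * a file whose first bytes are one of the rate strings accepted by radv_parse_vrs_rates(), e.g.
 *   echo 2x2 > /tmp/radv_vrs.conf
 *   RADV_FORCE_VRS_CONFIG_FILE=/tmp/radv_vrs.conf vkcube
 * On Linux the inotify-based notifier below re-reads the file whenever it is modified.
 */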
429 #ifdef __linux__
430
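/* Room for roughly 10 inotify events with maximum-length names (see inotify(7)). */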
431 #define BUF_LEN ((10 * (sizeof(struct inotify_event) + NAME_MAX + 1)))
432
433 static int
434 radv_notifier_thread_run(void *data)
435 {
436 struct radv_device *device = data;
437 struct radv_notifier *notifier = &device->notifier;
438 char buf[BUF_LEN];
439
440 while (!notifier->quit) {
441 const char *file = radv_get_force_vrs_config_file();
442 struct timespec tm = {.tv_nsec = 100000000}; /* 100ms */
443 int length, i = 0;
444
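/* The inotify fd was created with IN_NONBLOCK, so read() returns -1 when no event is
 * pending and the loop below is simply skipped until the next sleep interval. */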
445 length = read(notifier->fd, buf, BUF_LEN);
446 while (i < length) {
447 struct inotify_event *event = (struct inotify_event *)&buf[i];
448
449 i += sizeof(struct inotify_event) + event->len;
450 if (event->mask & IN_MODIFY || event->mask & IN_DELETE_SELF) {
451 /* Sleep 100ms for editors that use a temporary file and delete the original. */
452 thrd_sleep(&tm, NULL);
453 device->force_vrs = radv_parse_force_vrs_config_file(file);
454
455 fprintf(stderr, "radv: Updated the per-vertex VRS rate to '%d'.\n", device->force_vrs);
456
457 if (event->mask & IN_DELETE_SELF) {
458 inotify_rm_watch(notifier->fd, notifier->watch);
459 notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
460 }
461 }
462 }
463
464 thrd_sleep(&tm, NULL);
465 }
466
467 return 0;
468 }
469
470 #endif
471
472 static bool
473 radv_device_init_notifier(struct radv_device *device)
474 {
475 #ifndef __linux__
476 return true;
477 #else
478 struct radv_notifier *notifier = &device->notifier;
479 const char *file = radv_get_force_vrs_config_file();
480 int ret;
481
482 notifier->fd = inotify_init1(IN_NONBLOCK);
483 if (notifier->fd < 0)
484 return false;
485
486 notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
487 if (notifier->watch < 0)
488 goto fail_watch;
489
490 ret = thrd_create(&notifier->thread, radv_notifier_thread_run, device);
491 if (ret)
492 goto fail_thread;
493
494 return true;
495
496 fail_thread:
497 inotify_rm_watch(notifier->fd, notifier->watch);
498 fail_watch:
499 close(notifier->fd);
500
501 return false;
502 #endif
503 }
504
505 static void
506 radv_device_finish_notifier(struct radv_device *device)
507 {
508 #ifdef __linux__
509 struct radv_notifier *notifier = &device->notifier;
510
511 if (!notifier->thread)
512 return;
513
514 notifier->quit = true;
515 thrd_join(notifier->thread, NULL);
516 inotify_rm_watch(notifier->fd, notifier->watch);
517 close(notifier->fd);
518 #endif
519 }
520
521 static VkResult
522 radv_device_init_perf_counter(struct radv_device *device)
523 {
524 const struct radv_physical_device *pdev = radv_device_physical(device);
525 const size_t bo_size = PERF_CTR_BO_PASS_OFFSET + sizeof(uint64_t) * PERF_CTR_MAX_PASSES;
526 VkResult result;
527
528 result = radv_bo_create(device, NULL, bo_size, 4096, RADEON_DOMAIN_GTT,
529 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_UPLOAD_BUFFER,
530 0, true, &device->perf_counter_bo);
531 if (result != VK_SUCCESS)
532 return result;
533
534 device->perf_counter_lock_cs = calloc(sizeof(struct radeon_winsys_cs *), 2 * PERF_CTR_MAX_PASSES);
535 if (!device->perf_counter_lock_cs)
536 return VK_ERROR_OUT_OF_HOST_MEMORY;
537
538 if (!pdev->ac_perfcounters.blocks)
539 return VK_ERROR_INITIALIZATION_FAILED;
540
541 return VK_SUCCESS;
542 }
543
544 static void
545 radv_device_finish_perf_counter(struct radv_device *device)
546 {
547 if (device->perf_counter_bo)
548 radv_bo_destroy(device, NULL, device->perf_counter_bo);
549
550 if (!device->perf_counter_lock_cs)
551 return;
552
553 for (unsigned i = 0; i < 2 * PERF_CTR_MAX_PASSES; ++i) {
554 if (device->perf_counter_lock_cs[i])
555 device->ws->cs_destroy(device->perf_counter_lock_cs[i]);
556 }
557
558 free(device->perf_counter_lock_cs);
559 }
560
561 static VkResult
562 radv_device_init_memory_cache(struct radv_device *device)
563 {
564 struct vk_pipeline_cache_create_info info = {.weak_ref = true};
565
566 device->mem_cache = vk_pipeline_cache_create(&device->vk, &info, NULL);
567 if (!device->mem_cache)
568 return VK_ERROR_OUT_OF_HOST_MEMORY;
569
570 return VK_SUCCESS;
571 }
572
573 static void
574 radv_device_finish_memory_cache(struct radv_device *device)
575 {
576 if (device->mem_cache)
577 vk_pipeline_cache_destroy(device->mem_cache, NULL);
578 }
579
580 static VkResult
581 radv_device_init_rgp(struct radv_device *device)
582 {
583 const struct radv_physical_device *pdev = radv_device_physical(device);
584 const struct radv_instance *instance = radv_physical_device_instance(pdev);
585
586 if (!(instance->vk.trace_mode & RADV_TRACE_MODE_RGP))
587 return VK_SUCCESS;
588
589 if (pdev->info.gfx_level < GFX8 || pdev->info.gfx_level > GFX11_5) {
590 fprintf(stderr, "GPU hardware not supported: refer to "
591 "the RGP documentation for the list of "
592 "supported GPUs!\n");
593 abort();
594 }
595
596 if (!radv_sqtt_init(device))
597 return VK_ERROR_INITIALIZATION_FAILED;
598
599 fprintf(stderr,
600 "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
601 "instruction timing: %s, cache counters: %s, queue events: %s).\n",
602 device->sqtt.buffer_size / (1024 * 1024), radv_is_instruction_timing_enabled() ? "enabled" : "disabled",
603 radv_spm_trace_enabled(instance) ? "enabled" : "disabled",
604 radv_sqtt_queue_events_enabled() ? "enabled" : "disabled");
605
606 if (radv_spm_trace_enabled(instance)) {
607 if (pdev->info.gfx_level >= GFX10 && pdev->info.gfx_level < GFX11_5) {
608 if (!radv_spm_init(device))
609 return VK_ERROR_INITIALIZATION_FAILED;
610 } else {
611 fprintf(stderr, "radv: SPM isn't supported for this GPU (%s)!\n", pdev->name);
612 }
613 }
614
615 return VK_SUCCESS;
616 }
617
618 static void
619 radv_device_finish_rgp(struct radv_device *device)
620 {
621 radv_sqtt_finish(device);
622 radv_spm_finish(device);
623 }
624
625 static void
626 radv_device_init_rmv(struct radv_device *device)
627 {
628 const struct radv_physical_device *pdev = radv_device_physical(device);
629 const struct radv_instance *instance = radv_physical_device_instance(pdev);
630
631 if (!(instance->vk.trace_mode & VK_TRACE_MODE_RMV))
632 return;
633
634 struct vk_rmv_device_info info;
635 memset(&info, 0, sizeof(struct vk_rmv_device_info));
636 radv_rmv_fill_device_info(pdev, &info);
637 vk_memory_trace_init(&device->vk, &info);
638 radv_memory_trace_init(device);
639 }
640
641 static VkResult
642 radv_device_init_trap_handler(struct radv_device *device)
643 {
644 const struct radv_physical_device *pdev = radv_device_physical(device);
645
646 if (!pdev->info.has_trap_handler_support)
647 return VK_SUCCESS;
648
649 if (!radv_trap_handler_enabled())
650 return VK_SUCCESS;
651
652 fprintf(stderr, "**********************************************************************\n");
653 fprintf(stderr, "* WARNING: RADV_TRAP_HANDLER is experimental and only for debugging! *\n");
654 fprintf(stderr, "**********************************************************************\n");
655
656 if (!radv_trap_handler_init(device))
657 return VK_ERROR_INITIALIZATION_FAILED;
658
659 return VK_SUCCESS;
660 }
661
662 static VkResult
663 radv_device_init_device_fault_detection(struct radv_device *device)
664 {
665 const struct radv_physical_device *pdev = radv_device_physical(device);
666 struct radv_instance *instance = radv_physical_device_instance(pdev);
667
668 if (!radv_device_fault_detection_enabled(device))
669 return VK_SUCCESS;
670
671 if (!radv_init_trace(device))
672 return VK_ERROR_INITIALIZATION_FAILED;
673
674 fprintf(stderr, "*****************************************************************************\n");
675 fprintf(stderr, "* WARNING: RADV_DEBUG=hang is costly and should only be used for debugging! *\n");
676 fprintf(stderr, "*****************************************************************************\n");
677
678 /* Wait for idle after every draw/dispatch to identify the
679 * first bad call.
680 */
681 instance->debug_flags |= RADV_DEBUG_SYNC_SHADERS;
682
683 radv_dump_enabled_options(device, stderr);
684
685 return VK_SUCCESS;
686 }
687
688 static void
689 radv_device_finish_device_fault_detection(struct radv_device *device)
690 {
691 radv_finish_trace(device);
692 ralloc_free(device->gpu_hang_report);
693 }
694
695 static VkResult
696 radv_device_init_tools(struct radv_device *device)
697 {
698 const struct radv_physical_device *pdev = radv_device_physical(device);
699 struct radv_instance *instance = radv_physical_device_instance(pdev);
700 VkResult result;
701
702 result = radv_device_init_device_fault_detection(device);
703 if (result != VK_SUCCESS)
704 return result;
705
706 result = radv_device_init_rgp(device);
707 if (result != VK_SUCCESS)
708 return result;
709
710 radv_device_init_rmv(device);
711
712 result = radv_device_init_trap_handler(device);
713 if (result != VK_SUCCESS)
714 return result;
715
716 if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
717 result = radv_rra_trace_init(device);
718 if (result != VK_SUCCESS)
719 return result;
720 }
721
722 result = radv_printf_data_init(device);
723 if (result != VK_SUCCESS)
724 return result;
725
726 return VK_SUCCESS;
727 }
728
729 static void
730 radv_device_finish_tools(struct radv_device *device)
731 {
732 radv_printf_data_finish(device);
733 radv_rra_trace_finish(radv_device_to_handle(device), &device->rra_trace);
734 radv_trap_handler_finish(device);
735 radv_memory_trace_finish(device);
736 radv_device_finish_rgp(device);
737 radv_device_finish_device_fault_detection(device);
738 }
739
740 struct dispatch_table_builder {
741 struct vk_device_dispatch_table *tables[RADV_DISPATCH_TABLE_COUNT];
742 bool used[RADV_DISPATCH_TABLE_COUNT];
743 bool initialized[RADV_DISPATCH_TABLE_COUNT];
744 };
745
746 static void
747 add_entrypoints(struct dispatch_table_builder *b, const struct vk_device_entrypoint_table *entrypoints,
748 enum radv_dispatch_table table)
749 {
750 for (int32_t i = table - 1; i >= RADV_DEVICE_DISPATCH_TABLE; i--) {
751 if (i == RADV_DEVICE_DISPATCH_TABLE || b->used[i]) {
752 vk_device_dispatch_table_from_entrypoints(b->tables[i], entrypoints, !b->initialized[i]);
753 b->initialized[i] = true;
754 }
755 }
756
757 if (table < RADV_DISPATCH_TABLE_COUNT)
758 b->used[table] = true;
759 }
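/* Reading of the logic above (summary, not authoritative): a layer's entrypoints are written
 * into the core device table and into the table of every layer registered before it, never into
 * its own table. Because tables are only backfilled on later calls, the device table resolves
 * each call to the outermost enabled layer that implements it, while each layer's own table
 * resolves to the next enabled layer (or the core driver) registered after it, which is what a
 * layer calls to chain downward. The trailing radv/wsi/common registrations in
 * init_dispatch_tables() fill in whatever remains.
 */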
760
761 static void
762 init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pdev)
763 {
764 const struct radv_instance *instance = radv_physical_device_instance(pdev);
765 struct dispatch_table_builder b = {0};
766 b.tables[RADV_DEVICE_DISPATCH_TABLE] = &device->vk.dispatch_table;
767 b.tables[RADV_ANNOTATE_DISPATCH_TABLE] = &device->layer_dispatch.annotate;
768 b.tables[RADV_APP_DISPATCH_TABLE] = &device->layer_dispatch.app;
769 b.tables[RADV_RGP_DISPATCH_TABLE] = &device->layer_dispatch.rgp;
770 b.tables[RADV_RRA_DISPATCH_TABLE] = &device->layer_dispatch.rra;
771 b.tables[RADV_RMV_DISPATCH_TABLE] = &device->layer_dispatch.rmv;
772 b.tables[RADV_CTX_ROLL_DISPATCH_TABLE] = &device->layer_dispatch.ctx_roll;
773
774 bool gather_ctx_rolls = instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS;
775 if (radv_device_fault_detection_enabled(device) || gather_ctx_rolls)
776 add_entrypoints(&b, &annotate_device_entrypoints, RADV_ANNOTATE_DISPATCH_TABLE);
777
778 if (!strcmp(instance->drirc.app_layer, "metroexodus")) {
779 add_entrypoints(&b, &metro_exodus_device_entrypoints, RADV_APP_DISPATCH_TABLE);
780 } else if (!strcmp(instance->drirc.app_layer, "rage2")) {
781 add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
782 } else if (!strcmp(instance->drirc.app_layer, "quanticdream")) {
783 add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
784 }
785
786 if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
787 add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
788
789 if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
790 add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
791
792 #ifndef _WIN32
793 if (instance->vk.trace_mode & VK_TRACE_MODE_RMV)
794 add_entrypoints(&b, &rmv_device_entrypoints, RADV_RMV_DISPATCH_TABLE);
795 #endif
796
797 if (gather_ctx_rolls)
798 add_entrypoints(&b, &ctx_roll_device_entrypoints, RADV_CTX_ROLL_DISPATCH_TABLE);
799
800 add_entrypoints(&b, &radv_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
801 add_entrypoints(&b, &wsi_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
802 add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
803 }
804
805 static VkResult
806 get_timestamp(struct vk_device *_device, uint64_t *timestamp)
807 {
808 struct radv_device *device = container_of(_device, struct radv_device, vk);
809 *timestamp = device->ws->query_value(device->ws, RADEON_TIMESTAMP);
810 return VK_SUCCESS;
811 }
812
813 static VkResult
814 capture_trace(VkQueue _queue)
815 {
816 VK_FROM_HANDLE(radv_queue, queue, _queue);
817 struct radv_device *device = radv_queue_device(queue);
818 const struct radv_physical_device *pdev = radv_device_physical(device);
819 const struct radv_instance *instance = radv_physical_device_instance(pdev);
820
821 VkResult result = VK_SUCCESS;
822
823 if (instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
824 device->rra_trace.triggered = true;
825
826 if (device->vk.memory_trace_data.is_enabled) {
827 simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
828 radv_rmv_collect_trace_events(device);
829 vk_dump_rmv_capture(&device->vk.memory_trace_data);
830 simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
831 }
832
833 if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
834 device->sqtt_triggered = true;
835
836 if (instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS) {
837 char filename[2048];
838 time_t t = time(NULL);
839 struct tm now = *localtime(&t);
840 snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.ctxroll", util_get_process_name(),
841 1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
842
843 simple_mtx_lock(&device->ctx_roll_mtx);
844
845 device->ctx_roll_file = fopen(filename, "w");
846 if (device->ctx_roll_file)
847 fprintf(stderr, "radv: Writing context rolls to '%s'...\n", filename);
848
849 simple_mtx_unlock(&device->ctx_roll_mtx);
850 }
851
852 return result;
853 }
854
855 static void
856 radv_device_init_cache_key(struct radv_device *device)
857 {
858 const struct radv_physical_device *pdev = radv_device_physical(device);
859 const struct radv_instance *instance = radv_physical_device_instance(pdev);
860 struct radv_device_cache_key *key = &device->cache_key;
861
862 key->keep_shader_info = device->keep_shader_info;
863 key->trap_excp_flags = device->trap_handler_shader && instance->trap_excp_flags;
864 key->disable_trunc_coord = device->disable_trunc_coord;
865 key->image_2d_view_of_3d = device->vk.enabled_features.image2DViewOf3D && pdev->info.gfx_level == GFX9;
866 key->mesh_shader_queries = device->vk.enabled_features.meshShaderQueries && pdev->emulate_mesh_shader_queries;
867 key->primitives_generated_query = radv_uses_primitives_generated_query(device);
868
869 /* The Vulkan spec says:
870 * "Binary shaders retrieved from a physical device with a certain shaderBinaryUUID are
871 * guaranteed to be compatible with all other physical devices reporting the same
872 * shaderBinaryUUID and the same or higher shaderBinaryVersion."
873 *
874 * That means the driver should compile shaders for the "worst" case of all features being
875 * enabled, regardless of what features are actually enabled on the logical device.
876 */
877 if (device->vk.enabled_features.shaderObject) {
878 key->image_2d_view_of_3d = pdev->info.gfx_level == GFX9;
879 key->primitives_generated_query = true;
880 }
881
882 _mesa_blake3_compute(key, sizeof(*key), device->cache_hash);
883 }
884
885 static void
886 radv_create_gfx_preamble(struct radv_device *device)
887 {
888 struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, AMD_IP_GFX, false);
889 if (!cs)
890 return;
891
892 radeon_check_space(device->ws, cs, 512);
893
894 radv_emit_graphics(device, cs);
895
896 device->ws->cs_pad(cs, 0);
897
898 VkResult result = radv_bo_create(
899 device, NULL, cs->cdw * 4, 4096, device->ws->cs_domain(device->ws),
900 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
901 RADV_BO_PRIORITY_CS, 0, true, &device->gfx_init);
902 if (result != VK_SUCCESS)
903 goto fail;
904
905 void *map = radv_buffer_map(device->ws, device->gfx_init);
906 if (!map) {
907 radv_bo_destroy(device, NULL, device->gfx_init);
908 device->gfx_init = NULL;
909 goto fail;
910 }
911 memcpy(map, cs->buf, cs->cdw * 4);
912
913 device->ws->buffer_unmap(device->ws, device->gfx_init, false);
914 device->gfx_init_size_dw = cs->cdw;
915 fail:
916 device->ws->cs_destroy(cs);
917 }
918
919 /* For MSAA sample positions. */
920 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
921 ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
922 (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | (((unsigned)(s2y)&0xf) << 20) | \
923 (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
924
925 /* For obtaining location coordinates from registers */
926 #define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
927 #define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
928 #define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
929 #define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
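/* Worked example (for illustration): sample_locs_2x below is FILL_SREG(4, 4, -4, -4, 0, 0, 0, 0)
 * == 0x0000cc44, i.e. sample 0 at (+4, +4) and sample 1 at (-4, -4) in signed 4-bit units of
 * 1/16th of a pixel. GET_SX()/GET_SY() recover those values, and radv_get_sample_position()
 * turns them into the standard Vulkan 2x positions (0.75, 0.75) and (0.25, 0.25) via (v + 8) / 16.
 */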
930
931 /* 1x MSAA */
932 static const uint32_t sample_locs_1x = FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0);
933 static const unsigned max_dist_1x = 0;
934 static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
935
936 /* 2xMSAA */
937 static const uint32_t sample_locs_2x = FILL_SREG(4, 4, -4, -4, 0, 0, 0, 0);
938 static const unsigned max_dist_2x = 4;
939 static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
940
941 /* 4xMSAA */
942 static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6);
943 static const unsigned max_dist_4x = 6;
944 static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
945
946 /* 8xMSAA */
947 static const uint32_t sample_locs_8x[] = {
948 FILL_SREG(1, -3, -1, 3, 5, 1, -3, -5),
949 FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
950 /* The following are unused by hardware, but we emit them to IBs
951 * instead of multiple SET_CONTEXT_REG packets. */
952 0,
953 0,
954 };
955 static const unsigned max_dist_8x = 7;
956 static const uint64_t centroid_priority_8x = 0x7654321076543210ull;
957
958 unsigned
959 radv_get_default_max_sample_dist(int log_samples)
960 {
961 unsigned max_dist[] = {
962 max_dist_1x,
963 max_dist_2x,
964 max_dist_4x,
965 max_dist_8x,
966 };
967 return max_dist[log_samples];
968 }
969
970 void
971 radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs, int nr_samples)
972 {
973 uint64_t centroid_priority;
974
975 switch (nr_samples) {
976 default:
977 case 1:
978 centroid_priority = centroid_priority_1x;
979
980 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x);
981 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x);
982 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_1x);
983 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_1x);
984 break;
985 case 2:
986 centroid_priority = centroid_priority_2x;
987
988 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x);
989 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x);
990 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x);
991 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x);
992 break;
993 case 4:
994 centroid_priority = centroid_priority_4x;
995
996 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x);
997 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x);
998 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x);
999 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x);
1000 break;
1001 case 8:
1002 centroid_priority = centroid_priority_8x;
1003
1004 radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
1005 radeon_emit_array(cs, sample_locs_8x, 4);
1006 radeon_emit_array(cs, sample_locs_8x, 4);
1007 radeon_emit_array(cs, sample_locs_8x, 4);
1008 radeon_emit_array(cs, sample_locs_8x, 2);
1009 break;
1010 }
1011
1012 /* The exclusion bits can be set to improve rasterization efficiency if no sample lies on the
1013 * pixel boundary (-8 sample offset). It's currently always TRUE because the driver doesn't
1014 * support 16 samples.
1015 */
1016 if (pdev->info.gfx_level >= GFX7) {
1017 radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
1018 S_02882C_XMAX_RIGHT_EXCLUSION(1) | S_02882C_YMAX_BOTTOM_EXCLUSION(1));
1019 }
1020
1021 if (pdev->info.gfx_level >= GFX12) {
1022 radeon_set_context_reg_seq(cs, R_028BF0_PA_SC_CENTROID_PRIORITY_0, 2);
1023 } else {
1024 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1025 }
1026 radeon_emit(cs, centroid_priority);
1027 radeon_emit(cs, centroid_priority >> 32);
1028 }
1029
1030 static void
1031 radv_get_sample_position(struct radv_device *device, unsigned sample_count, unsigned sample_index, float *out_value)
1032 {
1033 const uint32_t *sample_locs;
1034
1035 switch (sample_count) {
1036 case 1:
1037 default:
1038 sample_locs = &sample_locs_1x;
1039 break;
1040 case 2:
1041 sample_locs = &sample_locs_2x;
1042 break;
1043 case 4:
1044 sample_locs = &sample_locs_4x;
1045 break;
1046 case 8:
1047 sample_locs = sample_locs_8x;
1048 break;
1049 }
1050
1051 out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
1052 out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
1053 }
1054
1055 static void
1056 radv_device_init_msaa(struct radv_device *device)
1057 {
1058 int i;
1059
1060 radv_get_sample_position(device, 1, 0, device->sample_locations_1x[0]);
1061
1062 for (i = 0; i < 2; i++)
1063 radv_get_sample_position(device, 2, i, device->sample_locations_2x[i]);
1064 for (i = 0; i < 4; i++)
1065 radv_get_sample_position(device, 4, i, device->sample_locations_4x[i]);
1066 for (i = 0; i < 8; i++)
1067 radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
1068 }
1069
1070 static void
1071 radv_destroy_device(struct radv_device *device, const VkAllocationCallbacks *pAllocator)
1072 {
1073 radv_device_finish_perf_counter(device);
1074
1075 if (device->gfx_init)
1076 radv_bo_destroy(device, NULL, device->gfx_init);
1077
1078 radv_device_finish_notifier(device);
1079 radv_device_finish_vs_prologs(device);
1080 if (device->ps_epilogs.ops)
1081 radv_shader_part_cache_finish(device, &device->ps_epilogs);
1082 radv_device_finish_border_color(device);
1083 radv_device_finish_vrs_image(device);
1084
1085 for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
1086 for (unsigned q = 0; q < device->queue_count[i]; q++)
1087 radv_queue_finish(&device->queues[i][q]);
1088 if (device->queue_count[i])
1089 vk_free(&device->vk.alloc, device->queues[i]);
1090 }
1091 if (device->private_sdma_queue != VK_NULL_HANDLE) {
1092 radv_queue_finish(device->private_sdma_queue);
1093 vk_free(&device->vk.alloc, device->private_sdma_queue);
1094 }
1095
1096 _mesa_hash_table_destroy(device->rt_handles, NULL);
1097
1098 radv_device_finish_meta(device);
1099 radv_device_finish_tools(device);
1100 radv_device_finish_memory_cache(device);
1101
1102 radv_destroy_shader_upload_queue(device);
1103
1104 for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++) {
1105 if (device->hw_ctx[i])
1106 device->ws->ctx_destroy(device->hw_ctx[i]);
1107 }
1108
1109 mtx_destroy(&device->overallocation_mutex);
1110 simple_mtx_destroy(&device->ctx_roll_mtx);
1111 simple_mtx_destroy(&device->pstate_mtx);
1112 simple_mtx_destroy(&device->trace_mtx);
1113 simple_mtx_destroy(&device->rt_handles_mtx);
1114 simple_mtx_destroy(&device->pso_cache_stats_mtx);
1115
1116 radv_destroy_shader_arenas(device);
1117 if (device->capture_replay_arena_vas)
1118 _mesa_hash_table_u64_destroy(device->capture_replay_arena_vas);
1119
1120 vk_device_finish(&device->vk);
1121 vk_free(&device->vk.alloc, device);
1122 }
1123
1124 VKAPI_ATTR VkResult VKAPI_CALL
1125 radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
1126 const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
1127 {
1128 VK_FROM_HANDLE(radv_physical_device, pdev, physicalDevice);
1129 struct radv_instance *instance = radv_physical_device_instance(pdev);
1130 VkResult result;
1131 struct radv_device *device;
1132
1133 bool overallocation_disallowed = false;
1134
1135 vk_foreach_struct_const (ext, pCreateInfo->pNext) {
1136 switch (ext->sType) {
1137 case VK_STRUCTURE_TYPE_DEVICE_MEMORY_OVERALLOCATION_CREATE_INFO_AMD: {
1138 const VkDeviceMemoryOverallocationCreateInfoAMD *overallocation = (const void *)ext;
1139 if (overallocation->overallocationBehavior == VK_MEMORY_OVERALLOCATION_BEHAVIOR_DISALLOWED_AMD)
1140 overallocation_disallowed = true;
1141 break;
1142 }
1143 default:
1144 break;
1145 }
1146 }
1147
1148 device = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*device), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1149 if (!device)
1150 return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1151
1152 result = vk_device_init(&device->vk, &pdev->vk, NULL, pCreateInfo, pAllocator);
1153 if (result != VK_SUCCESS) {
1154 vk_free(&device->vk.alloc, device);
1155 return result;
1156 }
1157
1158 device->vk.get_timestamp = get_timestamp;
1159 device->vk.capture_trace = capture_trace;
1160
1161 device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
1162
1163 init_dispatch_tables(device, pdev);
1164
1165 simple_mtx_init(&device->ctx_roll_mtx, mtx_plain);
1166 simple_mtx_init(&device->trace_mtx, mtx_plain);
1167 simple_mtx_init(&device->pstate_mtx, mtx_plain);
1168 simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
1169 simple_mtx_init(&device->pso_cache_stats_mtx, mtx_plain);
1170
1171 device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
1172
1173 device->ws = pdev->ws;
1174 vk_device_set_drm_fd(&device->vk, device->ws->get_fd(device->ws));
1175
1176 /* With update after bind we can't attach bo's to the command buffer
1177 * from the descriptor set anymore, so we have to use a global BO list.
1178 */
1179 device->use_global_bo_list = (instance->perftest_flags & RADV_PERFTEST_BO_LIST) ||
1180 device->vk.enabled_features.bufferDeviceAddress ||
1181 device->vk.enabled_features.descriptorIndexing ||
1182 device->vk.enabled_features.descriptorBindingUniformBufferUpdateAfterBind ||
1183 device->vk.enabled_features.descriptorBindingSampledImageUpdateAfterBind ||
1184 device->vk.enabled_features.descriptorBindingStorageImageUpdateAfterBind ||
1185 device->vk.enabled_features.descriptorBindingStorageBufferUpdateAfterBind ||
1186 device->vk.enabled_features.descriptorBindingUniformTexelBufferUpdateAfterBind ||
1187 device->vk.enabled_features.descriptorBindingStorageTexelBufferUpdateAfterBind ||
1188 device->vk.enabled_features.descriptorBindingUpdateUnusedWhilePending ||
1189 device->vk.enabled_features.descriptorBindingPartiallyBound;
1190
1191 radv_init_shader_arenas(device);
1192
1193 device->overallocation_disallowed = overallocation_disallowed;
1194 mtx_init(&device->overallocation_mutex, mtx_plain);
1195
1196 if (pdev->info.register_shadowing_required || instance->debug_flags & RADV_DEBUG_SHADOW_REGS)
1197 device->uses_shadow_regs = true;
1198
1199 /* Create one context per queue priority. */
1200 for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
1201 const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
1202 const VkDeviceQueueGlobalPriorityCreateInfo *global_priority =
1203 vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO);
1204 enum radeon_ctx_priority priority = radv_get_queue_global_priority(global_priority);
1205
1206 if (device->hw_ctx[priority])
1207 continue;
1208
1209 result = device->ws->ctx_create(device->ws, priority, &device->hw_ctx[priority]);
1210 if (result != VK_SUCCESS)
1211 goto fail;
1212 }
1213
1214 for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
1215 const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
1216 uint32_t qfi = queue_create->queueFamilyIndex;
1217 const VkDeviceQueueGlobalPriorityCreateInfo *global_priority =
1218 vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO);
1219
1220 device->queues[qfi] = vk_zalloc(&device->vk.alloc, queue_create->queueCount * sizeof(struct radv_queue), 8,
1221 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1222 if (!device->queues[qfi]) {
1223 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1224 goto fail;
1225 }
1226
1227 device->queue_count[qfi] = queue_create->queueCount;
1228
1229 for (unsigned q = 0; q < queue_create->queueCount; q++) {
1230 result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority);
1231 if (result != VK_SUCCESS)
1232 goto fail;
1233 }
1234 }
1235 device->private_sdma_queue = VK_NULL_HANDLE;
1236
1237 device->shader_use_invisible_vram = (instance->perftest_flags & RADV_PERFTEST_DMA_SHADERS) &&
1238 /* SDMA buffer copy is only implemented for GFX7+. */
1239 pdev->info.gfx_level >= GFX7;
1240 result = radv_init_shader_upload_queue(device);
1241 if (result != VK_SUCCESS)
1242 goto fail;
1243
1244 device->pbb_allowed = pdev->info.gfx_level >= GFX9 && !(instance->debug_flags & RADV_DEBUG_NOBINNING);
1245
1246 device->disable_trunc_coord = instance->drirc.disable_trunc_coord;
1247
1248 if (instance->vk.app_info.engine_name && !strcmp(instance->vk.app_info.engine_name, "DXVK")) {
1249 /* For DXVK 2.3.0 and older, use dualSrcBlend to determine if this is D3D9. */
1250 bool is_d3d9 = !device->vk.enabled_features.dualSrcBlend;
1251 if (instance->vk.app_info.engine_version > VK_MAKE_VERSION(2, 3, 0))
1252 is_d3d9 = instance->vk.app_info.app_version & 0x1;
1253
1254 device->disable_trunc_coord &= !is_d3d9;
1255 }
1256
1257 /* The maximum number of scratch waves. Scratch space isn't divided
1258 * evenly between CUs. The number is only a function of the number of CUs.
1259 * We can decrease the constant to decrease the scratch buffer size.
1260 *
1261 * sctx->scratch_waves must be >= the maximum possible size of
1262 * 1 threadgroup, so that the hw doesn't hang from being unable
1263 * to start any.
1264 *
1265 * The recommended value is 4 per CU at most. Higher numbers don't
1266 * bring much benefit, but they still occupy chip resources (think
1267 * async compute). I've seen ~2% performance difference between 4 and 32.
1268 */
1269 uint32_t max_threads_per_block = 2048;
1270 device->scratch_waves = MAX2(32 * pdev->info.num_cu, max_threads_per_block / 64);
1271
1272 device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
1273
1274 if (pdev->info.gfx_level >= GFX7) {
1275 /* If the KMD allows it (there is a KMD hw register for it),
1276 * allow launching waves out-of-order.
1277 */
1278 device->dispatch_initiator |= S_00B800_ORDER_MODE(1);
1279 }
1280 if (pdev->info.gfx_level >= GFX10) {
1281 /* Enable asynchronous compute tunneling. The KMD restricts this feature
1282 * to high-priority compute queues, so setting the bit on any other queue
1283 * is a no-op. PAL always sets this bit as well.
1284 */
1285 device->dispatch_initiator |= S_00B800_TUNNEL_ENABLE(1);
1286 }
1287
1288 /* Disable partial preemption for task shaders.
1289 * The kernel may not support preemption, but PAL always sets this bit,
1290 * so let's also set it here for consistency.
1291 */
1292 device->dispatch_initiator_task = device->dispatch_initiator | S_00B800_DISABLE_DISP_PREMPT_EN(1);
1293
1294 if (pdev->info.gfx_level == GFX10_3) {
1295 if (getenv("RADV_FORCE_VRS_CONFIG_FILE")) {
1296 const char *file = radv_get_force_vrs_config_file();
1297
1298 device->force_vrs = radv_parse_force_vrs_config_file(file);
1299
1300 if (radv_device_init_notifier(device)) {
1301 device->force_vrs_enabled = true;
1302 } else {
1303 fprintf(stderr, "radv: Failed to initialize the notifier for RADV_FORCE_VRS_CONFIG_FILE!\n");
1304 }
1305 } else if (getenv("RADV_FORCE_VRS")) {
1306 const char *vrs_rates = getenv("RADV_FORCE_VRS");
1307
1308 device->force_vrs = radv_parse_vrs_rates(vrs_rates);
1309 device->force_vrs_enabled = device->force_vrs != RADV_FORCE_VRS_1x1;
1310 }
1311 }
1312
1313 /* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */
1314 device->load_grid_size_from_user_sgpr = pdev->info.gfx_level >= GFX10_3;
1315
1316 /* Keep shader info for GPU hangs debugging. */
1317 device->keep_shader_info = radv_device_fault_detection_enabled(device) || radv_trap_handler_enabled();
1318
1319 /* Initialize the per-device cache key before compiling meta shaders. */
1320 radv_device_init_cache_key(device);
1321
1322 result = radv_device_init_tools(device);
1323 if (result != VK_SUCCESS)
1324 goto fail;
1325
1326 result = radv_device_init_meta(device);
1327 if (result != VK_SUCCESS)
1328 goto fail;
1329
1330 radv_device_init_msaa(device);
1331
1332 /* If the border color extension is enabled, let's create the buffer we need. */
1333 if (device->vk.enabled_features.customBorderColors) {
1334 result = radv_device_init_border_color(device);
1335 if (result != VK_SUCCESS)
1336 goto fail;
1337 }
1338
1339 if (device->vk.enabled_features.vertexInputDynamicState || device->vk.enabled_features.graphicsPipelineLibrary ||
1340 device->vk.enabled_features.shaderObject) {
1341 result = radv_device_init_vs_prologs(device);
1342 if (result != VK_SUCCESS)
1343 goto fail;
1344 }
1345
1346 if (device->vk.enabled_features.graphicsPipelineLibrary || device->vk.enabled_features.shaderObject ||
1347 device->vk.enabled_features.extendedDynamicState3ColorBlendEnable ||
1348 device->vk.enabled_features.extendedDynamicState3ColorWriteMask ||
1349 device->vk.enabled_features.extendedDynamicState3AlphaToCoverageEnable ||
1350 device->vk.enabled_features.extendedDynamicState3ColorBlendEquation) {
1351 if (!radv_shader_part_cache_init(&device->ps_epilogs, &ps_epilog_ops)) {
1352 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1353 goto fail;
1354 }
1355 }
1356
1357 if (!(instance->debug_flags & RADV_DEBUG_NO_IBS))
1358 radv_create_gfx_preamble(device);
1359
1360 if (!device->vk.disable_internal_cache) {
1361 result = radv_device_init_memory_cache(device);
1362 if (result != VK_SUCCESS)
1363 goto fail;
1364 }
1365
1366 device->force_aniso = MIN2(16, (int)debug_get_num_option("RADV_TEX_ANISO", -1));
1367 if (device->force_aniso >= 0) {
1368 fprintf(stderr, "radv: Forcing anisotropy filter to %ix\n", 1 << util_logbase2(device->force_aniso));
1369 }
1370
1371 if (device->vk.enabled_features.performanceCounterQueryPools) {
1372 result = radv_device_init_perf_counter(device);
1373 if (result != VK_SUCCESS)
1374 goto fail;
1375 }
1376
1377 if (device->vk.enabled_features.rayTracingPipelineShaderGroupHandleCaptureReplay) {
1378 device->capture_replay_arena_vas = _mesa_hash_table_u64_create(NULL);
1379 }
1380
1381 if (pdev->info.gfx_level == GFX11 && pdev->info.has_dedicated_vram && instance->drirc.force_pstate_peak_gfx11_dgpu) {
1382 if (!radv_device_acquire_performance_counters(device))
1383 fprintf(stderr, "radv: failed to set pstate to profile_peak.\n");
1384 }
1385
1386 *pDevice = radv_device_to_handle(device);
1387 return VK_SUCCESS;
1388
1389 fail:
1390 radv_destroy_device(device, pAllocator);
1391 return result;
1392 }
1393
1394 VKAPI_ATTR void VKAPI_CALL
1395 radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
1396 {
1397 VK_FROM_HANDLE(radv_device, device, _device);
1398
1399 if (!device)
1400 return;
1401
1402 radv_destroy_device(device, pAllocator);
1403 }
1404
1405 VKAPI_ATTR void VKAPI_CALL
1406 radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequirementsInfo2 *pInfo,
1407 VkMemoryRequirements2 *pMemoryRequirements)
1408 {
1409 VK_FROM_HANDLE(radv_device, device, _device);
1410 VK_FROM_HANDLE(radv_image, image, pInfo->image);
1411 const struct radv_physical_device *pdev = radv_device_physical(device);
1412 uint32_t alignment;
1413 uint64_t size;
1414
1415 const VkImagePlaneMemoryRequirementsInfo *plane_info =
1416 vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO);
1417
1418 if (plane_info) {
1419 const uint32_t plane = radv_plane_from_aspect(plane_info->planeAspect);
1420
1421 size = image->planes[plane].surface.total_size;
1422 alignment = 1 << image->planes[plane].surface.alignment_log2;
1423 } else {
1424 size = image->size;
1425 alignment = image->alignment;
1426 }
1427
1428 pMemoryRequirements->memoryRequirements.memoryTypeBits =
1429 ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;
1430
1431 pMemoryRequirements->memoryRequirements.size = size;
1432 pMemoryRequirements->memoryRequirements.alignment = alignment;
1433
1434 vk_foreach_struct (ext, pMemoryRequirements->pNext) {
1435 switch (ext->sType) {
1436 case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
1437 VkMemoryDedicatedRequirements *req = (VkMemoryDedicatedRequirements *)ext;
1438 req->requiresDedicatedAllocation = image->shareable && image->vk.tiling != VK_IMAGE_TILING_LINEAR;
1439 req->prefersDedicatedAllocation = req->requiresDedicatedAllocation;
1440 break;
1441 }
1442 default:
1443 break;
1444 }
1445 }
1446 }
1447
1448 VKAPI_ATTR void VKAPI_CALL
1449 radv_GetDeviceImageMemoryRequirements(VkDevice device, const VkDeviceImageMemoryRequirements *pInfo,
1450 VkMemoryRequirements2 *pMemoryRequirements)
1451 {
1452 UNUSED VkResult result;
1453 VkImage image;
1454
1455 /* Determining the image size/alignment requires creating a surface, which is complicated without
1456 * actually creating an image.
1457 * TODO: Avoid creating an image.
1458 */
1459 result =
1460 radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
1461 assert(result == VK_SUCCESS);
1462
1463 VkImageMemoryRequirementsInfo2 info2 = {
1464 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
1465 .image = image,
1466 };
1467
1468 radv_GetImageMemoryRequirements2(device, &info2, pMemoryRequirements);
1469
1470 radv_DestroyImage(device, image, NULL);
1471 }
1472
1473 static uint32_t
1474 radv_surface_max_layer_count(struct radv_image_view *iview)
1475 {
1476 return iview->vk.view_type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth
1477 : (iview->vk.base_array_layer + iview->vk.layer_count);
1478 }
1479
1480 unsigned
1481 radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image)
1482 {
1483 const struct radv_physical_device *pdev = radv_device_physical(device);
1484
1485 if (pdev->info.gfx_level < GFX10 && image->vk.samples > 1) {
1486 if (image->planes[0].surface.bpe == 1)
1487 return V_028C78_MAX_BLOCK_SIZE_64B;
1488 else if (image->planes[0].surface.bpe == 2)
1489 return V_028C78_MAX_BLOCK_SIZE_128B;
1490 }
1491
1492 return V_028C78_MAX_BLOCK_SIZE_256B;
1493 }
1494
1495 void
1496 radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
1497 struct radv_image_view *iview)
1498 {
1499 const struct radv_physical_device *pdev = radv_device_physical(device);
1500 const struct radv_instance *instance = radv_physical_device_instance(pdev);
1501 uint64_t va;
1502 const struct radv_image_plane *plane = &iview->image->planes[iview->plane_id];
1503 const struct radeon_surf *surf = &plane->surface;
1504
1505 memset(cb, 0, sizeof(*cb));
1506
1507 const unsigned num_layers =
1508 iview->image->vk.image_type == VK_IMAGE_TYPE_3D ? (iview->extent.depth - 1) : (iview->image->vk.array_layers - 1);
1509
1510 const struct ac_cb_state cb_state = {
1511 .surf = surf,
1512 .format = radv_format_to_pipe_format(iview->vk.format),
1513 .width = vk_format_get_plane_width(iview->image->vk.format, iview->plane_id, iview->extent.width),
1514 .height = vk_format_get_plane_height(iview->image->vk.format, iview->plane_id, iview->extent.height),
1515 .first_layer = iview->vk.base_array_layer,
1516 .last_layer = radv_surface_max_layer_count(iview) - 1,
1517 .num_layers = num_layers,
1518 .num_samples = iview->image->vk.samples,
1519 .num_storage_samples = iview->image->vk.samples,
1520 .base_level = iview->vk.base_mip_level,
1521 .num_levels = iview->image->vk.mip_levels,
1522 .gfx10 =
1523 {
1524 .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
1525 },
1526 };
1527
1528 ac_init_cb_surface(&pdev->info, &cb_state, &cb->ac);
1529
1530 uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
1531 va = radv_image_get_va(iview->image, plane_id);
1532
1533 const struct ac_mutable_cb_state mutable_cb_state = {
1534 .surf = surf,
1535 .cb = &cb->ac,
1536 .va = va,
1537 .base_level = iview->vk.base_mip_level,
1538 .num_samples = iview->image->vk.samples,
1539 .fmask_enabled = radv_image_has_fmask(iview->image),
1540 .cmask_enabled = radv_image_has_cmask(iview->image),
1541 .fast_clear_enabled = !(instance->debug_flags & RADV_DEBUG_NO_FAST_CLEARS),
1542 .tc_compat_cmask_enabled = radv_image_is_tc_compat_cmask(iview->image),
1543 .dcc_enabled = radv_dcc_enabled(iview->image, iview->vk.base_mip_level) &&
1544 (pdev->info.gfx_level >= GFX11 || !iview->disable_dcc_mrt),
1545 .gfx10 =
1546 {
1547 .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
1548 },
1549 };
1550
1551 ac_set_mutable_cb_surface_fields(&pdev->info, &mutable_cb_state, &cb->ac);
1552 }
1553
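/* Sets up the depth/stencil registers for the VRS image: a D16_UNORM surface whose separate
 * HTILE buffer carries the 4-bit VRS rate encoding (stencil is marked invalid).
 */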
void
radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer, struct radv_ds_buffer_info *ds)
{
   const struct radeon_surf *surf = &image->planes[0].surface;

   assert(image->vk.format == VK_FORMAT_D16_UNORM);
   memset(ds, 0, sizeof(*ds));

   ds->ac.db_z_info = S_028038_FORMAT(V_028040_Z_16) | S_028038_SW_MODE(surf->u.gfx9.swizzle_mode) |
                      S_028038_ZRANGE_PRECISION(1) | S_028038_TILE_SURFACE_ENABLE(1);
   ds->ac.db_stencil_info = S_02803C_FORMAT(V_028044_STENCIL_INVALID);

   ds->ac.db_depth_size = S_02801C_X_MAX(image->vk.extent.width - 1) | S_02801C_Y_MAX(image->vk.extent.height - 1);

   ds->ac.u.gfx6.db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
   ds->ac.u.gfx6.db_htile_surface =
      S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) | S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
}

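/* Fills the depth/stencil (DB) state for an image view, mirroring radv_initialise_color_surface():
 * immutable fields via ac_init_ds_surface(), VA/HTILE-dependent fields via
 * ac_set_mutable_ds_surface_fields(), plus the GFX11 DB_RENDER_CONTROL tuning.
 */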
void
radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
                           struct radv_image_view *iview, VkImageAspectFlags ds_aspects)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned level = iview->vk.base_mip_level;
   bool stencil_only = iview->image->vk.format == VK_FORMAT_S8_UINT;

   assert(vk_format_get_plane_count(iview->image->vk.format) == 1);

   memset(ds, 0, sizeof(*ds));

   uint32_t max_slice = radv_surface_max_layer_count(iview) - 1;

   /* Recommended value for better performance with 4x and 8x. */
   ds->db_render_override2 = S_028010_DECOMPRESS_Z_ON_FLUSH(iview->image->vk.samples >= 4) |
                             S_028010_CENTROID_COMPUTATION_MODE(pdev->info.gfx_level >= GFX10_3);

   const struct ac_ds_state ds_state = {
      .surf = &iview->image->planes[0].surface,
      .va = radv_image_get_va(iview->image, 0),
      .format = radv_format_to_pipe_format(iview->image->vk.format),
      .width = iview->image->vk.extent.width,
      .height = iview->image->vk.extent.height,
      .level = level,
      .num_levels = iview->image->vk.mip_levels,
      .num_samples = iview->image->vk.samples,
      .first_layer = iview->vk.base_array_layer,
      .last_layer = max_slice,
      .stencil_only = stencil_only,
      .z_read_only = !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT),
      .stencil_read_only = !(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT),
      .htile_enabled = radv_htile_enabled(iview->image, level),
      .htile_stencil_disabled = radv_image_tile_stencil_disabled(device, iview->image),
      .vrs_enabled = radv_image_has_vrs_htile(device, iview->image),
   };

   ac_init_ds_surface(&pdev->info, &ds_state, &ds->ac);

   const struct ac_mutable_ds_state mutable_ds_state = {
      .ds = &ds->ac,
      .format = radv_format_to_pipe_format(iview->image->vk.format),
      .tc_compat_htile_enabled = radv_htile_enabled(iview->image, level) && radv_image_is_tc_compat_htile(iview->image),
      .zrange_precision = true,
      .no_d16_compression = true,
   };

   ac_set_mutable_ds_surface_fields(&pdev->info, &mutable_ds_state, &ds->ac);

   if (pdev->info.gfx_level >= GFX11) {
      radv_gfx11_set_db_render_control(device, iview->image->vk.samples, &ds->db_render_control);
   }
}

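/* On GFX11, programs DB_RENDER_CONTROL.MAX_ALLOWED_TILES_IN_WAVE based on the sample count, with
 * different limits for dGPUs (dedicated VRAM) and APUs.
 */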
void
radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples, unsigned *db_render_control)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned max_allowed_tiles_in_wave = 0;

   if (pdev->info.has_dedicated_vram) {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 6;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 13;
      else
         max_allowed_tiles_in_wave = 0;
   } else {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 7;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 15;
      else
         max_allowed_tiles_in_wave = 0;
   }

   *db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave);
}

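/* Exports a device memory allocation as an opaque or dma-buf file descriptor. For dedicated image
 * allocations the BO tiling metadata is written first so foreign consumers can interpret the
 * buffer.
 */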
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFD)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_device_memory, memory, pGetFdInfo->memory);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR);

   /* At the moment, we support only the below handle types. */
   assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
          pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);

   /* Set BO metadata for dedicated image allocations. We don't need it for import when the image
    * tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, but we set it anyway for foreign consumers.
    */
   if (memory->image) {
      struct radeon_bo_metadata metadata;

      assert(memory->image->bindings[0].offset == 0);
      radv_init_metadata(device, memory->image, &metadata);
      device->ws->buffer_set_metadata(device->ws, memory->bo, &metadata);
   }

   bool ret = device->ws->buffer_get_fd(device->ws, memory->bo, pFD);
   if (ret == false)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   return VK_SUCCESS;
}

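/* Returns the set of memory type indices whose domain/flag combination matches the imported BO,
 * with the flags in ignore_flags excluded from the comparison.
 */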
static uint32_t
radv_compute_valid_memory_types_attempt(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                        enum radeon_bo_flag flags, enum radeon_bo_flag ignore_flags)
{
   /* Don't count GTT/CPU as relevant:
    *
    * - We're not fully consistent between the two.
    * - Sometimes VRAM gets VRAM|GTT.
    */
   const enum radeon_bo_domain relevant_domains = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA;
   uint32_t bits = 0;
   for (unsigned i = 0; i < pdev->memory_properties.memoryTypeCount; ++i) {
      if ((domains & relevant_domains) != (pdev->memory_domains[i] & relevant_domains))
         continue;

      if ((flags & ~ignore_flags) != (pdev->memory_flags[i] & ~ignore_flags))
         continue;

      bits |= 1u << i;
   }

   return bits;
}

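/* Like the _attempt variant, but progressively relaxes the flag matching (first also ignoring
 * GTT_WC, then NO_CPU_ACCESS) until at least one memory type matches, and always drops 32-bit
 * memory types since they are unsuitable for shared memory.
 */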
static uint32_t
radv_compute_valid_memory_types(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                enum radeon_bo_flag flags)
{
   enum radeon_bo_flag ignore_flags = ~(RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC);
   uint32_t bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);

   if (!bits) {
      ignore_flags |= RADEON_FLAG_GTT_WC;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   if (!bits) {
      ignore_flags |= RADEON_FLAG_NO_CPU_ACCESS;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   /* Avoid 32-bit memory types for shared memory. */
   bits &= ~pdev->memory_types_32bit;

   return bits;
}

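/* Implements vkGetMemoryFdPropertiesKHR(): queries the dma-buf's domains/flags from the winsys
 * and maps them to importable memory types. Opaque handle types are rejected, as required by the
 * valid usage rules.
 */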
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, int fd,
                              VkMemoryFdPropertiesKHR *pMemoryFdProperties)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   struct radv_physical_device *pdev = radv_device_physical(device);

   switch (handleType) {
   case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: {
      enum radeon_bo_domain domains;
      enum radeon_bo_flag flags;
      if (!device->ws->buffer_get_flags_from_fd(device->ws, fd, &domains, &flags))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);

      pMemoryFdProperties->memoryTypeBits = radv_compute_valid_memory_types(pdev, domains, flags);
      return VK_SUCCESS;
   }
   default:
      /* The valid usage section for this function says:
       *
       *    "handleType must not be one of the handle types defined as
       *    opaque."
       *
       * So opaque handle types fall into the default "unsupported" case.
       */
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }
}

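/* Switches the GPU to the instance's profiling pstate (or back to the default) so performance
 * counters are sampled at stable clocks. The pstate is per-device, so programming it on a single
 * initialized hardware context is enough.
 */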
bool
radv_device_set_pstate(struct radv_device *device, bool enable)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   struct radeon_winsys *ws = device->ws;
   enum radeon_ctx_pstate pstate = enable ? instance->profile_pstate : RADEON_CTX_PSTATE_NONE;

   if (pdev->info.has_stable_pstate) {
      /* pstate is per-device; setting it for one ctx is sufficient.
       * We pick the first initialized one below.
       */
      for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++)
         if (device->hw_ctx[i])
            return ws->ctx_set_pstate(device->hw_ctx[i], pstate) >= 0;
   }

   return true;
}

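/* Reference-counted wrappers around radv_device_set_pstate(): the profiling pstate is entered on
 * the first acquire and left again when the last user releases it.
 */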
bool
radv_device_acquire_performance_counters(struct radv_device *device)
{
   bool result = true;
   simple_mtx_lock(&device->pstate_mtx);

   if (device->pstate_cnt == 0) {
      result = radv_device_set_pstate(device, true);
      if (result)
         ++device->pstate_cnt;
   }

   simple_mtx_unlock(&device->pstate_mtx);
   return result;
}

void
radv_device_release_performance_counters(struct radv_device *device)
{
   simple_mtx_lock(&device->pstate_mtx);

   if (--device->pstate_cnt == 0)
      radv_device_set_pstate(device, false);

   simple_mtx_unlock(&device->pstate_mtx);
}

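/* VK_KHR_performance_query profiling lock: acquiring/releasing it maps directly onto the
 * performance-counter pstate refcount above; pInfo (including its timeout) is not used here.
 */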
VKAPI_ATTR VkResult VKAPI_CALL
radv_AcquireProfilingLockKHR(VkDevice _device, const VkAcquireProfilingLockInfoKHR *pInfo)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   bool result = radv_device_acquire_performance_counters(device);
   return result ? VK_SUCCESS : VK_ERROR_UNKNOWN;
}

VKAPI_ATTR void VKAPI_CALL
radv_ReleaseProfilingLockKHR(VkDevice _device)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   radv_device_release_performance_counters(device);
}

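/* Like radv_GetDeviceImageMemoryRequirements(): creates a temporary internal image from the
 * create info, queries the subresource layout from it, then destroys it.
 */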
VKAPI_ATTR void VKAPI_CALL
radv_GetDeviceImageSubresourceLayout(VkDevice device, const VkDeviceImageSubresourceInfo *pInfo,
                                     VkSubresourceLayout2 *pLayout)
{
   UNUSED VkResult result;
   VkImage image;

   result =
      radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
   assert(result == VK_SUCCESS);

   radv_GetImageSubresourceLayout2(device, image, pInfo->pSubresource, pLayout);

   radv_DestroyImage(device, image, NULL);
}
