1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include <amdgpu.h>
9 #include <assert.h>
10 #include <libsync.h>
11 #include <pthread.h>
12 #include <stdlib.h>
13 #include "drm-uapi/amdgpu_drm.h"
14
15 #include "util/detect_os.h"
16 #include "util/os_time.h"
17 #include "util/u_memory.h"
18 #include "ac_debug.h"
19 #include "ac_linux_drm.h"
20 #include "radv_amdgpu_bo.h"
21 #include "radv_amdgpu_cs.h"
22 #include "radv_amdgpu_winsys.h"
23 #include "radv_debug.h"
24 #include "radv_radeon_winsys.h"
25 #include "sid.h"
26 #include "vk_alloc.h"
27 #include "vk_drm_syncobj.h"
28 #include "vk_sync.h"
29 #include "vk_sync_dummy.h"
30
31 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
32 * codes in the kernel).
33 */
34 #if DETECT_OS_OPENBSD
35 #define ENODATA ENOTSUP
36 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
37 #define ENODATA ECONNREFUSED
38 #endif
39
40 /* Maximum allowed total number of submitted IBs. */
41 #define RADV_MAX_IBS_PER_SUBMIT 192
42
43 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
44
45 struct radv_amdgpu_ib {
46 struct radeon_winsys_bo *bo; /* NULL when not owned by the current CS object */
47 uint64_t va;
48 unsigned cdw;
49 };
50
51 struct radv_amdgpu_cs_ib_info {
52 int64_t flags;
53 uint64_t ib_mc_address;
54 uint32_t size;
55 enum amd_ip_type ip_type;
56 };
57
58 struct radv_amdgpu_cs {
59 struct radeon_cmdbuf base;
60 struct radv_amdgpu_winsys *ws;
61
62 struct radv_amdgpu_cs_ib_info ib;
63
64 struct radeon_winsys_bo *ib_buffer;
65 uint8_t *ib_mapped;
66 unsigned max_num_buffers;
67 unsigned num_buffers;
68 struct drm_amdgpu_bo_list_entry *handles;
69
70 struct radv_amdgpu_ib *ib_buffers;
71 unsigned num_ib_buffers;
72 unsigned max_num_ib_buffers;
73 unsigned *ib_size_ptr;
74 VkResult status;
75 struct radv_amdgpu_cs *chained_to;
76 bool use_ib;
77 bool is_secondary;
78
79 int buffer_hash_table[1024];
80 unsigned hw_ip;
81
82 unsigned num_virtual_buffers;
83 unsigned max_num_virtual_buffers;
84 struct radeon_winsys_bo **virtual_buffers;
85 int *virtual_buffer_hash_table;
86
87 struct hash_table *annotations;
88 };
89
90 struct radv_winsys_sem_counts {
91 uint32_t syncobj_count;
92 uint32_t timeline_syncobj_count;
93 uint32_t *syncobj;
94 uint64_t *points;
95 };
96
97 struct radv_winsys_sem_info {
98 bool cs_emit_signal;
99 bool cs_emit_wait;
100 struct radv_winsys_sem_counts wait;
101 struct radv_winsys_sem_counts signal;
102 };
103
104 static void
105 radeon_emit_unchecked(struct radeon_cmdbuf *cs, uint32_t value)
106 {
107 cs->buf[cs->cdw++] = value;
108 }
109
110 static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring);
111
112 static inline struct radv_amdgpu_cs *
113 radv_amdgpu_cs(struct radeon_cmdbuf *base)
114 {
115 return (struct radv_amdgpu_cs *)base;
116 }
117
118 static bool
119 ring_can_use_ib_bos(const struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
120 {
121 return ws->use_ib_bos && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
122 }
123
124 struct radv_amdgpu_cs_request {
125 /** Specify HW IP block type to which to send the IB. */
126 unsigned ip_type;
127
128 /** IP instance index if there are several IPs of the same type. */
129 unsigned ip_instance;
130
131 /**
132 * Specify ring index of the IP. We could have several rings
133 * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
134 */
135 uint32_t ring;
136
137 /**
138 * BO list handles used by this request.
139 */
140 struct drm_amdgpu_bo_list_entry *handles;
141 uint32_t num_handles;
142
143 /** Number of IBs to submit in the field ibs. */
144 uint32_t number_of_ibs;
145
146 /**
147 * IBs to submit. Those IBs will be submitted together as single entity
148 */
149 struct radv_amdgpu_cs_ib_info *ibs;
150
151 /**
152 * The returned sequence number for the command submission
153 */
154 uint64_t seq_no;
155 };
156
157 static VkResult radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
158 struct radv_winsys_sem_info *sem_info);
159
160 static void
161 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
162 struct radv_amdgpu_cs_request *req)
163 {
164 fence->fence.ip_type = req->ip_type;
165 fence->fence.ip_instance = req->ip_instance;
166 fence->fence.ring = req->ring;
167 fence->fence.fence = req->seq_no;
168 }
169
170 static struct radv_amdgpu_cs_ib_info
171 radv_amdgpu_cs_ib_to_info(struct radv_amdgpu_cs *cs, struct radv_amdgpu_ib ib)
172 {
173 struct radv_amdgpu_cs_ib_info info = {
174 .flags = 0,
175 .ip_type = cs->hw_ip,
176 .ib_mc_address = ib.va,
177 .size = ib.cdw,
178 };
179 return info;
180 }
181
182 static void
183 radv_amdgpu_cs_free_annotation(struct hash_entry *entry)
184 {
185 free(entry->data);
186 }
187
188 static void
189 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
190 {
191 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
192
193 _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
194
195 if (cs->ib_buffer)
196 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
197
198 for (unsigned i = 0; i < cs->num_ib_buffers; ++i)
199 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
200
201 free(cs->ib_buffers);
202 free(cs->virtual_buffers);
203 free(cs->virtual_buffer_hash_table);
204 free(cs->handles);
205 free(cs);
206 }
207
208 static void
209 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum amd_ip_type ip_type)
210 {
211 for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
212 cs->buffer_hash_table[i] = -1;
213
214 cs->hw_ip = ip_type;
215 }
216
217 static enum radeon_bo_domain
218 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
219 {
220 const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
221
222 bool enough_vram = ws->info.all_vram_visible ||
223 p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= (uint64_t)ws->info.vram_vis_size_kb * 1024;
224
225 /* Bandwidth should be equivalent to at least PCIe 3.0 x8.
226 * If there is no PCIe info, assume there is enough bandwidth.
227 */
228 bool enough_bandwidth = !ws->info.has_pcie_bandwidth_info || ws->info.pcie_bandwidth_mbps >= 8 * 0.985 * 1024;
229
230 bool use_sam =
231 (enough_vram && enough_bandwidth && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
232 (ws->perftest & RADV_PERFTEST_SAM);
233 return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
234 }
235
236 static VkResult
237 radv_amdgpu_cs_bo_create(struct radv_amdgpu_cs *cs, uint32_t ib_size)
238 {
239 struct radeon_winsys *ws = &cs->ws->base;
240
241 /* Avoid memcpy from VRAM when a secondary cmdbuf can't always rely on IB2. */
242 const bool can_always_use_ib2 = cs->ws->info.gfx_level >= GFX8 && cs->hw_ip == AMD_IP_GFX;
243 const bool avoid_vram = cs->is_secondary && !can_always_use_ib2;
244 const enum radeon_bo_domain domain = avoid_vram ? RADEON_DOMAIN_GTT : radv_amdgpu_cs_domain(ws);
245 const enum radeon_bo_flag gtt_wc_flag = avoid_vram ? 0 : RADEON_FLAG_GTT_WC;
246 const enum radeon_bo_flag flags =
247 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
248
249 return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
250 &cs->ib_buffer);
251 }
252
253 static VkResult
254 radv_amdgpu_cs_get_new_ib(struct radeon_cmdbuf *_cs, uint32_t ib_size)
255 {
256 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
257 VkResult result;
258
259 result = radv_amdgpu_cs_bo_create(cs, ib_size);
260 if (result != VK_SUCCESS)
261 return result;
262
263 cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
264 if (!cs->ib_mapped) {
265 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
266 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
267 }
268
269 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
270 cs->base.buf = (uint32_t *)cs->ib_mapped;
271 cs->base.cdw = 0;
272 cs->base.reserved_dw = 0;
273 cs->base.max_dw = ib_size / 4 - 4;
274 cs->ib.size = 0;
275 cs->ib.ip_type = cs->hw_ip;
276
277 if (cs->use_ib)
278 cs->ib_size_ptr = &cs->ib.size;
279
280 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
281
282 return VK_SUCCESS;
283 }
284
285 static unsigned
286 radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
287 {
288 const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
289 assert(util_is_power_of_two_nonzero(ib_alignment));
290 return align(20 * 1024 * 4, ib_alignment);
291 }
292
293 static struct radeon_cmdbuf *
294 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, bool is_secondary)
295 {
296 struct radv_amdgpu_cs *cs;
297 uint32_t ib_size = radv_amdgpu_cs_get_initial_size(radv_amdgpu_winsys(ws), ip_type);
298
299 cs = calloc(1, sizeof(struct radv_amdgpu_cs));
300 if (!cs)
301 return NULL;
302
303 cs->is_secondary = is_secondary;
304 cs->ws = radv_amdgpu_winsys(ws);
305 radv_amdgpu_init_cs(cs, ip_type);
306
307 cs->use_ib = ring_can_use_ib_bos(cs->ws, ip_type);
308
309 VkResult result = radv_amdgpu_cs_get_new_ib(&cs->base, ib_size);
310 if (result != VK_SUCCESS) {
311 free(cs);
312 return NULL;
313 }
314
315 return &cs->base;
316 }
317
318 static uint32_t
319 get_nop_packet(struct radv_amdgpu_cs *cs)
320 {
321 switch (cs->hw_ip) {
322 case AMDGPU_HW_IP_GFX:
323 case AMDGPU_HW_IP_COMPUTE:
324 return cs->ws->info.gfx_ib_pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
325 case AMDGPU_HW_IP_DMA:
326 return cs->ws->info.gfx_level == GFX6 ? 0xF0000000 : SDMA_NOP_PAD;
327 case AMDGPU_HW_IP_UVD:
328 case AMDGPU_HW_IP_UVD_ENC:
329 return PKT2_NOP_PAD;
330 case AMDGPU_HW_IP_VCN_DEC:
331 return 0x81FF;
332 case AMDGPU_HW_IP_VCN_ENC:
333 return 0; /* NOPs are illegal in encode, so don't pad */
334 default:
335 unreachable("Unknown IP type");
336 }
337 }
338
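/* Record an IB (BO handle, GPU VA and size in dwords) in the CS's ib_buffers
 * array, doubling the array capacity on demand. On allocation failure the CS
 * status is set to VK_ERROR_OUT_OF_HOST_MEMORY and the IB is dropped.
 */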
339 static void
340 radv_amdgpu_cs_add_ib_buffer(struct radv_amdgpu_cs *cs, struct radeon_winsys_bo *bo, uint64_t va, uint32_t cdw)
341 {
342 if (cs->num_ib_buffers == cs->max_num_ib_buffers) {
343 unsigned max_num_ib_buffers = MAX2(1, cs->max_num_ib_buffers * 2);
344 struct radv_amdgpu_ib *ib_buffers = realloc(cs->ib_buffers, max_num_ib_buffers * sizeof(*ib_buffers));
345 if (!ib_buffers) {
346 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
347 return;
348 }
349 cs->max_num_ib_buffers = max_num_ib_buffers;
350 cs->ib_buffers = ib_buffers;
351 }
352
353 cs->ib_buffers[cs->num_ib_buffers].bo = bo;
354 cs->ib_buffers[cs->num_ib_buffers].va = va;
355 cs->ib_buffers[cs->num_ib_buffers++].cdw = cdw;
356 }
357
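/* Take back the most recently recorded IB as the current IB buffer. Used when
 * growing the CS fails and when a finalized CS is reset, so the last IB can be
 * reused for recording.
 */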
358 static void
359 radv_amdgpu_restore_last_ib(struct radv_amdgpu_cs *cs)
360 {
361 struct radv_amdgpu_ib *ib = &cs->ib_buffers[--cs->num_ib_buffers];
362 assert(ib->bo);
363 cs->ib_buffer = ib->bo;
364 }
365
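/* Grow the CS by allocating a new IB buffer (at least min_size dwords plus
 * slack) and, when IB chaining is supported, patching the tail of the current
 * IB with an INDIRECT_BUFFER packet that chains into the new one.
 */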
366 static void
367 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
368 {
369 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
370
371 if (cs->status != VK_SUCCESS) {
372 cs->base.cdw = 0;
373 return;
374 }
375
376 const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
377
378 cs->ws->base.cs_finalize(_cs);
379
380 uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
381
382 ib_size = align(MIN2(ib_size, ~C_3F2_IB_SIZE), ib_alignment);
383
384 VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
385
386 if (result != VK_SUCCESS) {
387 cs->base.cdw = 0;
388 cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
389 radv_amdgpu_restore_last_ib(cs);
390 }
391
392 cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
393 if (!cs->ib_mapped) {
394 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
395 cs->base.cdw = 0;
396
397 /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
398 cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
399 radv_amdgpu_restore_last_ib(cs);
400 }
401
402 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
403
404 if (cs->use_ib) {
405 cs->base.buf[cs->base.cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
406 cs->base.buf[cs->base.cdw - 3] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
407 cs->base.buf[cs->base.cdw - 2] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
408 cs->base.buf[cs->base.cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
409
410 cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
411 }
412
413 cs->base.buf = (uint32_t *)cs->ib_mapped;
414 cs->base.cdw = 0;
415 cs->base.reserved_dw = 0;
416 cs->base.max_dw = ib_size / 4 - 4;
417 }
418
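/* Pad the CS with NOPs so that the IB size (plus leave_dw_space) satisfies the
 * IP's padding mask. GFX/COMPUTE use a single variable-length PKT3_NOP (or a
 * type-2 NOP for a 1-dword gap); other IPs are padded with repeated NOPs.
 */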
419 static void
420 radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
421 {
422 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
423 const enum amd_ip_type ip_type = cs->hw_ip;
424 const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
425 const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;
426
427 if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
428 if (unaligned_dw) {
429 const int remaining = pad_dw_mask + 1 - unaligned_dw;
430
431 /* Only pad by 1 dword with the type-2 NOP if necessary. */
432 if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) {
433 radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
434 } else {
435 /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
436 * packet. The size of the packet body after the header is always count + 1.
437 * If count == -1, there is no packet body. NOP is the only packet that can have
438 * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
439 */
440 radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0));
441 cs->base.cdw += remaining - 1;
442 }
443 }
444 } else {
445 /* Don't pad VCN encode/unified IBs; they have no NOP packet. */
446 if (ip_type == AMDGPU_HW_IP_VCN_ENC)
447 return;
448
449 /* Don't pad zero-length UVD IBs because of a kernel quirk. */
450 if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
451 return;
452
453 const uint32_t nop_packet = get_nop_packet(cs);
454
455 while (!cs->base.cdw || (cs->base.cdw & pad_dw_mask))
456 radeon_emit_unchecked(&cs->base, nop_packet);
457 }
458
459 assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
460 }
461
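/* Finish recording: pad the IB, reserve a 4-dword tail for a possible
 * INDIRECT_BUFFER chain packet when IB chaining is used, and append the
 * current IB to ib_buffers so submission and reset can find it.
 */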
462 static VkResult
463 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
464 {
465 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
466
467 assert(cs->base.cdw <= cs->base.reserved_dw);
468
469 if (cs->use_ib) {
470 const uint32_t nop_packet = get_nop_packet(cs);
471
472 /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
473 radv_amdgpu_winsys_cs_pad(_cs, 4);
474
475 radeon_emit_unchecked(&cs->base, nop_packet);
476 radeon_emit_unchecked(&cs->base, nop_packet);
477 radeon_emit_unchecked(&cs->base, nop_packet);
478 radeon_emit_unchecked(&cs->base, nop_packet);
479
480 assert(cs->base.cdw <= ~C_3F2_IB_SIZE);
481 *cs->ib_size_ptr |= cs->base.cdw;
482 } else {
483 radv_amdgpu_winsys_cs_pad(_cs, 0);
484 }
485
486 /* Append the current (last) IB to the array of IB buffers. */
487 radv_amdgpu_cs_add_ib_buffer(cs, cs->ib_buffer, cs->ib_buffer->va,
488 cs->use_ib ? G_3F2_IB_SIZE(*cs->ib_size_ptr) : cs->base.cdw);
489
490 /* Prevent freeing this BO twice. */
491 cs->ib_buffer = NULL;
492
493 cs->chained_to = NULL;
494
495 assert(cs->base.cdw <= cs->base.max_dw + 4);
496
497 return cs->status;
498 }
499
500 static void
501 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
502 {
503 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
504 cs->base.cdw = 0;
505 cs->base.reserved_dw = 0;
506 cs->status = VK_SUCCESS;
507
508 for (unsigned i = 0; i < cs->num_buffers; ++i) {
509 unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
510 cs->buffer_hash_table[hash] = -1;
511 }
512
513 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
514 unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
515 cs->virtual_buffer_hash_table[hash] = -1;
516 }
517
518 cs->num_buffers = 0;
519 cs->num_virtual_buffers = 0;
520
521 /* If the CS was finalized, the current IB was moved to ib_buffers; restore the last one as the current IB. */
522 assert(cs->ib_buffer || cs->num_ib_buffers);
523 if (!cs->ib_buffer)
524 radv_amdgpu_restore_last_ib(cs);
525
526 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
527
528 for (unsigned i = 0; i < cs->num_ib_buffers; ++i)
529 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
530
531 cs->num_ib_buffers = 0;
532 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
533
534 cs->ib.size = 0;
535
536 if (cs->use_ib)
537 cs->ib_size_ptr = &cs->ib.size;
538
539 _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
540 cs->annotations = NULL;
541 }
542
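/* Undo a previous chain: overwrite the INDIRECT_BUFFER packet at the tail of
 * the CS with NOPs so it can be submitted without the next CS.
 */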
543 static void
544 radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
545 {
546 struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
547
548 if (!acs->chained_to)
549 return;
550
551 assert(cs->cdw <= cs->max_dw + 4);
552
553 acs->chained_to = NULL;
554 cs->buf[cs->cdw - 4] = PKT3_NOP_PAD;
555 cs->buf[cs->cdw - 3] = PKT3_NOP_PAD;
556 cs->buf[cs->cdw - 2] = PKT3_NOP_PAD;
557 cs->buf[cs->cdw - 1] = PKT3_NOP_PAD;
558 }
559
560 static bool
561 radv_amdgpu_cs_chain(struct radeon_cmdbuf *cs, struct radeon_cmdbuf *next_cs, bool pre_ena)
562 {
563 /* Chains together two CS (command stream) objects by editing
564 * the end of the first CS to add a command that jumps to the
565 * second CS.
566 *
567 * After this, it is enough to submit the first CS to the GPU
568 * and not necessary to submit the second CS because it is already
569 * executed by the first.
570 */
571
572 struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
573 struct radv_amdgpu_cs *next_acs = radv_amdgpu_cs(next_cs);
574
575 /* Only some HW IP types have packets that we can use for chaining. */
576 if (!acs->use_ib)
577 return false;
578
579 assert(cs->cdw <= cs->max_dw + 4);
580
581 acs->chained_to = next_acs;
582
583 cs->buf[cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
584 cs->buf[cs->cdw - 3] = next_acs->ib.ib_mc_address;
585 cs->buf[cs->cdw - 2] = next_acs->ib.ib_mc_address >> 32;
586 cs->buf[cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(pre_ena) | next_acs->ib.size;
587
588 return true;
589 }
590
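/* Look up a BO handle in the CS buffer list, using the direct-mapped hash
 * table as a cache and falling back to a linear search (which also refreshes
 * the cached index). Returns -1 if the BO is not in the list.
 */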
591 static int
592 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
593 {
594 unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
595 int index = cs->buffer_hash_table[hash];
596
597 if (index == -1)
598 return -1;
599
600 if (cs->handles[index].bo_handle == bo)
601 return index;
602
603 for (unsigned i = 0; i < cs->num_buffers; ++i) {
604 if (cs->handles[i].bo_handle == bo) {
605 cs->buffer_hash_table[hash] = i;
606 return i;
607 }
608 }
609
610 return -1;
611 }
612
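/* Append a BO handle/priority pair to the submission buffer list if it is not
 * already present, growing the handles array as needed and updating the hash
 * table used for fast duplicate detection.
 */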
613 static void
614 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
615 {
616 unsigned hash;
617 int index = radv_amdgpu_cs_find_buffer(cs, bo);
618
619 if (index != -1)
620 return;
621
622 if (cs->num_buffers == cs->max_num_buffers) {
623 unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
624 struct drm_amdgpu_bo_list_entry *new_entries =
625 realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
626 if (new_entries) {
627 cs->max_num_buffers = new_count;
628 cs->handles = new_entries;
629 } else {
630 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
631 return;
632 }
633 }
634
635 cs->handles[cs->num_buffers].bo_handle = bo;
636 cs->handles[cs->num_buffers].bo_priority = priority;
637
638 hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
639 cs->buffer_hash_table[hash] = cs->num_buffers;
640
641 ++cs->num_buffers;
642 }
643
644 static void
645 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
646 {
647 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
648 unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
649
650 if (!cs->virtual_buffer_hash_table) {
651 int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
652 if (!virtual_buffer_hash_table) {
653 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
654 return;
655 }
656 cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
657
658 for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
659 cs->virtual_buffer_hash_table[i] = -1;
660 }
661
662 if (cs->virtual_buffer_hash_table[hash] >= 0) {
663 int idx = cs->virtual_buffer_hash_table[hash];
664 if (cs->virtual_buffers[idx] == bo) {
665 return;
666 }
667 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
668 if (cs->virtual_buffers[i] == bo) {
669 cs->virtual_buffer_hash_table[hash] = i;
670 return;
671 }
672 }
673 }
674
675 if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
676 unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
677 struct radeon_winsys_bo **virtual_buffers =
678 realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
679 if (!virtual_buffers) {
680 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
681 return;
682 }
683 cs->max_num_virtual_buffers = max_num_virtual_buffers;
684 cs->virtual_buffers = virtual_buffers;
685 }
686
687 cs->virtual_buffers[cs->num_virtual_buffers] = bo;
688
689 cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
690 ++cs->num_virtual_buffers;
691 }
692
693 static void
694 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
695 {
696 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
697 struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
698
699 if (cs->status != VK_SUCCESS)
700 return;
701
702 if (bo->is_virtual) {
703 radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
704 return;
705 }
706
707 radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
708 }
709
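/* Execute a secondary CS from a primary one. On GFX with IB chaining the
 * secondary is launched as an IB2; otherwise its IBs are copied dword by dword
 * into the primary CS, growing the primary as needed.
 */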
710 static void
711 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, bool allow_ib2)
712 {
713 struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
714 struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
715 struct radv_amdgpu_winsys *ws = parent->ws;
716 const bool use_ib2 = parent->use_ib && !parent->is_secondary && allow_ib2 && parent->hw_ip == AMD_IP_GFX;
717
718 if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
719 return;
720
721 for (unsigned i = 0; i < child->num_buffers; ++i) {
722 radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle, child->handles[i].bo_priority);
723 }
724
725 for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
726 radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
727 }
728
729 if (use_ib2) {
730 if (parent->base.cdw + 4 > parent->base.max_dw)
731 radv_amdgpu_cs_grow(&parent->base, 4);
732
733 parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + 4);
734
735 /* Not setting the CHAIN bit will launch an IB2. */
736 radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
737 radeon_emit(&parent->base, child->ib.ib_mc_address);
738 radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
739 radeon_emit(&parent->base, child->ib.size);
740 } else {
741 assert(parent->use_ib == child->use_ib);
742
743 /* Grow the current CS and copy the contents of the secondary CS. */
744 for (unsigned i = 0; i < child->num_ib_buffers; i++) {
745 struct radv_amdgpu_ib *ib = &child->ib_buffers[i];
746 uint32_t cdw = ib->cdw;
747 uint8_t *mapped;
748
749 /* Do not copy the original chain link for IBs. */
750 if (child->use_ib)
751 cdw -= 4;
752
753 assert(ib->bo);
754
755 if (parent->base.cdw + cdw > parent->base.max_dw)
756 radv_amdgpu_cs_grow(&parent->base, cdw);
757
758 parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + cdw);
759
760 mapped = radv_buffer_map(&ws->base, ib->bo);
761 if (!mapped) {
762 parent->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
763 return;
764 }
765
766 memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * cdw);
767 parent->base.cdw += cdw;
768 }
769 }
770 }
771
772 static void
773 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, uint64_t va, const uint32_t cdw,
774 const bool predicate)
775 {
776 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
777 const uint64_t ib_va = bo ? bo->va : va;
778
779 if (cs->status != VK_SUCCESS)
780 return;
781
782 assert(ib_va && ib_va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
783 assert(cs->hw_ip == AMD_IP_GFX && cdw <= ~C_3F2_IB_SIZE);
784
785 radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
786 radeon_emit(&cs->base, ib_va);
787 radeon_emit(&cs->base, ib_va >> 32);
788 radeon_emit(&cs->base, cdw);
789 }
790
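/* Execute a DGC (device generated commands) IB. On GFX this is a plain IB2
 * launch. On other queues, the DGC trailer is patched via WRITE_DATA so that
 * it chains back into a freshly allocated IB, and the current IB is finalized
 * and chained into the DGC IB.
 */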
791 static void
792 radv_amdgpu_cs_chain_dgc_ib(struct radeon_cmdbuf *_cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
793 const bool predicate)
794 {
795 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
796
797 if (cs->status != VK_SUCCESS)
798 return;
799
800 assert(cs->ws->info.gfx_level >= GFX8);
801
802 if (cs->hw_ip == AMD_IP_GFX) {
803 /* Use IB2 for executing DGC CS on GFX. */
804 cs->ws->base.cs_execute_ib(_cs, NULL, va, cdw, predicate);
805 } else {
806 assert(va && va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
807 assert(cdw <= ~C_3F2_IB_SIZE);
808
809 /* Emit a WRITE_DATA packet to patch the DGC CS. */
810 const uint32_t chain_data[] = {
811 PKT3(PKT3_INDIRECT_BUFFER, 2, 0),
812 0,
813 0,
814 S_3F2_CHAIN(1) | S_3F2_VALID(1),
815 };
816
817 radeon_emit(&cs->base, PKT3(PKT3_WRITE_DATA, 2 + ARRAY_SIZE(chain_data), false));
818 radeon_emit(&cs->base, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
819 radeon_emit(&cs->base, trailer_va);
820 radeon_emit(&cs->base, trailer_va >> 32);
821 radeon_emit_array(&cs->base, chain_data, ARRAY_SIZE(chain_data));
822
823 /* Keep pointers for patching later. */
824 uint64_t *ib_va_ptr = (uint64_t *)(cs->base.buf + cs->base.cdw - 3);
825 uint32_t *ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
826
827 /* Writeback L2 because CP isn't coherent with L2 on GFX6-8. */
828 if (cs->ws->info.gfx_level == GFX8) {
829 radeon_emit(&cs->base, PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(1));
830 radeon_emit(&cs->base, S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
831 radeon_emit(&cs->base, 0xffffffff);
832 radeon_emit(&cs->base, 0xff);
833 radeon_emit(&cs->base, 0);
834 radeon_emit(&cs->base, 0);
835 radeon_emit(&cs->base, 0x0000000A);
836 }
837
838 /* Finalize the current CS. */
839 cs->ws->base.cs_finalize(_cs);
840
841 /* Chain the current CS to the DGC CS. */
842 _cs->buf[_cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
843 _cs->buf[_cs->cdw - 3] = va;
844 _cs->buf[_cs->cdw - 2] = va >> 32;
845 _cs->buf[_cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | cdw;
846
847 /* Allocate a new CS BO with initial size. */
848 const uint64_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
849
850 VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
851 if (result != VK_SUCCESS) {
852 cs->base.cdw = 0;
853 cs->status = result;
854 return;
855 }
856
857 cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
858 if (!cs->ib_mapped) {
859 cs->base.cdw = 0;
860 cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
861 return;
862 }
863
864 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
865
866 /* Chain back the trailer (DGC CS) to the newly created one. */
867 *ib_va_ptr = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
868 cs->ib_size_ptr = ib_size_ptr;
869
870 cs->base.buf = (uint32_t *)cs->ib_mapped;
871 cs->base.cdw = 0;
872 cs->base.reserved_dw = 0;
873 cs->base.max_dw = ib_size / 4 - 4;
874 }
875 }
876
877 static unsigned
878 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
879 {
880 unsigned num_bo = 0;
881
882 for (struct radv_amdgpu_cs *cs = start_cs; cs; cs = cs->chained_to) {
883 num_bo += cs->num_buffers;
884 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
885 num_bo += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
886 }
887
888 return num_bo;
889 }
890
891 static unsigned
892 radv_amdgpu_count_cs_array_bo(struct radeon_cmdbuf **cs_array, unsigned num_cs)
893 {
894 unsigned num_bo = 0;
895
896 for (unsigned i = 0; i < num_cs; ++i) {
897 num_bo += radv_amdgpu_count_cs_bo(radv_amdgpu_cs(cs_array[i]));
898 }
899
900 return num_bo;
901 }
902
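/* Merge the BO handles of one CS (including the backing BOs of its virtual
 * buffers) into the submission BO list, skipping handles that are already
 * present. Returns the new number of handles.
 */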
903 static unsigned
904 radv_amdgpu_add_cs_to_bo_list(struct radv_amdgpu_cs *cs, struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
905 {
906 if (!cs->num_buffers)
907 return num_handles;
908
909 if (num_handles == 0 && !cs->num_virtual_buffers) {
910 memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
911 return cs->num_buffers;
912 }
913
914 int unique_bo_so_far = num_handles;
915 for (unsigned j = 0; j < cs->num_buffers; ++j) {
916 bool found = false;
917 for (unsigned k = 0; k < unique_bo_so_far; ++k) {
918 if (handles[k].bo_handle == cs->handles[j].bo_handle) {
919 found = true;
920 break;
921 }
922 }
923 if (!found) {
924 handles[num_handles] = cs->handles[j];
925 ++num_handles;
926 }
927 }
928 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
929 struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
930 u_rwlock_rdlock(&virtual_bo->lock);
931 for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
932 struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
933 bool found = false;
934 for (unsigned m = 0; m < num_handles; ++m) {
935 if (handles[m].bo_handle == bo->bo_handle) {
936 found = true;
937 break;
938 }
939 }
940 if (!found) {
941 handles[num_handles].bo_handle = bo->bo_handle;
942 handles[num_handles].bo_priority = bo->priority;
943 ++num_handles;
944 }
945 }
946 u_rwlock_rdunlock(&virtual_bo->lock);
947 }
948
949 return num_handles;
950 }
951
952 static unsigned
953 radv_amdgpu_add_cs_array_to_bo_list(struct radeon_cmdbuf **cs_array, unsigned num_cs,
954 struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
955 {
956 for (unsigned i = 0; i < num_cs; ++i) {
957 for (struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); cs; cs = cs->chained_to) {
958 num_handles = radv_amdgpu_add_cs_to_bo_list(cs, handles, num_handles);
959 }
960 }
961
962 return num_handles;
963 }
964
965 static unsigned
966 radv_amdgpu_copy_global_bo_list(struct radv_amdgpu_winsys *ws, struct drm_amdgpu_bo_list_entry *handles)
967 {
968 for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
969 handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
970 handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
971 }
972
973 return ws->global_bo_list.count;
974 }
975
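/* Build the BO list for a submission. Fast paths: copy the global BO list when
 * all BOs are being tracked for debugging, or copy a single CS's handles when
 * no preambles, postambles or virtual buffers are involved; otherwise merge
 * all sources with duplicate elimination.
 */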
976 static VkResult
977 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array, unsigned count,
978 struct radeon_cmdbuf **initial_preamble_array, unsigned num_initial_preambles,
979 struct radeon_cmdbuf **continue_preamble_array, unsigned num_continue_preambles,
980 struct radeon_cmdbuf **postamble_array, unsigned num_postambles, unsigned *rnum_handles,
981 struct drm_amdgpu_bo_list_entry **rhandles)
982 {
983 struct drm_amdgpu_bo_list_entry *handles = NULL;
984 unsigned num_handles = 0;
985
986 if (ws->debug_all_bos) {
987 handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
988 if (!handles)
989 return VK_ERROR_OUT_OF_HOST_MEMORY;
990
991 num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
992 } else if (count == 1 && !num_initial_preambles && !num_continue_preambles && !num_postambles &&
993 !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !radv_amdgpu_cs(cs_array[0])->chained_to &&
994 !ws->global_bo_list.count) {
995 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
996 if (cs->num_buffers == 0)
997 return VK_SUCCESS;
998
999 handles = malloc(sizeof(handles[0]) * cs->num_buffers);
1000 if (!handles)
1001 return VK_ERROR_OUT_OF_HOST_MEMORY;
1002
1003 memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
1004 num_handles = cs->num_buffers;
1005 } else {
1006 unsigned total_buffer_count = ws->global_bo_list.count;
1007 total_buffer_count += radv_amdgpu_count_cs_array_bo(cs_array, count);
1008 total_buffer_count += radv_amdgpu_count_cs_array_bo(initial_preamble_array, num_initial_preambles);
1009 total_buffer_count += radv_amdgpu_count_cs_array_bo(continue_preamble_array, num_continue_preambles);
1010 total_buffer_count += radv_amdgpu_count_cs_array_bo(postamble_array, num_postambles);
1011
1012 if (total_buffer_count == 0)
1013 return VK_SUCCESS;
1014
1015 handles = malloc(sizeof(handles[0]) * total_buffer_count);
1016 if (!handles)
1017 return VK_ERROR_OUT_OF_HOST_MEMORY;
1018
1019 num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
1020 num_handles = radv_amdgpu_add_cs_array_to_bo_list(cs_array, count, handles, num_handles);
1021 num_handles =
1022 radv_amdgpu_add_cs_array_to_bo_list(initial_preamble_array, num_initial_preambles, handles, num_handles);
1023 num_handles =
1024 radv_amdgpu_add_cs_array_to_bo_list(continue_preamble_array, num_continue_preambles, handles, num_handles);
1025 num_handles = radv_amdgpu_add_cs_array_to_bo_list(postamble_array, num_postambles, handles, num_handles);
1026 }
1027
1028 *rhandles = handles;
1029 *rnum_handles = num_handles;
1030
1031 return VK_SUCCESS;
1032 }
1033
1034 static void
1035 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
1036 {
1037 radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring], request);
1038 }
1039
1040 static unsigned
1041 radv_amdgpu_get_num_ibs_per_cs(const struct radv_amdgpu_cs *cs)
1042 {
1043 unsigned num_ibs = 0;
1044
1045 if (cs->use_ib) {
1046 num_ibs = 1; /* Everything is chained. */
1047 } else {
1048 num_ibs = cs->num_ib_buffers;
1049 }
1050
1051 return num_ibs;
1052 }
1053
1054 static unsigned
1055 radv_amdgpu_count_ibs(struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned initial_preamble_count,
1056 unsigned continue_preamble_count, unsigned postamble_count)
1057 {
1058 unsigned num_ibs = 0;
1059
1060 for (unsigned i = 0; i < cs_count; i++) {
1061 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
1062
1063 num_ibs += radv_amdgpu_get_num_ibs_per_cs(cs);
1064 }
1065
1066 return MAX2(initial_preamble_count, continue_preamble_count) + num_ibs + postamble_count;
1067 }
1068
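/* Submit an array of command streams, splitting them into as many kernel
 * submissions as needed so that each one stays within RADV_MAX_IBS_PER_SUBMIT
 * and the per-IP IB limits. Initial preambles are used for the first
 * submission, continue preambles for the following ones, and postambles are
 * appended to every submission.
 */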
1069 static VkResult
1070 radv_amdgpu_winsys_cs_submit_internal(struct radv_amdgpu_ctx *ctx, int queue_idx, struct radv_winsys_sem_info *sem_info,
1071 struct radeon_cmdbuf **cs_array, unsigned cs_count,
1072 struct radeon_cmdbuf **initial_preamble_cs, unsigned initial_preamble_count,
1073 struct radeon_cmdbuf **continue_preamble_cs, unsigned continue_preamble_count,
1074 struct radeon_cmdbuf **postamble_cs, unsigned postamble_count,
1075 bool uses_shadow_regs)
1076 {
1077 VkResult result;
1078
1079 /* The last CS is the "gang leader"; its IP type determines which fence to signal. */
1080 struct radv_amdgpu_cs *last_cs = radv_amdgpu_cs(cs_array[cs_count - 1]);
1081 struct radv_amdgpu_winsys *ws = last_cs->ws;
1082
1083 const unsigned num_ibs =
1084 radv_amdgpu_count_ibs(cs_array, cs_count, initial_preamble_count, continue_preamble_count, postamble_count);
1085 const unsigned ib_array_size = MIN2(RADV_MAX_IBS_PER_SUBMIT, num_ibs);
1086
1087 STACK_ARRAY(struct radv_amdgpu_cs_ib_info, ibs, ib_array_size);
1088
1089 struct drm_amdgpu_bo_list_entry *handles = NULL;
1090 unsigned num_handles = 0;
1091
1092 u_rwlock_rdlock(&ws->global_bo_list.lock);
1093
1094 result = radv_amdgpu_get_bo_list(ws, &cs_array[0], cs_count, initial_preamble_cs, initial_preamble_count,
1095 continue_preamble_cs, continue_preamble_count, postamble_cs, postamble_count,
1096 &num_handles, &handles);
1097 if (result != VK_SUCCESS)
1098 goto fail;
1099
1100 /* Configure the CS request. */
1101 const uint32_t *max_ib_per_ip = ws->info.max_submitted_ibs;
1102 struct radv_amdgpu_cs_request request = {
1103 .ip_type = last_cs->hw_ip,
1104 .ip_instance = 0,
1105 .ring = queue_idx,
1106 .handles = handles,
1107 .num_handles = num_handles,
1108 .ibs = ibs,
1109 .number_of_ibs = 0, /* set below */
1110 };
1111
1112 for (unsigned cs_idx = 0, cs_ib_idx = 0; cs_idx < cs_count;) {
1113 struct radeon_cmdbuf **preambles = cs_idx ? continue_preamble_cs : initial_preamble_cs;
1114 const unsigned preamble_count = cs_idx ? continue_preamble_count : initial_preamble_count;
1115 const unsigned ib_per_submit = RADV_MAX_IBS_PER_SUBMIT - preamble_count - postamble_count;
1116 unsigned num_submitted_ibs = 0;
1117 unsigned ibs_per_ip[AMD_NUM_IP_TYPES] = {0};
1118
1119 /* Copy preambles to the submission. */
1120 for (unsigned i = 0; i < preamble_count; ++i) {
1121 /* Assume that the full preamble fits into 1 IB. */
1122 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(preambles[i]);
1123 struct radv_amdgpu_cs_ib_info ib;
1124
1125 assert(cs->num_ib_buffers == 1);
1126 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1127
1128 ibs[num_submitted_ibs++] = ib;
1129 ibs_per_ip[cs->hw_ip]++;
1130 }
1131
1132 for (unsigned i = 0; i < ib_per_submit && cs_idx < cs_count; ++i) {
1133 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[cs_idx]);
1134 struct radv_amdgpu_cs_ib_info ib;
1135
1136 if (cs_ib_idx == 0) {
1137 /* Make sure the whole CS fits into the same submission. */
1138 unsigned cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(cs);
1139 if (i + cs_num_ib > ib_per_submit || ibs_per_ip[cs->hw_ip] + cs_num_ib > max_ib_per_ip[cs->hw_ip])
1140 break;
1141
1142 if (cs->hw_ip != request.ip_type) {
1143 /* Found a "follower" CS in a gang submission.
1144 * Make sure to submit this together with its "leader", the next CS.
1145 * We rely on the caller to order each "follower" before its "leader."
1146 */
1147 assert(cs_idx != cs_count - 1);
1148 struct radv_amdgpu_cs *next_cs = radv_amdgpu_cs(cs_array[cs_idx + 1]);
1149 assert(next_cs->hw_ip == request.ip_type);
1150 unsigned next_cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(next_cs);
1151 if (i + cs_num_ib + next_cs_num_ib > ib_per_submit ||
1152 ibs_per_ip[next_cs->hw_ip] + next_cs_num_ib > max_ib_per_ip[next_cs->hw_ip])
1153 break;
1154 }
1155 }
1156
1157 /* When IBs are used, we only need to submit the main IB of this CS, because everything
1158 * else is chained to the first IB. Otherwise we must submit all IBs in the ib_buffers
1159 * array.
1160 */
1161 if (cs->use_ib) {
1162 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1163 cs_idx++;
1164 } else {
1165 assert(cs_ib_idx < cs->num_ib_buffers);
1166 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1167
1168 if (cs_ib_idx == cs->num_ib_buffers) {
1169 cs_idx++;
1170 cs_ib_idx = 0;
1171 }
1172 }
1173
1174 if (uses_shadow_regs && ib.ip_type == AMDGPU_HW_IP_GFX)
1175 ib.flags |= AMDGPU_IB_FLAG_PREEMPT;
1176
1177 assert(num_submitted_ibs < ib_array_size);
1178 ibs[num_submitted_ibs++] = ib;
1179 ibs_per_ip[cs->hw_ip]++;
1180 }
1181
1182 assert(num_submitted_ibs > preamble_count);
1183
1184 /* Copy postambles to the submission. */
1185 for (unsigned i = 0; i < postamble_count; ++i) {
1186 /* Assume that the full postamble fits into 1 IB. */
1187 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(postamble_cs[i]);
1188 struct radv_amdgpu_cs_ib_info ib;
1189
1190 assert(cs->num_ib_buffers == 1);
1191 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1192
1193 ibs[num_submitted_ibs++] = ib;
1194 ibs_per_ip[cs->hw_ip]++;
1195 }
1196
1197 /* Submit the CS. */
1198 request.number_of_ibs = num_submitted_ibs;
1199 result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1200 if (result != VK_SUCCESS)
1201 goto fail;
1202 }
1203
1204 free(request.handles);
1205
1206 if (result != VK_SUCCESS)
1207 goto fail;
1208
1209 radv_assign_last_submit(ctx, &request);
1210
1211 fail:
1212 u_rwlock_rdunlock(&ws->global_bo_list.lock);
1213 STACK_ARRAY_FINISH(ibs);
1214 return result;
1215 }
1216
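/* Handle a submission with no command streams: accumulate the wait syncobjs
 * into the per-queue syncobj via sync files, then propagate that syncobj to
 * all signal syncobjs, so synchronization semantics are preserved without
 * submitting any IBs to the kernel.
 */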
1217 static VkResult
1218 radv_amdgpu_cs_submit_zero(struct radv_amdgpu_ctx *ctx, enum amd_ip_type ip_type, int queue_idx,
1219 struct radv_winsys_sem_info *sem_info)
1220 {
1221 unsigned hw_ip = ip_type;
1222 unsigned queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, hw_ip, queue_idx);
1223 int ret;
1224
1225 if (!queue_syncobj)
1226 return VK_ERROR_OUT_OF_HOST_MEMORY;
1227
1228 if (sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) {
1229 int fd;
1230 ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, queue_syncobj, &fd);
1231 if (ret < 0)
1232 return VK_ERROR_DEVICE_LOST;
1233
1234 for (unsigned i = 0; i < sem_info->wait.syncobj_count; ++i) {
1235 int fd2;
1236 ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, sem_info->wait.syncobj[i], &fd2);
1237 if (ret < 0) {
1238 close(fd);
1239 return VK_ERROR_DEVICE_LOST;
1240 }
1241
1242 sync_accumulate("radv", &fd, fd2);
1243 close(fd2);
1244 }
1245 for (unsigned i = 0; i < sem_info->wait.timeline_syncobj_count; ++i) {
1246 int fd2;
1247 ret = ac_drm_cs_syncobj_export_sync_file2(
1248 ctx->ws->fd, sem_info->wait.syncobj[i + sem_info->wait.syncobj_count], sem_info->wait.points[i], 0, &fd2);
1249 if (ret < 0) {
1250 /* This works around a kernel bug where the fence isn't copied if it is already
1251 * signalled. Since it is already signalled it is totally fine to not wait on it.
1252 *
1253 * kernel patch: https://patchwork.freedesktop.org/patch/465583/ */
1254 uint64_t point;
1255 ret = ac_drm_cs_syncobj_query2(ctx->ws->fd, &sem_info->wait.syncobj[i + sem_info->wait.syncobj_count],
1256 &point, 1, 0);
1257 if (!ret && point >= sem_info->wait.points[i])
1258 continue;
1259
1260 close(fd);
1261 return VK_ERROR_DEVICE_LOST;
1262 }
1263
1264 sync_accumulate("radv", &fd, fd2);
1265 close(fd2);
1266 }
1267 ret = ac_drm_cs_syncobj_import_sync_file(ctx->ws->fd, queue_syncobj, fd);
1268 close(fd);
1269 if (ret < 0)
1270 return VK_ERROR_DEVICE_LOST;
1271
1272 ctx->queue_syncobj_wait[hw_ip][queue_idx] = true;
1273 }
1274
1275 for (unsigned i = 0; i < sem_info->signal.syncobj_count; ++i) {
1276 uint32_t dst_handle = sem_info->signal.syncobj[i];
1277 uint32_t src_handle = queue_syncobj;
1278
1279 if (ctx->ws->info.has_timeline_syncobj) {
1280 ret = ac_drm_cs_syncobj_transfer(ctx->ws->fd, dst_handle, 0, src_handle, 0, 0);
1281 if (ret < 0)
1282 return VK_ERROR_DEVICE_LOST;
1283 } else {
1284 int fd;
1285 ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, src_handle, &fd);
1286 if (ret < 0)
1287 return VK_ERROR_DEVICE_LOST;
1288
1289 ret = ac_drm_cs_syncobj_import_sync_file(ctx->ws->fd, dst_handle, fd);
1290 close(fd);
1291 if (ret < 0)
1292 return VK_ERROR_DEVICE_LOST;
1293 }
1294 }
1295 for (unsigned i = 0; i < sem_info->signal.timeline_syncobj_count; ++i) {
1296 ret = ac_drm_cs_syncobj_transfer(ctx->ws->fd, sem_info->signal.syncobj[i + sem_info->signal.syncobj_count],
1297 sem_info->signal.points[i], queue_syncobj, 0, 0);
1298 if (ret < 0)
1299 return VK_ERROR_DEVICE_LOST;
1300 }
1301 return VK_SUCCESS;
1302 }
1303
1304 static VkResult
1305 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, const struct radv_winsys_submit_info *submit,
1306 uint32_t wait_count, const struct vk_sync_wait *waits, uint32_t signal_count,
1307 const struct vk_sync_signal *signals)
1308 {
1309 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1310 struct radv_amdgpu_winsys *ws = ctx->ws;
1311 VkResult result;
1312 unsigned wait_idx = 0, signal_idx = 0;
1313
1314 STACK_ARRAY(uint64_t, wait_points, wait_count);
1315 STACK_ARRAY(uint32_t, wait_syncobj, wait_count);
1316 STACK_ARRAY(uint64_t, signal_points, signal_count);
1317 STACK_ARRAY(uint32_t, signal_syncobj, signal_count);
1318
1319 if (!wait_points || !wait_syncobj || !signal_points || !signal_syncobj) {
1320 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1321 goto out;
1322 }
1323
1324 for (uint32_t i = 0; i < wait_count; ++i) {
1325 if (waits[i].sync->type == &vk_sync_dummy_type)
1326 continue;
1327
1328 assert(waits[i].sync->type == &ws->syncobj_sync_type);
1329 wait_syncobj[wait_idx] = ((struct vk_drm_syncobj *)waits[i].sync)->syncobj;
1330 wait_points[wait_idx] = waits[i].wait_value;
1331 ++wait_idx;
1332 }
1333
1334 for (uint32_t i = 0; i < signal_count; ++i) {
1335 if (signals[i].sync->type == &vk_sync_dummy_type)
1336 continue;
1337
1338 assert(signals[i].sync->type == &ws->syncobj_sync_type);
1339 signal_syncobj[signal_idx] = ((struct vk_drm_syncobj *)signals[i].sync)->syncobj;
1340 signal_points[signal_idx] = signals[i].signal_value;
1341 ++signal_idx;
1342 }
1343
1344 assert(signal_idx <= signal_count);
1345 assert(wait_idx <= wait_count);
1346
1347 const uint32_t wait_timeline_syncobj_count =
1348 (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? wait_idx : 0;
1349 const uint32_t signal_timeline_syncobj_count =
1350 (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? signal_idx : 0;
1351
1352 struct radv_winsys_sem_info sem_info = {
1353 .wait =
1354 {
1355 .points = wait_points,
1356 .syncobj = wait_syncobj,
1357 .timeline_syncobj_count = wait_timeline_syncobj_count,
1358 .syncobj_count = wait_idx - wait_timeline_syncobj_count,
1359 },
1360 .signal =
1361 {
1362 .points = signal_points,
1363 .syncobj = signal_syncobj,
1364 .timeline_syncobj_count = signal_timeline_syncobj_count,
1365 .syncobj_count = signal_idx - signal_timeline_syncobj_count,
1366 },
1367 .cs_emit_wait = true,
1368 .cs_emit_signal = true,
1369 };
1370
1371 if (!submit->cs_count) {
1372 result = radv_amdgpu_cs_submit_zero(ctx, submit->ip_type, submit->queue_index, &sem_info);
1373 } else {
1374 result = radv_amdgpu_winsys_cs_submit_internal(
1375 ctx, submit->queue_index, &sem_info, submit->cs_array, submit->cs_count, submit->initial_preamble_cs,
1376 submit->initial_preamble_count, submit->continue_preamble_cs, submit->continue_preamble_count,
1377 submit->postamble_cs, submit->postamble_count, submit->uses_shadow_regs);
1378 }
1379
1380 out:
1381 STACK_ARRAY_FINISH(wait_points);
1382 STACK_ARRAY_FINISH(wait_syncobj);
1383 STACK_ARRAY_FINISH(signal_points);
1384 STACK_ARRAY_FINISH(signal_syncobj);
1385 return result;
1386 }
1387
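/* Translate a GPU VA into a CPU pointer for IB parsing: check the BO log for
 * use-after-free, then search the CS's own IBs and finally the global BO list,
 * mapping the matching BO and returning the offsetted address.
 */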
1388 static void
1389 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr, struct ac_addr_info *info)
1390 {
1391 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1392
1393 memset(info, 0, sizeof(struct ac_addr_info));
1394
1395 if (cs->ws->debug_log_bos) {
1396 u_rwlock_rdlock(&cs->ws->log_bo_list_lock);
1397 list_for_each_entry_rev (struct radv_amdgpu_winsys_bo_log, bo_log, &cs->ws->log_bo_list, list) {
1398 if (addr >= bo_log->va && addr - bo_log->va < bo_log->size) {
1399 info->use_after_free = bo_log->destroyed;
1400 break;
1401 }
1402 }
1403 u_rwlock_rdunlock(&cs->ws->log_bo_list_lock);
1404 }
1405
1406 if (info->use_after_free)
1407 return;
1408
1409 info->valid = !cs->ws->debug_all_bos;
1410
1411 for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
1412 struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1413 struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)ib->bo;
1414
1415 if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1416 void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1417 if (map) {
1418 info->cpu_addr = (char *)map + (addr - bo->base.va);
1419 info->valid = true;
1420 return;
1421 }
1422 }
1423 }
1424 u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1425 for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1426 struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1427 if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1428 void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1429 if (map) {
1430 u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1431 info->valid = true;
1432 info->cpu_addr = (char *)map + (addr - bo->base.va);
1433 return;
1434 }
1435 }
1436 }
1437 u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1438
1439 return;
1440 }
1441
1442 static void
1443 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids, int trace_id_count,
1444 enum radv_cs_dump_type type)
1445 {
1446 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1447 struct radv_amdgpu_winsys *ws = cs->ws;
1448
1449 if (cs->use_ib) {
1450 struct radv_amdgpu_cs_ib_info ib_info = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1451
1452 struct ac_addr_info addr_info;
1453 radv_amdgpu_winsys_get_cpu_addr(cs, ib_info.ib_mc_address, &addr_info);
1454 assert(addr_info.cpu_addr);
1455
1456 if (type == RADV_CS_DUMP_TYPE_IBS) {
1457 struct ac_ib_parser ib_parser = {
1458 .f = file,
1459 .ib = addr_info.cpu_addr,
1460 .num_dw = cs->ib_buffers[0].cdw,
1461 .trace_ids = trace_ids,
1462 .trace_id_count = trace_id_count,
1463 .gfx_level = ws->info.gfx_level,
1464 .vcn_version = ws->info.vcn_ip_version,
1465 .family = ws->info.family,
1466 .ip_type = cs->hw_ip,
1467 .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1468 .addr_callback_data = cs,
1469 .annotations = cs->annotations,
1470 };
1471
1472 ac_parse_ib(&ib_parser, "main IB");
1473 } else {
1474 uint32_t *ib_dw = addr_info.cpu_addr;
1475 ac_gather_context_rolls(file, &ib_dw, &cs->ib_buffers[0].cdw, 1, cs->annotations, &ws->info);
1476 }
1477 } else {
1478 uint32_t **ibs = type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t *)) : NULL;
1479 uint32_t *ib_dw_sizes =
1480 type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t)) : NULL;
1481
1482 for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1483 struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1484 char name[64];
1485 void *mapped;
1486
1487 mapped = radv_buffer_map(&ws->base, ib->bo);
1488 if (!mapped)
1489 continue;
1490
1491 if (cs->num_ib_buffers > 1) {
1492 snprintf(name, sizeof(name), "main IB (chunk %d)", i);
1493 } else {
1494 snprintf(name, sizeof(name), "main IB");
1495 }
1496
1497 if (type == RADV_CS_DUMP_TYPE_IBS) {
1498 struct ac_ib_parser ib_parser = {
1499 .f = file,
1500 .ib = mapped,
1501 .num_dw = ib->cdw,
1502 .trace_ids = trace_ids,
1503 .trace_id_count = trace_id_count,
1504 .gfx_level = ws->info.gfx_level,
1505 .vcn_version = ws->info.vcn_ip_version,
1506 .family = ws->info.family,
1507 .ip_type = cs->hw_ip,
1508 .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1509 .addr_callback_data = cs,
1510 .annotations = cs->annotations,
1511 };
1512
1513 ac_parse_ib(&ib_parser, name);
1514 } else {
1515 ibs[i] = mapped;
1516 ib_dw_sizes[i] = ib->cdw;
1517 }
1518 }
1519
1520 if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS) {
1521 ac_gather_context_rolls(file, ibs, ib_dw_sizes, cs->num_ib_buffers, cs->annotations, &ws->info);
1522
1523 free(ibs);
1524 free(ib_dw_sizes);
1525 }
1526 }
1527 }
1528
1529 static void
1530 radv_amdgpu_winsys_cs_annotate(struct radeon_cmdbuf *_cs, const char *annotation)
1531 {
1532 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1533
1534 if (!cs->annotations) {
1535 cs->annotations = _mesa_pointer_hash_table_create(NULL);
1536 if (!cs->annotations)
1537 return;
1538 }
1539
1540 struct hash_entry *entry = _mesa_hash_table_search(cs->annotations, _cs->buf + _cs->cdw);
1541 if (entry) {
1542 char *old_annotation = entry->data;
1543 char *new_annotation = calloc(strlen(old_annotation) + strlen(annotation) + 5, 1);
1544 sprintf(new_annotation, "%s -> %s", old_annotation, annotation);
1545 free(old_annotation);
1546 _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, new_annotation);
1547 } else {
1548 _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, strdup(annotation));
1549 }
1550 }
1551
1552 static uint32_t
1553 radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1554 {
1555 switch (radv_priority) {
1556 case RADEON_CTX_PRIORITY_REALTIME:
1557 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1558 case RADEON_CTX_PRIORITY_HIGH:
1559 return AMDGPU_CTX_PRIORITY_HIGH;
1560 case RADEON_CTX_PRIORITY_MEDIUM:
1561 return AMDGPU_CTX_PRIORITY_NORMAL;
1562 case RADEON_CTX_PRIORITY_LOW:
1563 return AMDGPU_CTX_PRIORITY_LOW;
1564 default:
1565 unreachable("Invalid context priority");
1566 }
1567 }
1568
1569 static VkResult
1570 radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority, struct radeon_winsys_ctx **rctx)
1571 {
1572 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1573 struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1574 uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1575 VkResult result;
1576 int r;
1577
1578 if (!ctx)
1579 return VK_ERROR_OUT_OF_HOST_MEMORY;
1580
1581 r = ac_drm_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx_handle);
1582    if (r == -EACCES) {
1583 result = VK_ERROR_NOT_PERMITTED;
1584 goto fail_create;
1585 } else if (r) {
1586       fprintf(stderr, "radv/amdgpu: ac_drm_cs_ctx_create2 failed. (%i)\n", r);
1587 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1588 goto fail_create;
1589 }
1590 ctx->ws = ws;
1591
1592 assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * 4 * sizeof(uint64_t) <= 4096);
1593 result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
1594 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_CS, 0,
1595 &ctx->fence_bo);
1596 if (result != VK_SUCCESS) {
1597 goto fail_alloc;
1598 }
1599
1600 *rctx = (struct radeon_winsys_ctx *)ctx;
1601 return VK_SUCCESS;
1602
1603 fail_alloc:
1604 ac_drm_cs_ctx_free(ws->dev, ctx->ctx_handle);
1605 fail_create:
1606 FREE(ctx);
1607 return result;
1608 }
1609
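/* Destroy a submission context: release the per-queue syncobjs, the user fence BO and the
 * kernel context itself.
 */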
1610 static void
1611 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1612 {
1613 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1614
1615 for (unsigned ip = 0; ip <= AMDGPU_HW_IP_NUM; ++ip) {
1616 for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
1617 if (ctx->queue_syncobj[ip][ring])
1618 ac_drm_cs_destroy_syncobj(ctx->ws->fd, ctx->queue_syncobj[ip][ring]);
1619 }
1620 }
1621
1622 ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
1623 ac_drm_cs_ctx_free(ctx->ws->dev, ctx->ctx_handle);
1624 FREE(ctx);
1625 }
1626
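/* Return the syncobj that tracks the last submission on the given IP/ring of this context.
 * It is created lazily, in the signaled state, on first use.
 */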
1627 static uint32_t
1628 radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
1629 {
1630 uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
1631 if (!*syncobj) {
1632 ac_drm_cs_create_syncobj2(ctx->ws->fd, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
1633 }
1634 return *syncobj;
1635 }
1636
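/* Check whether the last submission on the given queue has completed. Returns true if nothing
 * has been submitted yet; otherwise waits up to one second for the fence and returns false on
 * error or if the fence has not signaled in time.
 */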
1637 static bool
1638 radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, int ring_index)
1639 {
1640 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1641
1642 if (ctx->last_submission[ip_type][ring_index].fence.fence) {
1643 uint32_t expired;
1644 int ret = ac_drm_cs_query_fence_status(
1645 ctx->ws->dev, ctx->ctx_handle, ctx->last_submission[ip_type][ring_index].fence.ip_type,
1646 ctx->last_submission[ip_type][ring_index].fence.ip_instance,
1647 ctx->last_submission[ip_type][ring_index].fence.ring, ctx->last_submission[ip_type][ring_index].fence.fence,
1648 1000000000ull, 0, &expired);
1649
1650 if (ret || !expired)
1651 return false;
1652 }
1653
1654 return true;
1655 }
1656
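/* Translate the winsys stable pstate to the corresponding AMDGPU_CTX_STABLE_PSTATE_* value. */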
1657 static uint32_t
1658 radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
1659 {
1660 switch (radv_pstate) {
1661 case RADEON_CTX_PSTATE_NONE:
1662 return AMDGPU_CTX_STABLE_PSTATE_NONE;
1663 case RADEON_CTX_PSTATE_STANDARD:
1664 return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
1665 case RADEON_CTX_PSTATE_MIN_SCLK:
1666 return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
1667 case RADEON_CTX_PSTATE_MIN_MCLK:
1668 return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
1669 case RADEON_CTX_PSTATE_PEAK:
1670 return AMDGPU_CTX_STABLE_PSTATE_PEAK;
1671 default:
1672 unreachable("Invalid pstate");
1673 }
1674 }
1675
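/* Set the stable pstate of the context, querying the current value first so the SET ioctl is
 * only issued when something actually changes (see the comment below about -EBUSY).
 */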
1676 static int
1677 radv_amdgpu_ctx_set_pstate(struct radeon_winsys_ctx *rwctx, enum radeon_ctx_pstate pstate)
1678 {
1679 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1680 uint32_t new_pstate = radv_to_amdgpu_pstate(pstate);
1681 uint32_t current_pstate = 0;
1682 int r;
1683
1684    r = ac_drm_cs_ctx_stable_pstate(ctx->ws->dev, ctx->ctx_handle, AMDGPU_CTX_OP_GET_STABLE_PSTATE, 0, &current_pstate);
1685 if (r) {
1686 fprintf(stderr, "radv/amdgpu: failed to get current pstate\n");
1687 return r;
1688 }
1689
1690 /* Do not try to set a new pstate when the current one is already what we want. Otherwise, the
1691 * kernel might return -EBUSY if we have multiple AMDGPU contexts in flight.
1692 */
1693 if (current_pstate == new_pstate)
1694 return 0;
1695
1696 r = ac_drm_cs_ctx_stable_pstate(ctx->ws->dev, ctx->ctx_handle, AMDGPU_CTX_OP_SET_STABLE_PSTATE, new_pstate, NULL);
1697 if (r) {
1698 fprintf(stderr, "radv/amdgpu: failed to set new pstate\n");
1699 return r;
1700 }
1701
1702 return 0;
1703 }
1704
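/* Build an AMDGPU_CHUNK_ID_SYNCOBJ_IN/OUT chunk from an array of binary syncobjs, optionally
 * appending the per-queue syncobj. The returned array must stay alive until the CS ioctl has
 * been issued and is freed by the caller.
 */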
1705 static void *
1706 radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1707 struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1708 {
1709 unsigned count = counts->syncobj_count + (queue_syncobj ? 1 : 0);
1710 struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
1711 if (!syncobj)
1712 return NULL;
1713
1714 for (unsigned i = 0; i < counts->syncobj_count; i++) {
1715 struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
1716 sem->handle = counts->syncobj[i];
1717 }
1718
1719 if (queue_syncobj)
1720 syncobj[counts->syncobj_count].handle = queue_syncobj;
1721
1722 chunk->chunk_id = chunk_id;
1723 chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
1724 chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1725 return syncobj;
1726 }
1727
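/* Same as above, but for kernels with timeline syncobj support: binary syncobjs are encoded with
 * point 0, timeline syncobjs carry their point and DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, and
 * the optional per-queue syncobj goes last.
 */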
1728 static void *
1729 radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1730 struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1731 {
1732 uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + (queue_syncobj ? 1 : 0);
1733 struct drm_amdgpu_cs_chunk_syncobj *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
1734 if (!syncobj)
1735 return NULL;
1736
1737 for (unsigned i = 0; i < counts->syncobj_count; i++) {
1738 struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
1739 sem->handle = counts->syncobj[i];
1740 sem->flags = 0;
1741 sem->point = 0;
1742 }
1743
1744 for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
1745 struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
1746 sem->handle = counts->syncobj[i + counts->syncobj_count];
1747 sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
1748 sem->point = counts->points[i];
1749 }
1750
1751 if (queue_syncobj) {
1752 syncobj[count - 1].handle = queue_syncobj;
1753 syncobj[count - 1].flags = 0;
1754 syncobj[count - 1].point = 0;
1755 }
1756
1757 chunk->chunk_id = chunk_id;
1758 chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
1759 chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1760 return syncobj;
1761 }
1762
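/* The multimedia IPs (UVD/VCE/VCN/JPEG) have no user fence, so the fence chunk must be omitted
 * for submissions to them.
 */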
1763 static bool
1764 radv_amdgpu_cs_has_user_fence(struct radv_amdgpu_cs_request *request)
1765 {
1766 return request->ip_type != AMDGPU_HW_IP_UVD && request->ip_type != AMDGPU_HW_IP_VCE &&
1767 request->ip_type != AMDGPU_HW_IP_UVD_ENC && request->ip_type != AMDGPU_HW_IP_VCN_DEC &&
1768 request->ip_type != AMDGPU_HW_IP_VCN_ENC && request->ip_type != AMDGPU_HW_IP_VCN_JPEG;
1769 }
1770
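/* Assemble all CS chunks (IBs, optional user fence, syncobj waits/signals and the inline BO
 * list) and submit them through the raw CS ioctl, retrying on transient -ENOMEM. Kernel errors
 * that indicate a lost context (-ECANCELED, -ENODATA, -ETIME) are mapped to VK_ERROR_DEVICE_LOST.
 */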
1771 static VkResult
1772 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
1773 struct radv_winsys_sem_info *sem_info)
1774 {
1775 int r;
1776 int num_chunks;
1777 int size;
1778 struct drm_amdgpu_cs_chunk *chunks;
1779 struct drm_amdgpu_cs_chunk_data *chunk_data;
1780 struct drm_amdgpu_bo_list_in bo_list_in;
1781 void *wait_syncobj = NULL, *signal_syncobj = NULL;
1782 int i;
1783 VkResult result = VK_SUCCESS;
1784 bool has_user_fence = radv_amdgpu_cs_has_user_fence(request);
1785 uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
1786 bool *queue_syncobj_wait = &ctx->queue_syncobj_wait[request->ip_type][request->ring];
1787
1788 if (!queue_syncobj)
1789 return VK_ERROR_OUT_OF_HOST_MEMORY;
1790
1791 size = request->number_of_ibs + 1 + (has_user_fence ? 1 : 0) + 1 /* bo list */ + 3;
1792
1793 chunks = malloc(sizeof(chunks[0]) * size);
1794 if (!chunks)
1795 return VK_ERROR_OUT_OF_HOST_MEMORY;
1796
1797 size = request->number_of_ibs + (has_user_fence ? 1 : 0);
1798
1799 chunk_data = malloc(sizeof(chunk_data[0]) * size);
1800 if (!chunk_data) {
1801 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1802 goto error_out;
1803 }
1804
1805 num_chunks = request->number_of_ibs;
1806 for (i = 0; i < request->number_of_ibs; i++) {
1807 struct radv_amdgpu_cs_ib_info *ib;
1808 chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
1809 chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1810 chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1811
1812 ib = &request->ibs[i];
1813 assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
1814 assert(ib->size);
1815
1816 chunk_data[i].ib_data._pad = 0;
1817 chunk_data[i].ib_data.va_start = ib->ib_mc_address;
1818 chunk_data[i].ib_data.ib_bytes = ib->size * 4;
1819 chunk_data[i].ib_data.ip_type = ib->ip_type;
1820 chunk_data[i].ib_data.ip_instance = request->ip_instance;
1821 chunk_data[i].ib_data.ring = request->ring;
1822 chunk_data[i].ib_data.flags = ib->flags;
1823 }
1824
1825 assert(chunk_data[request->number_of_ibs - 1].ib_data.ip_type == request->ip_type);
1826
1827 if (has_user_fence) {
1828 i = num_chunks++;
1829 chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1830 chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1831 chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1832
1833 /* Need to reserve 4 QWORD for user fence:
1834 * QWORD[0]: completed fence
1835 * QWORD[1]: preempted fence
1836 * QWORD[2]: reset fence
1837 * QWORD[3]: preempted then reset
1838 */
1839 uint32_t offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * 4;
1840 ac_drm_cs_chunk_fence_info_to_data(radv_amdgpu_winsys_bo(ctx->fence_bo)->bo_handle, offset, &chunk_data[i]);
1841 }
1842
1843 if (sem_info->cs_emit_wait &&
1844 (sem_info->wait.timeline_syncobj_count || sem_info->wait.syncobj_count || *queue_syncobj_wait)) {
1845
1846 if (ctx->ws->info.has_timeline_syncobj) {
1847 wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1848 AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
1849 } else {
1850 wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1851 AMDGPU_CHUNK_ID_SYNCOBJ_IN);
1852 }
1853 if (!wait_syncobj) {
1854 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1855 goto error_out;
1856 }
1857 num_chunks++;
1858
1859 sem_info->cs_emit_wait = false;
1860 *queue_syncobj_wait = false;
1861 }
1862
1863 if (sem_info->cs_emit_signal) {
1864 if (ctx->ws->info.has_timeline_syncobj) {
1865 signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
1866 &sem_info->signal, queue_syncobj, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
1867 } else {
1868 signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, queue_syncobj, &chunks[num_chunks],
1869 AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
1870 }
1871 if (!signal_syncobj) {
1872 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1873 goto error_out;
1874 }
1875 num_chunks++;
1876 }
1877
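   /* Pass the BO list inline with this submission; ~0 means no pre-created BO list handle is
    * used.
    */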
1878 bo_list_in.operation = ~0;
1879 bo_list_in.list_handle = ~0;
1880 bo_list_in.bo_number = request->num_handles;
1881 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1882 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;
1883
1884 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1885 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1886 chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1887 num_chunks++;
1888
1889    /* The kernel quite often returns -ENOMEM when many processes submit in parallel and use GDS
1890     * (this happens frequently with dEQP and NGG streamout), but the submission eventually
1891     * succeeds after enough attempts, so retry for up to one second.
1892     */
1893 uint64_t abs_timeout_ns = os_time_get_absolute_timeout(1000000000ull); /* 1s */
1894
1895 r = 0;
1896 do {
1897 /* Wait 1 ms and try again. */
1898 if (r == -ENOMEM)
1899 os_time_sleep(1000);
1900
1901 r = ac_drm_cs_submit_raw2(ctx->ws->dev, ctx->ctx_handle, 0, num_chunks, chunks, &request->seq_no);
1902 } while (r == -ENOMEM && os_time_get_nano() < abs_timeout_ns);
1903
1904 if (r) {
1905 if (r == -ENOMEM) {
1906 fprintf(stderr, "radv/amdgpu: Not enough memory for command submission.\n");
1907 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1908 } else if (r == -ECANCELED) {
1909 fprintf(stderr,
1910 "radv/amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1911 result = VK_ERROR_DEVICE_LOST;
1912 } else if (r == -ENODATA) {
1913 fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1914 "of a soft recovery.\n");
1915 result = VK_ERROR_DEVICE_LOST;
1916 } else if (r == -ETIME) {
1917 fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1918 "of a hard recovery.\n");
1919 result = VK_ERROR_DEVICE_LOST;
1920 } else {
1921 fprintf(stderr,
1922 "radv/amdgpu: The CS has been rejected, "
1923 "see dmesg for more information (%i).\n",
1924 r);
1925 result = VK_ERROR_UNKNOWN;
1926 }
1927 }
1928
1929 error_out:
1930 free(chunks);
1931 free(chunk_data);
1932 free(wait_syncobj);
1933 free(signal_syncobj);
1934 return result;
1935 }
1936
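/* Plug the amdgpu command submission and context implementation into the winsys vtable. */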
1937 void
1938 radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
1939 {
1940 ws->base.ctx_create = radv_amdgpu_ctx_create;
1941 ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
1942 ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
1943 ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
1944 ws->base.cs_domain = radv_amdgpu_cs_domain;
1945 ws->base.cs_create = radv_amdgpu_cs_create;
1946 ws->base.cs_destroy = radv_amdgpu_cs_destroy;
1947 ws->base.cs_grow = radv_amdgpu_cs_grow;
1948 ws->base.cs_finalize = radv_amdgpu_cs_finalize;
1949 ws->base.cs_reset = radv_amdgpu_cs_reset;
1950 ws->base.cs_chain = radv_amdgpu_cs_chain;
1951 ws->base.cs_unchain = radv_amdgpu_cs_unchain;
1952 ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
1953 ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
1954 ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
1955 ws->base.cs_chain_dgc_ib = radv_amdgpu_cs_chain_dgc_ib;
1956 ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
1957 ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
1958 ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
1959 ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
1960 }
1961