1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <amdgpu.h>
9 #include <assert.h>
10 #include <libsync.h>
11 #include <pthread.h>
12 #include <stdlib.h>
13 #include "drm-uapi/amdgpu_drm.h"
14 
15 #include "util/detect_os.h"
16 #include "util/os_time.h"
17 #include "util/u_memory.h"
18 #include "ac_debug.h"
19 #include "ac_linux_drm.h"
20 #include "radv_amdgpu_bo.h"
21 #include "radv_amdgpu_cs.h"
22 #include "radv_amdgpu_winsys.h"
23 #include "radv_debug.h"
24 #include "radv_radeon_winsys.h"
25 #include "sid.h"
26 #include "vk_alloc.h"
27 #include "vk_drm_syncobj.h"
28 #include "vk_sync.h"
29 #include "vk_sync_dummy.h"
30 
31 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
32  * codes in the kernel).
33  */
34 #if DETECT_OS_OPENBSD
35 #define ENODATA ENOTSUP
36 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
37 #define ENODATA ECONNREFUSED
38 #endif
39 
40 /* Maximum allowed total number of submitted IBs. */
41 #define RADV_MAX_IBS_PER_SUBMIT 192
42 
43 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
44 
45 struct radv_amdgpu_ib {
46    struct radeon_winsys_bo *bo; /* NULL when not owned by the current CS object */
47    uint64_t va;
48    unsigned cdw;
49 };
50 
51 struct radv_amdgpu_cs_ib_info {
52    int64_t flags;
53    uint64_t ib_mc_address;
54    uint32_t size;
55    enum amd_ip_type ip_type;
56 };
57 
58 struct radv_amdgpu_cs {
59    struct radeon_cmdbuf base;
60    struct radv_amdgpu_winsys *ws;
61 
62    struct radv_amdgpu_cs_ib_info ib;
63 
64    struct radeon_winsys_bo *ib_buffer;
65    uint8_t *ib_mapped;
66    unsigned max_num_buffers;
67    unsigned num_buffers;
68    struct drm_amdgpu_bo_list_entry *handles;
69 
70    struct radv_amdgpu_ib *ib_buffers;
71    unsigned num_ib_buffers;
72    unsigned max_num_ib_buffers;
73    unsigned *ib_size_ptr;
74    VkResult status;
75    struct radv_amdgpu_cs *chained_to;
76    bool use_ib;
77    bool is_secondary;
78 
79    int buffer_hash_table[1024];
80    unsigned hw_ip;
81 
82    unsigned num_virtual_buffers;
83    unsigned max_num_virtual_buffers;
84    struct radeon_winsys_bo **virtual_buffers;
85    int *virtual_buffer_hash_table;
86 
87    struct hash_table *annotations;
88 };
89 
90 struct radv_winsys_sem_counts {
91    uint32_t syncobj_count;
92    uint32_t timeline_syncobj_count;
93    uint32_t *syncobj;
94    uint64_t *points;
95 };
96 
97 struct radv_winsys_sem_info {
98    bool cs_emit_signal;
99    bool cs_emit_wait;
100    struct radv_winsys_sem_counts wait;
101    struct radv_winsys_sem_counts signal;
102 };
103 
104 static void
105 radeon_emit_unchecked(struct radeon_cmdbuf *cs, uint32_t value)
106 {
107    cs->buf[cs->cdw++] = value;
108 }
109 
110 static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring);
111 
112 static inline struct radv_amdgpu_cs *
113 radv_amdgpu_cs(struct radeon_cmdbuf *base)
114 {
115    return (struct radv_amdgpu_cs *)base;
116 }
117 
118 static bool
119 ring_can_use_ib_bos(const struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
120 {
121    return ws->use_ib_bos && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
122 }
123 
124 struct radv_amdgpu_cs_request {
125    /** Specify HW IP block type to which to send the IB. */
126    unsigned ip_type;
127 
128    /** IP instance index if there are several IPs of the same type. */
129    unsigned ip_instance;
130 
131    /**
132     * Specify the ring index of the IP. There can be several rings
133     * for the same IP, e.g. 0 for SDMA0 and 1 for SDMA1.
134     */
135    uint32_t ring;
136 
137    /**
138     * BO list handles used by this request.
139     */
140    struct drm_amdgpu_bo_list_entry *handles;
141    uint32_t num_handles;
142 
143    /** Number of IBs to submit in the field ibs. */
144    uint32_t number_of_ibs;
145 
146    /**
147     * IBs to submit. These IBs will be submitted together as a single entity.
148     */
149    struct radv_amdgpu_cs_ib_info *ibs;
150 
151    /**
152     * The returned sequence number for the command submission
153     */
154    uint64_t seq_no;
155 };
156 
157 static VkResult radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
158                                       struct radv_winsys_sem_info *sem_info);
159 
160 static void
161 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
162                              struct radv_amdgpu_cs_request *req)
163 {
164    fence->fence.ip_type = req->ip_type;
165    fence->fence.ip_instance = req->ip_instance;
166    fence->fence.ring = req->ring;
167    fence->fence.fence = req->seq_no;
168 }
169 
170 static struct radv_amdgpu_cs_ib_info
171 radv_amdgpu_cs_ib_to_info(struct radv_amdgpu_cs *cs, struct radv_amdgpu_ib ib)
172 {
173    struct radv_amdgpu_cs_ib_info info = {
174       .flags = 0,
175       .ip_type = cs->hw_ip,
176       .ib_mc_address = ib.va,
177       .size = ib.cdw,
178    };
179    return info;
180 }
181 
182 static void
183 radv_amdgpu_cs_free_annotation(struct hash_entry *entry)
184 {
185    free(entry->data);
186 }
187 
188 static void
189 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
190 {
191    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
192 
193    _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
194 
195    if (cs->ib_buffer)
196       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
197 
198    for (unsigned i = 0; i < cs->num_ib_buffers; ++i)
199       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
200 
201    free(cs->ib_buffers);
202    free(cs->virtual_buffers);
203    free(cs->virtual_buffer_hash_table);
204    free(cs->handles);
205    free(cs);
206 }
207 
208 static void
209 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum amd_ip_type ip_type)
210 {
211    for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
212       cs->buffer_hash_table[i] = -1;
213 
214    cs->hw_ip = ip_type;
215 }
216 
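/* Pick the memory domain for CS buffers: prefer CPU-visible VRAM (SAM) when
 * enough visible VRAM and PCIe bandwidth are available, otherwise fall back
 * to GTT. For reference, PCIe 3.0 x8 is roughly 0.985 GB/s per lane * 8
 * lanes, which is the 8 * 0.985 * 1024 MB/s threshold used below.
 */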
217 static enum radeon_bo_domain
218 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
219 {
220    const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
221 
222    bool enough_vram = ws->info.all_vram_visible ||
223                       p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= (uint64_t)ws->info.vram_vis_size_kb * 1024;
224 
225    /* Bandwidth should be equivalent to at least PCIe 3.0 x8.
226     * If there is no PCIe info, assume there is enough bandwidth.
227     */
228    bool enough_bandwidth = !ws->info.has_pcie_bandwidth_info || ws->info.pcie_bandwidth_mbps >= 8 * 0.985 * 1024;
229 
230    bool use_sam =
231       (enough_vram && enough_bandwidth && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
232       (ws->perftest & RADV_PERFTEST_SAM);
233    return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
234 }
235 
236 static VkResult
237 radv_amdgpu_cs_bo_create(struct radv_amdgpu_cs *cs, uint32_t ib_size)
238 {
239    struct radeon_winsys *ws = &cs->ws->base;
240 
241    /* Avoid memcpy from VRAM when a secondary cmdbuf can't always rely on IB2. */
242    const bool can_always_use_ib2 = cs->ws->info.gfx_level >= GFX8 && cs->hw_ip == AMD_IP_GFX;
243    const bool avoid_vram = cs->is_secondary && !can_always_use_ib2;
244    const enum radeon_bo_domain domain = avoid_vram ? RADEON_DOMAIN_GTT : radv_amdgpu_cs_domain(ws);
245    const enum radeon_bo_flag gtt_wc_flag = avoid_vram ? 0 : RADEON_FLAG_GTT_WC;
246    const enum radeon_bo_flag flags =
247       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
248 
249    return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
250                             &cs->ib_buffer);
251 }
252 
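/* Allocate and map a fresh IB buffer for the CS and reset the write position.
 * When IB chaining is used, the IB size pointer is redirected to the new IB's
 * size field so it can be patched later.
 */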
253 static VkResult
254 radv_amdgpu_cs_get_new_ib(struct radeon_cmdbuf *_cs, uint32_t ib_size)
255 {
256    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
257    VkResult result;
258 
259    result = radv_amdgpu_cs_bo_create(cs, ib_size);
260    if (result != VK_SUCCESS)
261       return result;
262 
263    cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
264    if (!cs->ib_mapped) {
265       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
266       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
267    }
268 
269    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
270    cs->base.buf = (uint32_t *)cs->ib_mapped;
271    cs->base.cdw = 0;
272    cs->base.reserved_dw = 0;
273    cs->base.max_dw = ib_size / 4 - 4;
274    cs->ib.size = 0;
275    cs->ib.ip_type = cs->hw_ip;
276 
277    if (cs->use_ib)
278       cs->ib_size_ptr = &cs->ib.size;
279 
280    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
281 
282    return VK_SUCCESS;
283 }
284 
285 static unsigned
286 radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
287 {
288    const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
289    assert(util_is_power_of_two_nonzero(ib_alignment));
290    return align(20 * 1024 * 4, ib_alignment);
291 }
292 
293 static struct radeon_cmdbuf *
294 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, bool is_secondary)
295 {
296    struct radv_amdgpu_cs *cs;
297    uint32_t ib_size = radv_amdgpu_cs_get_initial_size(radv_amdgpu_winsys(ws), ip_type);
298 
299    cs = calloc(1, sizeof(struct radv_amdgpu_cs));
300    if (!cs)
301       return NULL;
302 
303    cs->is_secondary = is_secondary;
304    cs->ws = radv_amdgpu_winsys(ws);
305    radv_amdgpu_init_cs(cs, ip_type);
306 
307    cs->use_ib = ring_can_use_ib_bos(cs->ws, ip_type);
308 
309    VkResult result = radv_amdgpu_cs_get_new_ib(&cs->base, ib_size);
310    if (result != VK_SUCCESS) {
311       free(cs);
312       return NULL;
313    }
314 
315    return &cs->base;
316 }
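/* Illustrative usage sketch (not part of this file): callers drive these
 * entry points through the radeon_winsys vtable. The member names below are
 * assumed from the pattern used elsewhere in this file (cs_add_buffer,
 * cs_finalize, ...); the real call sites live in the RADV queue code.
 *
 *    struct radeon_cmdbuf *cs = ws->cs_create(ws, AMD_IP_GFX, false);
 *    if (cs) {
 *       ws->cs_add_buffer(cs, bo);          // track a BO referenced by packets
 *       // ... emit packets into cs->buf ...
 *       VkResult result = ws->cs_finalize(cs);
 *       ws->cs_destroy(cs);
 *    }
 */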
317 
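/* Return the padding/NOP dword for the CS's HW IP type. VCN encode has no
 * NOP packet, so 0 is returned and callers skip padding entirely.
 */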
318 static uint32_t
319 get_nop_packet(struct radv_amdgpu_cs *cs)
320 {
321    switch (cs->hw_ip) {
322    case AMDGPU_HW_IP_GFX:
323    case AMDGPU_HW_IP_COMPUTE:
324       return cs->ws->info.gfx_ib_pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
325    case AMDGPU_HW_IP_DMA:
326       return cs->ws->info.gfx_level == GFX6 ? 0xF0000000 : SDMA_NOP_PAD;
327    case AMDGPU_HW_IP_UVD:
328    case AMDGPU_HW_IP_UVD_ENC:
329       return PKT2_NOP_PAD;
330    case AMDGPU_HW_IP_VCN_DEC:
331       return 0x81FF;
332    case AMDGPU_HW_IP_VCN_ENC:
333       return 0; /* NOPs are illegal in encode, so don't pad */
334    default:
335       unreachable("Unknown IP type");
336    }
337 }
338 
339 static void
340 radv_amdgpu_cs_add_ib_buffer(struct radv_amdgpu_cs *cs, struct radeon_winsys_bo *bo, uint64_t va, uint32_t cdw)
341 {
342    if (cs->num_ib_buffers == cs->max_num_ib_buffers) {
343       unsigned max_num_ib_buffers = MAX2(1, cs->max_num_ib_buffers * 2);
344       struct radv_amdgpu_ib *ib_buffers = realloc(cs->ib_buffers, max_num_ib_buffers * sizeof(*ib_buffers));
345       if (!ib_buffers) {
346          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
347          return;
348       }
349       cs->max_num_ib_buffers = max_num_ib_buffers;
350       cs->ib_buffers = ib_buffers;
351    }
352 
353    cs->ib_buffers[cs->num_ib_buffers].bo = bo;
354    cs->ib_buffers[cs->num_ib_buffers].va = va;
355    cs->ib_buffers[cs->num_ib_buffers++].cdw = cdw;
356 }
357 
358 static void
359 radv_amdgpu_restore_last_ib(struct radv_amdgpu_cs *cs)
360 {
361    struct radv_amdgpu_ib *ib = &cs->ib_buffers[--cs->num_ib_buffers];
362    assert(ib->bo);
363    cs->ib_buffer = ib->bo;
364 }
365 
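/* Called when the current IB is full: finalize it, allocate a larger IB
 * buffer and, when IB chaining is supported, patch the 4 reserved trailing
 * dwords with an INDIRECT_BUFFER packet that chains into the new IB.
 */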
366 static void
367 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
368 {
369    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
370 
371    if (cs->status != VK_SUCCESS) {
372       cs->base.cdw = 0;
373       return;
374    }
375 
376    const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
377 
378    cs->ws->base.cs_finalize(_cs);
379 
380    uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
381 
382    ib_size = align(MIN2(ib_size, ~C_3F2_IB_SIZE), ib_alignment);
383 
384    VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
385 
386    if (result != VK_SUCCESS) {
387       cs->base.cdw = 0;
388       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
389       radv_amdgpu_restore_last_ib(cs);
390    }
391 
392    cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
393    if (!cs->ib_mapped) {
394       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
395       cs->base.cdw = 0;
396 
397       /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
398       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
399       radv_amdgpu_restore_last_ib(cs);
400    }
401 
402    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
403 
404    if (cs->use_ib) {
405       cs->base.buf[cs->base.cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
406       cs->base.buf[cs->base.cdw - 3] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
407       cs->base.buf[cs->base.cdw - 2] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
408       cs->base.buf[cs->base.cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
409 
410       cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
411    }
412 
413    cs->base.buf = (uint32_t *)cs->ib_mapped;
414    cs->base.cdw = 0;
415    cs->base.reserved_dw = 0;
416    cs->base.max_dw = ib_size / 4 - 4;
417 }
418 
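/* Pad the CS with NOPs so that (cdw + leave_dw_space) satisfies the IP's
 * IB size alignment requirement (ib_pad_dw_mask).
 */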
419 static void
420 radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
421 {
422    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
423    const enum amd_ip_type ip_type = cs->hw_ip;
424    const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
425    const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;
426 
427    if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
428       if (unaligned_dw) {
429          const int remaining = pad_dw_mask + 1 - unaligned_dw;
430 
431          /* Only pad by 1 dword with the type-2 NOP if necessary. */
432          if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) {
433             radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
434          } else {
435             /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
436              * packet. The size of the packet body after the header is always count + 1.
437              * If count == -1, there is no packet body. NOP is the only packet that can have
438              * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
439              */
440             radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0));
441             cs->base.cdw += remaining - 1;
442          }
443       }
444    } else {
445       /* Don't pad on VCN encode/unified because NOPs are not supported. */
446       if (ip_type == AMDGPU_HW_IP_VCN_ENC)
447          return;
448 
449       /* Don't add padding to zero-length UVD IBs; the kernel can't handle it. */
450       if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
451          return;
452 
453       const uint32_t nop_packet = get_nop_packet(cs);
454 
455       while (!cs->base.cdw || (cs->base.cdw & pad_dw_mask))
456          radeon_emit_unchecked(&cs->base, nop_packet);
457    }
458 
459    assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
460 }
461 
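/* End recording: pad the IB and, for chained IBs, emit 4 trailing NOPs that
 * can later be overwritten with an INDIRECT_BUFFER chain packet. The finished
 * IB is appended to ib_buffers. Returns the CS's accumulated error status.
 */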
462 static VkResult
463 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
464 {
465    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
466 
467    assert(cs->base.cdw <= cs->base.reserved_dw);
468 
469    if (cs->use_ib) {
470       const uint32_t nop_packet = get_nop_packet(cs);
471 
472       /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
473       radv_amdgpu_winsys_cs_pad(_cs, 4);
474 
475       radeon_emit_unchecked(&cs->base, nop_packet);
476       radeon_emit_unchecked(&cs->base, nop_packet);
477       radeon_emit_unchecked(&cs->base, nop_packet);
478       radeon_emit_unchecked(&cs->base, nop_packet);
479 
480       assert(cs->base.cdw <= ~C_3F2_IB_SIZE);
481       *cs->ib_size_ptr |= cs->base.cdw;
482    } else {
483       radv_amdgpu_winsys_cs_pad(_cs, 0);
484    }
485 
486    /* Append the current (last) IB to the array of IB buffers. */
487    radv_amdgpu_cs_add_ib_buffer(cs, cs->ib_buffer, cs->ib_buffer->va,
488                                 cs->use_ib ? G_3F2_IB_SIZE(*cs->ib_size_ptr) : cs->base.cdw);
489 
490    /* Prevent freeing this BO twice. */
491    cs->ib_buffer = NULL;
492 
493    cs->chained_to = NULL;
494 
495    assert(cs->base.cdw <= cs->base.max_dw + 4);
496 
497    return cs->status;
498 }
499 
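/* Reset the CS for reuse: clear the BO lists and hash tables, destroy the
 * retired IB buffers and restart recording in the remaining IB buffer.
 */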
500 static void
501 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
502 {
503    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
504    cs->base.cdw = 0;
505    cs->base.reserved_dw = 0;
506    cs->status = VK_SUCCESS;
507 
508    for (unsigned i = 0; i < cs->num_buffers; ++i) {
509       unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
510       cs->buffer_hash_table[hash] = -1;
511    }
512 
513    for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
514       unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
515       cs->virtual_buffer_hash_table[hash] = -1;
516    }
517 
518    cs->num_buffers = 0;
519    cs->num_virtual_buffers = 0;
520 
521    /* When the CS is finalized and IBs are not allowed, use last IB. */
522    assert(cs->ib_buffer || cs->num_ib_buffers);
523    if (!cs->ib_buffer)
524       radv_amdgpu_restore_last_ib(cs);
525 
526    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
527 
528    for (unsigned i = 0; i < cs->num_ib_buffers; ++i)
529       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
530 
531    cs->num_ib_buffers = 0;
532    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
533 
534    cs->ib.size = 0;
535 
536    if (cs->use_ib)
537       cs->ib_size_ptr = &cs->ib.size;
538 
539    _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
540    cs->annotations = NULL;
541 }
542 
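/* Undo a previous radv_amdgpu_cs_chain() by overwriting the trailing
 * INDIRECT_BUFFER packet with NOPs, so the CS can be submitted on its own.
 */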
543 static void
544 radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
545 {
546    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
547 
548    if (!acs->chained_to)
549       return;
550 
551    assert(cs->cdw <= cs->max_dw + 4);
552 
553    acs->chained_to = NULL;
554    cs->buf[cs->cdw - 4] = PKT3_NOP_PAD;
555    cs->buf[cs->cdw - 3] = PKT3_NOP_PAD;
556    cs->buf[cs->cdw - 2] = PKT3_NOP_PAD;
557    cs->buf[cs->cdw - 1] = PKT3_NOP_PAD;
558 }
559 
560 static bool
561 radv_amdgpu_cs_chain(struct radeon_cmdbuf *cs, struct radeon_cmdbuf *next_cs, bool pre_ena)
562 {
563    /* Chains together two CS (command stream) objects by editing
564     * the end of the first CS to add a command that jumps to the
565     * second CS.
566     *
567     * After this, it is enough to submit the first CS to the GPU
568     * and not necessary to submit the second CS because it is already
569     * executed by the first.
570     */
571 
572    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
573    struct radv_amdgpu_cs *next_acs = radv_amdgpu_cs(next_cs);
574 
575    /* Only some HW IP types have packets that we can use for chaining. */
576    if (!acs->use_ib)
577       return false;
578 
579    assert(cs->cdw <= cs->max_dw + 4);
580 
581    acs->chained_to = next_acs;
582 
583    cs->buf[cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
584    cs->buf[cs->cdw - 3] = next_acs->ib.ib_mc_address;
585    cs->buf[cs->cdw - 2] = next_acs->ib.ib_mc_address >> 32;
586    cs->buf[cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(pre_ena) | next_acs->ib.size;
587 
588    return true;
589 }
590 
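/* Return the index of a BO handle in the CS buffer list, or -1 if absent.
 * A small direct-mapped hash table accelerates the common case; collisions
 * fall back to a linear search that also refreshes the hash entry.
 */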
591 static int
592 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
593 {
594    unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
595    int index = cs->buffer_hash_table[hash];
596 
597    if (index == -1)
598       return -1;
599 
600    if (cs->handles[index].bo_handle == bo)
601       return index;
602 
603    for (unsigned i = 0; i < cs->num_buffers; ++i) {
604       if (cs->handles[i].bo_handle == bo) {
605          cs->buffer_hash_table[hash] = i;
606          return i;
607       }
608    }
609 
610    return -1;
611 }
612 
613 static void
614 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
615 {
616    unsigned hash;
617    int index = radv_amdgpu_cs_find_buffer(cs, bo);
618 
619    if (index != -1)
620       return;
621 
622    if (cs->num_buffers == cs->max_num_buffers) {
623       unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
624       struct drm_amdgpu_bo_list_entry *new_entries =
625          realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
626       if (new_entries) {
627          cs->max_num_buffers = new_count;
628          cs->handles = new_entries;
629       } else {
630          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
631          return;
632       }
633    }
634 
635    cs->handles[cs->num_buffers].bo_handle = bo;
636    cs->handles[cs->num_buffers].bo_priority = priority;
637 
638    hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
639    cs->buffer_hash_table[hash] = cs->num_buffers;
640 
641    ++cs->num_buffers;
642 }
643 
644 static void
645 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
646 {
647    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
648    unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
649 
650    if (!cs->virtual_buffer_hash_table) {
651       int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
652       if (!virtual_buffer_hash_table) {
653          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
654          return;
655       }
656       cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
657 
658       for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
659          cs->virtual_buffer_hash_table[i] = -1;
660    }
661 
662    if (cs->virtual_buffer_hash_table[hash] >= 0) {
663       int idx = cs->virtual_buffer_hash_table[hash];
664       if (cs->virtual_buffers[idx] == bo) {
665          return;
666       }
667       for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
668          if (cs->virtual_buffers[i] == bo) {
669             cs->virtual_buffer_hash_table[hash] = i;
670             return;
671          }
672       }
673    }
674 
675    if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
676       unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
677       struct radeon_winsys_bo **virtual_buffers =
678          realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
679       if (!virtual_buffers) {
680          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
681          return;
682       }
683       cs->max_num_virtual_buffers = max_num_virtual_buffers;
684       cs->virtual_buffers = virtual_buffers;
685    }
686 
687    cs->virtual_buffers[cs->num_virtual_buffers] = bo;
688 
689    cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
690    ++cs->num_virtual_buffers;
691 }
692 
693 static void
694 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
695 {
696    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
697    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
698 
699    if (cs->status != VK_SUCCESS)
700       return;
701 
702    if (bo->is_virtual) {
703       radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
704       return;
705    }
706 
707    radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
708 }
709 
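/* Execute a secondary CS from a primary one: inherit the secondary's BOs,
 * then either launch it as an IB2 (GFX with IB support) or copy its IB
 * contents into the primary CS.
 */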
710 static void
711 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, bool allow_ib2)
712 {
713    struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
714    struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
715    struct radv_amdgpu_winsys *ws = parent->ws;
716    const bool use_ib2 = parent->use_ib && !parent->is_secondary && allow_ib2 && parent->hw_ip == AMD_IP_GFX;
717 
718    if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
719       return;
720 
721    for (unsigned i = 0; i < child->num_buffers; ++i) {
722       radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle, child->handles[i].bo_priority);
723    }
724 
725    for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
726       radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
727    }
728 
729    if (use_ib2) {
730       if (parent->base.cdw + 4 > parent->base.max_dw)
731          radv_amdgpu_cs_grow(&parent->base, 4);
732 
733       parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + 4);
734 
735       /* Not setting the CHAIN bit will launch an IB2. */
736       radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
737       radeon_emit(&parent->base, child->ib.ib_mc_address);
738       radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
739       radeon_emit(&parent->base, child->ib.size);
740    } else {
741       assert(parent->use_ib == child->use_ib);
742 
743       /* Grow the current CS and copy the contents of the secondary CS. */
744       for (unsigned i = 0; i < child->num_ib_buffers; i++) {
745          struct radv_amdgpu_ib *ib = &child->ib_buffers[i];
746          uint32_t cdw = ib->cdw;
747          uint8_t *mapped;
748 
749          /* Do not copy the original chain link for IBs. */
750          if (child->use_ib)
751             cdw -= 4;
752 
753          assert(ib->bo);
754 
755          if (parent->base.cdw + cdw > parent->base.max_dw)
756             radv_amdgpu_cs_grow(&parent->base, cdw);
757 
758          parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + cdw);
759 
760          mapped = radv_buffer_map(&ws->base, ib->bo);
761          if (!mapped) {
762             parent->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
763             return;
764          }
765 
766          memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * cdw);
767          parent->base.cdw += cdw;
768       }
769    }
770 }
771 
772 static void
773 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, uint64_t va, const uint32_t cdw,
774                           const bool predicate)
775 {
776    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
777    const uint64_t ib_va = bo ? bo->va : va;
778 
779    if (cs->status != VK_SUCCESS)
780       return;
781 
782    assert(ib_va && ib_va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
783    assert(cs->hw_ip == AMD_IP_GFX && cdw <= ~C_3F2_IB_SIZE);
784 
785    radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
786    radeon_emit(&cs->base, ib_va);
787    radeon_emit(&cs->base, ib_va >> 32);
788    radeon_emit(&cs->base, cdw);
789 }
790 
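/* Execute a DGC (device generated commands) IB. On GFX it is launched as an
 * IB2. On other IPs, the current IB chains into the DGC IB, and a WRITE_DATA
 * packet patches the DGC trailer so it chains back into a freshly allocated
 * IB where recording continues.
 */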
791 static void
792 radv_amdgpu_cs_chain_dgc_ib(struct radeon_cmdbuf *_cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
793                             const bool predicate)
794 {
795    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
796 
797    if (cs->status != VK_SUCCESS)
798       return;
799 
800    assert(cs->ws->info.gfx_level >= GFX8);
801 
802    if (cs->hw_ip == AMD_IP_GFX) {
803       /* Use IB2 for executing DGC CS on GFX. */
804       cs->ws->base.cs_execute_ib(_cs, NULL, va, cdw, predicate);
805    } else {
806       assert(va && va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
807       assert(cdw <= ~C_3F2_IB_SIZE);
808 
809       /* Emit a WRITE_DATA packet to patch the DGC CS. */
810       const uint32_t chain_data[] = {
811          PKT3(PKT3_INDIRECT_BUFFER, 2, 0),
812          0,
813          0,
814          S_3F2_CHAIN(1) | S_3F2_VALID(1),
815       };
816 
817       radeon_emit(&cs->base, PKT3(PKT3_WRITE_DATA, 2 + ARRAY_SIZE(chain_data), false));
818       radeon_emit(&cs->base, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
819       radeon_emit(&cs->base, trailer_va);
820       radeon_emit(&cs->base, trailer_va >> 32);
821       radeon_emit_array(&cs->base, chain_data, ARRAY_SIZE(chain_data));
822 
823       /* Keep pointers for patching later. */
824       uint64_t *ib_va_ptr = (uint64_t *)(cs->base.buf + cs->base.cdw - 3);
825       uint32_t *ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
826 
827       /* Writeback L2 because CP isn't coherent with L2 on GFX6-8. */
828       if (cs->ws->info.gfx_level == GFX8) {
829          radeon_emit(&cs->base, PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(1));
830          radeon_emit(&cs->base, S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
831          radeon_emit(&cs->base, 0xffffffff);
832          radeon_emit(&cs->base, 0xff);
833          radeon_emit(&cs->base, 0);
834          radeon_emit(&cs->base, 0);
835          radeon_emit(&cs->base, 0x0000000A);
836       }
837 
838       /* Finalize the current CS. */
839       cs->ws->base.cs_finalize(_cs);
840 
841       /* Chain the current CS to the DGC CS. */
842       _cs->buf[_cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
843       _cs->buf[_cs->cdw - 3] = va;
844       _cs->buf[_cs->cdw - 2] = va >> 32;
845       _cs->buf[_cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | cdw;
846 
847       /* Allocate a new CS BO with initial size. */
848       const uint64_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
849 
850       VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
851       if (result != VK_SUCCESS) {
852          cs->base.cdw = 0;
853          cs->status = result;
854          return;
855       }
856 
857       cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
858       if (!cs->ib_mapped) {
859          cs->base.cdw = 0;
860          cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
861          return;
862       }
863 
864       cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
865 
866       /* Chain back the trailer (DGC CS) to the newly created one. */
867       *ib_va_ptr = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
868       cs->ib_size_ptr = ib_size_ptr;
869 
870       cs->base.buf = (uint32_t *)cs->ib_mapped;
871       cs->base.cdw = 0;
872       cs->base.reserved_dw = 0;
873       cs->base.max_dw = ib_size / 4 - 4;
874    }
875 }
876 
877 static unsigned
878 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
879 {
880    unsigned num_bo = 0;
881 
882    for (struct radv_amdgpu_cs *cs = start_cs; cs; cs = cs->chained_to) {
883       num_bo += cs->num_buffers;
884       for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
885          num_bo += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
886    }
887 
888    return num_bo;
889 }
890 
891 static unsigned
892 radv_amdgpu_count_cs_array_bo(struct radeon_cmdbuf **cs_array, unsigned num_cs)
893 {
894    unsigned num_bo = 0;
895 
896    for (unsigned i = 0; i < num_cs; ++i) {
897       num_bo += radv_amdgpu_count_cs_bo(radv_amdgpu_cs(cs_array[i]));
898    }
899 
900    return num_bo;
901 }
902 
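/* Merge the BOs referenced by one CS into the submission BO list, skipping
 * duplicates and expanding virtual (sparse) BOs into their backing BOs.
 * Returns the updated number of handles.
 */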
903 static unsigned
904 radv_amdgpu_add_cs_to_bo_list(struct radv_amdgpu_cs *cs, struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
905 {
906    if (!cs->num_buffers)
907       return num_handles;
908 
909    if (num_handles == 0 && !cs->num_virtual_buffers) {
910       memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
911       return cs->num_buffers;
912    }
913 
914    int unique_bo_so_far = num_handles;
915    for (unsigned j = 0; j < cs->num_buffers; ++j) {
916       bool found = false;
917       for (unsigned k = 0; k < unique_bo_so_far; ++k) {
918          if (handles[k].bo_handle == cs->handles[j].bo_handle) {
919             found = true;
920             break;
921          }
922       }
923       if (!found) {
924          handles[num_handles] = cs->handles[j];
925          ++num_handles;
926       }
927    }
928    for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
929       struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
930       u_rwlock_rdlock(&virtual_bo->lock);
931       for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
932          struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
933          bool found = false;
934          for (unsigned m = 0; m < num_handles; ++m) {
935             if (handles[m].bo_handle == bo->bo_handle) {
936                found = true;
937                break;
938             }
939          }
940          if (!found) {
941             handles[num_handles].bo_handle = bo->bo_handle;
942             handles[num_handles].bo_priority = bo->priority;
943             ++num_handles;
944          }
945       }
946       u_rwlock_rdunlock(&virtual_bo->lock);
947    }
948 
949    return num_handles;
950 }
951 
952 static unsigned
953 radv_amdgpu_add_cs_array_to_bo_list(struct radeon_cmdbuf **cs_array, unsigned num_cs,
954                                     struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
955 {
956    for (unsigned i = 0; i < num_cs; ++i) {
957       for (struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); cs; cs = cs->chained_to) {
958          num_handles = radv_amdgpu_add_cs_to_bo_list(cs, handles, num_handles);
959       }
960    }
961 
962    return num_handles;
963 }
964 
965 static unsigned
966 radv_amdgpu_copy_global_bo_list(struct radv_amdgpu_winsys *ws, struct drm_amdgpu_bo_list_entry *handles)
967 {
968    for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
969       handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
970       handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
971    }
972 
973    return ws->global_bo_list.count;
974 }
975 
976 static VkResult
977 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array, unsigned count,
978                         struct radeon_cmdbuf **initial_preamble_array, unsigned num_initial_preambles,
979                         struct radeon_cmdbuf **continue_preamble_array, unsigned num_continue_preambles,
980                         struct radeon_cmdbuf **postamble_array, unsigned num_postambles, unsigned *rnum_handles,
981                         struct drm_amdgpu_bo_list_entry **rhandles)
982 {
983    struct drm_amdgpu_bo_list_entry *handles = NULL;
984    unsigned num_handles = 0;
985 
986    if (ws->debug_all_bos) {
987       handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
988       if (!handles)
989          return VK_ERROR_OUT_OF_HOST_MEMORY;
990 
991       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
992    } else if (count == 1 && !num_initial_preambles && !num_continue_preambles && !num_postambles &&
993               !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !radv_amdgpu_cs(cs_array[0])->chained_to &&
994               !ws->global_bo_list.count) {
995       struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
996       if (cs->num_buffers == 0)
997          return VK_SUCCESS;
998 
999       handles = malloc(sizeof(handles[0]) * cs->num_buffers);
1000       if (!handles)
1001          return VK_ERROR_OUT_OF_HOST_MEMORY;
1002 
1003       memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
1004       num_handles = cs->num_buffers;
1005    } else {
1006       unsigned total_buffer_count = ws->global_bo_list.count;
1007       total_buffer_count += radv_amdgpu_count_cs_array_bo(cs_array, count);
1008       total_buffer_count += radv_amdgpu_count_cs_array_bo(initial_preamble_array, num_initial_preambles);
1009       total_buffer_count += radv_amdgpu_count_cs_array_bo(continue_preamble_array, num_continue_preambles);
1010       total_buffer_count += radv_amdgpu_count_cs_array_bo(postamble_array, num_postambles);
1011 
1012       if (total_buffer_count == 0)
1013          return VK_SUCCESS;
1014 
1015       handles = malloc(sizeof(handles[0]) * total_buffer_count);
1016       if (!handles)
1017          return VK_ERROR_OUT_OF_HOST_MEMORY;
1018 
1019       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
1020       num_handles = radv_amdgpu_add_cs_array_to_bo_list(cs_array, count, handles, num_handles);
1021       num_handles =
1022          radv_amdgpu_add_cs_array_to_bo_list(initial_preamble_array, num_initial_preambles, handles, num_handles);
1023       num_handles =
1024          radv_amdgpu_add_cs_array_to_bo_list(continue_preamble_array, num_continue_preambles, handles, num_handles);
1025       num_handles = radv_amdgpu_add_cs_array_to_bo_list(postamble_array, num_postambles, handles, num_handles);
1026    }
1027 
1028    *rhandles = handles;
1029    *rnum_handles = num_handles;
1030 
1031    return VK_SUCCESS;
1032 }
1033 
1034 static void
1035 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
1036 {
1037    radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring], request);
1038 }
1039 
1040 static unsigned
1041 radv_amdgpu_get_num_ibs_per_cs(const struct radv_amdgpu_cs *cs)
1042 {
1043    unsigned num_ibs = 0;
1044 
1045    if (cs->use_ib) {
1046       num_ibs = 1; /* Everything is chained. */
1047    } else {
1048       num_ibs = cs->num_ib_buffers;
1049    }
1050 
1051    return num_ibs;
1052 }
1053 
1054 static unsigned
1055 radv_amdgpu_count_ibs(struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned initial_preamble_count,
1056                       unsigned continue_preamble_count, unsigned postamble_count)
1057 {
1058    unsigned num_ibs = 0;
1059 
1060    for (unsigned i = 0; i < cs_count; i++) {
1061       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
1062 
1063       num_ibs += radv_amdgpu_get_num_ibs_per_cs(cs);
1064    }
1065 
1066    return MAX2(initial_preamble_count, continue_preamble_count) + num_ibs + postamble_count;
1067 }
1068 
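/* Build the BO list and submit the given command streams, splitting them into
 * as many kernel submissions as needed. Each submission is prefixed with the
 * initial (first submission) or continue (subsequent) preambles and suffixed
 * with the postambles, while respecting RADV_MAX_IBS_PER_SUBMIT and the
 * per-IP IB limits.
 */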
1069 static VkResult
1070 radv_amdgpu_winsys_cs_submit_internal(struct radv_amdgpu_ctx *ctx, int queue_idx, struct radv_winsys_sem_info *sem_info,
1071                                       struct radeon_cmdbuf **cs_array, unsigned cs_count,
1072                                       struct radeon_cmdbuf **initial_preamble_cs, unsigned initial_preamble_count,
1073                                       struct radeon_cmdbuf **continue_preamble_cs, unsigned continue_preamble_count,
1074                                       struct radeon_cmdbuf **postamble_cs, unsigned postamble_count,
1075                                       bool uses_shadow_regs)
1076 {
1077    VkResult result;
1078 
1079    /* The last CS is "the gang leader"; its IP type determines which fence to signal. */
1080    struct radv_amdgpu_cs *last_cs = radv_amdgpu_cs(cs_array[cs_count - 1]);
1081    struct radv_amdgpu_winsys *ws = last_cs->ws;
1082 
1083    const unsigned num_ibs =
1084       radv_amdgpu_count_ibs(cs_array, cs_count, initial_preamble_count, continue_preamble_count, postamble_count);
1085    const unsigned ib_array_size = MIN2(RADV_MAX_IBS_PER_SUBMIT, num_ibs);
1086 
1087    STACK_ARRAY(struct radv_amdgpu_cs_ib_info, ibs, ib_array_size);
1088 
1089    struct drm_amdgpu_bo_list_entry *handles = NULL;
1090    unsigned num_handles = 0;
1091 
1092    u_rwlock_rdlock(&ws->global_bo_list.lock);
1093 
1094    result = radv_amdgpu_get_bo_list(ws, &cs_array[0], cs_count, initial_preamble_cs, initial_preamble_count,
1095                                     continue_preamble_cs, continue_preamble_count, postamble_cs, postamble_count,
1096                                     &num_handles, &handles);
1097    if (result != VK_SUCCESS)
1098       goto fail;
1099 
1100    /* Configure the CS request. */
1101    const uint32_t *max_ib_per_ip = ws->info.max_submitted_ibs;
1102    struct radv_amdgpu_cs_request request = {
1103       .ip_type = last_cs->hw_ip,
1104       .ip_instance = 0,
1105       .ring = queue_idx,
1106       .handles = handles,
1107       .num_handles = num_handles,
1108       .ibs = ibs,
1109       .number_of_ibs = 0, /* set below */
1110    };
1111 
1112    for (unsigned cs_idx = 0, cs_ib_idx = 0; cs_idx < cs_count;) {
1113       struct radeon_cmdbuf **preambles = cs_idx ? continue_preamble_cs : initial_preamble_cs;
1114       const unsigned preamble_count = cs_idx ? continue_preamble_count : initial_preamble_count;
1115       const unsigned ib_per_submit = RADV_MAX_IBS_PER_SUBMIT - preamble_count - postamble_count;
1116       unsigned num_submitted_ibs = 0;
1117       unsigned ibs_per_ip[AMD_NUM_IP_TYPES] = {0};
1118 
1119       /* Copy preambles to the submission. */
1120       for (unsigned i = 0; i < preamble_count; ++i) {
1121          /* Assume that the full preamble fits into 1 IB. */
1122          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(preambles[i]);
1123          struct radv_amdgpu_cs_ib_info ib;
1124 
1125          assert(cs->num_ib_buffers == 1);
1126          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1127 
1128          ibs[num_submitted_ibs++] = ib;
1129          ibs_per_ip[cs->hw_ip]++;
1130       }
1131 
1132       for (unsigned i = 0; i < ib_per_submit && cs_idx < cs_count; ++i) {
1133          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[cs_idx]);
1134          struct radv_amdgpu_cs_ib_info ib;
1135 
1136          if (cs_ib_idx == 0) {
1137             /* Make sure the whole CS fits into the same submission. */
1138             unsigned cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(cs);
1139             if (i + cs_num_ib > ib_per_submit || ibs_per_ip[cs->hw_ip] + cs_num_ib > max_ib_per_ip[cs->hw_ip])
1140                break;
1141 
1142             if (cs->hw_ip != request.ip_type) {
1143                /* Found a "follower" CS in a gang submission.
1144                 * Make sure to submit this together with its "leader", the next CS.
1145                 * We rely on the caller to order each "follower" before its "leader."
1146                 */
1147                assert(cs_idx != cs_count - 1);
1148                struct radv_amdgpu_cs *next_cs = radv_amdgpu_cs(cs_array[cs_idx + 1]);
1149                assert(next_cs->hw_ip == request.ip_type);
1150                unsigned next_cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(next_cs);
1151                if (i + cs_num_ib + next_cs_num_ib > ib_per_submit ||
1152                    ibs_per_ip[next_cs->hw_ip] + next_cs_num_ib > max_ib_per_ip[next_cs->hw_ip])
1153                   break;
1154             }
1155          }
1156 
1157          /* When IBs are used, we only need to submit the main IB of this CS, because everything
1158           * else is chained to the first IB. Otherwise we must submit all IBs in the ib_buffers
1159           * array.
1160           */
1161          if (cs->use_ib) {
1162             ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1163             cs_idx++;
1164          } else {
1165             assert(cs_ib_idx < cs->num_ib_buffers);
1166             ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1167 
1168             if (cs_ib_idx == cs->num_ib_buffers) {
1169                cs_idx++;
1170                cs_ib_idx = 0;
1171             }
1172          }
1173 
1174          if (uses_shadow_regs && ib.ip_type == AMDGPU_HW_IP_GFX)
1175             ib.flags |= AMDGPU_IB_FLAG_PREEMPT;
1176 
1177          assert(num_submitted_ibs < ib_array_size);
1178          ibs[num_submitted_ibs++] = ib;
1179          ibs_per_ip[cs->hw_ip]++;
1180       }
1181 
1182       assert(num_submitted_ibs > preamble_count);
1183 
1184       /* Copy postambles to the submission. */
1185       for (unsigned i = 0; i < postamble_count; ++i) {
1186          /* Assume that the full postamble fits into 1 IB. */
1187          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(postamble_cs[i]);
1188          struct radv_amdgpu_cs_ib_info ib;
1189 
1190          assert(cs->num_ib_buffers == 1);
1191          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1192 
1193          ibs[num_submitted_ibs++] = ib;
1194          ibs_per_ip[cs->hw_ip]++;
1195       }
1196 
1197       /* Submit the CS. */
1198       request.number_of_ibs = num_submitted_ibs;
1199       result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1200       if (result != VK_SUCCESS)
1201          goto fail;
1202    }
1203 
1204    free(request.handles);
1205 
1206    if (result != VK_SUCCESS)
1207       goto fail;
1208 
1209    radv_assign_last_submit(ctx, &request);
1210 
1211 fail:
1212    u_rwlock_rdunlock(&ws->global_bo_list.lock);
1213    STACK_ARRAY_FINISH(ibs);
1214    return result;
1215 }
1216 
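/* Handle a submission that has no command buffers: only process semaphores,
 * by accumulating all waits into the per-queue syncobj and then forwarding
 * that syncobj to every signal operation.
 */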
1217 static VkResult
1218 radv_amdgpu_cs_submit_zero(struct radv_amdgpu_ctx *ctx, enum amd_ip_type ip_type, int queue_idx,
1219                            struct radv_winsys_sem_info *sem_info)
1220 {
1221    unsigned hw_ip = ip_type;
1222    unsigned queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, hw_ip, queue_idx);
1223    int ret;
1224 
1225    if (!queue_syncobj)
1226       return VK_ERROR_OUT_OF_HOST_MEMORY;
1227 
1228    if (sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) {
1229       int fd;
1230       ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, queue_syncobj, &fd);
1231       if (ret < 0)
1232          return VK_ERROR_DEVICE_LOST;
1233 
1234       for (unsigned i = 0; i < sem_info->wait.syncobj_count; ++i) {
1235          int fd2;
1236          ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, sem_info->wait.syncobj[i], &fd2);
1237          if (ret < 0) {
1238             close(fd);
1239             return VK_ERROR_DEVICE_LOST;
1240          }
1241 
1242          sync_accumulate("radv", &fd, fd2);
1243          close(fd2);
1244       }
1245       for (unsigned i = 0; i < sem_info->wait.timeline_syncobj_count; ++i) {
1246          int fd2;
1247          ret = ac_drm_cs_syncobj_export_sync_file2(
1248             ctx->ws->fd, sem_info->wait.syncobj[i + sem_info->wait.syncobj_count], sem_info->wait.points[i], 0, &fd2);
1249          if (ret < 0) {
1250             /* This works around a kernel bug where the fence isn't copied if it is already
1251              * signalled. Since it is already signalled it is totally fine to not wait on it.
1252              *
1253              * kernel patch: https://patchwork.freedesktop.org/patch/465583/ */
1254             uint64_t point;
1255             ret = ac_drm_cs_syncobj_query2(ctx->ws->fd, &sem_info->wait.syncobj[i + sem_info->wait.syncobj_count],
1256                                            &point, 1, 0);
1257             if (!ret && point >= sem_info->wait.points[i])
1258                continue;
1259 
1260             close(fd);
1261             return VK_ERROR_DEVICE_LOST;
1262          }
1263 
1264          sync_accumulate("radv", &fd, fd2);
1265          close(fd2);
1266       }
1267       ret = ac_drm_cs_syncobj_import_sync_file(ctx->ws->fd, queue_syncobj, fd);
1268       close(fd);
1269       if (ret < 0)
1270          return VK_ERROR_DEVICE_LOST;
1271 
1272       ctx->queue_syncobj_wait[hw_ip][queue_idx] = true;
1273    }
1274 
1275    for (unsigned i = 0; i < sem_info->signal.syncobj_count; ++i) {
1276       uint32_t dst_handle = sem_info->signal.syncobj[i];
1277       uint32_t src_handle = queue_syncobj;
1278 
1279       if (ctx->ws->info.has_timeline_syncobj) {
1280          ret = ac_drm_cs_syncobj_transfer(ctx->ws->fd, dst_handle, 0, src_handle, 0, 0);
1281          if (ret < 0)
1282             return VK_ERROR_DEVICE_LOST;
1283       } else {
1284          int fd;
1285          ret = ac_drm_cs_syncobj_export_sync_file(ctx->ws->fd, src_handle, &fd);
1286          if (ret < 0)
1287             return VK_ERROR_DEVICE_LOST;
1288 
1289          ret = ac_drm_cs_syncobj_import_sync_file(ctx->ws->fd, dst_handle, fd);
1290          close(fd);
1291          if (ret < 0)
1292             return VK_ERROR_DEVICE_LOST;
1293       }
1294    }
1295    for (unsigned i = 0; i < sem_info->signal.timeline_syncobj_count; ++i) {
1296       ret = ac_drm_cs_syncobj_transfer(ctx->ws->fd, sem_info->signal.syncobj[i + sem_info->signal.syncobj_count],
1297                                        sem_info->signal.points[i], queue_syncobj, 0, 0);
1298       if (ret < 0)
1299          return VK_ERROR_DEVICE_LOST;
1300    }
1301    return VK_SUCCESS;
1302 }
1303 
1304 static VkResult
1305 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, const struct radv_winsys_submit_info *submit,
1306                              uint32_t wait_count, const struct vk_sync_wait *waits, uint32_t signal_count,
1307                              const struct vk_sync_signal *signals)
1308 {
1309    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1310    struct radv_amdgpu_winsys *ws = ctx->ws;
1311    VkResult result;
1312    unsigned wait_idx = 0, signal_idx = 0;
1313 
1314    STACK_ARRAY(uint64_t, wait_points, wait_count);
1315    STACK_ARRAY(uint32_t, wait_syncobj, wait_count);
1316    STACK_ARRAY(uint64_t, signal_points, signal_count);
1317    STACK_ARRAY(uint32_t, signal_syncobj, signal_count);
1318 
1319    if (!wait_points || !wait_syncobj || !signal_points || !signal_syncobj) {
1320       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1321       goto out;
1322    }
1323 
1324    for (uint32_t i = 0; i < wait_count; ++i) {
1325       if (waits[i].sync->type == &vk_sync_dummy_type)
1326          continue;
1327 
1328       assert(waits[i].sync->type == &ws->syncobj_sync_type);
1329       wait_syncobj[wait_idx] = ((struct vk_drm_syncobj *)waits[i].sync)->syncobj;
1330       wait_points[wait_idx] = waits[i].wait_value;
1331       ++wait_idx;
1332    }
1333 
1334    for (uint32_t i = 0; i < signal_count; ++i) {
1335       if (signals[i].sync->type == &vk_sync_dummy_type)
1336          continue;
1337 
1338       assert(signals[i].sync->type == &ws->syncobj_sync_type);
1339       signal_syncobj[signal_idx] = ((struct vk_drm_syncobj *)signals[i].sync)->syncobj;
1340       signal_points[signal_idx] = signals[i].signal_value;
1341       ++signal_idx;
1342    }
1343 
1344    assert(signal_idx <= signal_count);
1345    assert(wait_idx <= wait_count);
1346 
1347    const uint32_t wait_timeline_syncobj_count =
1348       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? wait_idx : 0;
1349    const uint32_t signal_timeline_syncobj_count =
1350       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? signal_idx : 0;
1351 
1352    struct radv_winsys_sem_info sem_info = {
1353       .wait =
1354          {
1355             .points = wait_points,
1356             .syncobj = wait_syncobj,
1357             .timeline_syncobj_count = wait_timeline_syncobj_count,
1358             .syncobj_count = wait_idx - wait_timeline_syncobj_count,
1359          },
1360       .signal =
1361          {
1362             .points = signal_points,
1363             .syncobj = signal_syncobj,
1364             .timeline_syncobj_count = signal_timeline_syncobj_count,
1365             .syncobj_count = signal_idx - signal_timeline_syncobj_count,
1366          },
1367       .cs_emit_wait = true,
1368       .cs_emit_signal = true,
1369    };
1370 
1371    if (!submit->cs_count) {
1372       result = radv_amdgpu_cs_submit_zero(ctx, submit->ip_type, submit->queue_index, &sem_info);
1373    } else {
1374       result = radv_amdgpu_winsys_cs_submit_internal(
1375          ctx, submit->queue_index, &sem_info, submit->cs_array, submit->cs_count, submit->initial_preamble_cs,
1376          submit->initial_preamble_count, submit->continue_preamble_cs, submit->continue_preamble_count,
1377          submit->postamble_cs, submit->postamble_count, submit->uses_shadow_regs);
1378    }
1379 
1380 out:
1381    STACK_ARRAY_FINISH(wait_points);
1382    STACK_ARRAY_FINISH(wait_syncobj);
1383    STACK_ARRAY_FINISH(signal_points);
1384    STACK_ARRAY_FINISH(signal_syncobj);
1385    return result;
1386 }
1387 
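/* Translate a GPU VA into a CPU pointer for IB parsing: search this CS's IB
 * buffers first, then the global BO list. When BO logging is enabled, also
 * flag use-after-free accesses.
 */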
1388 static void
1389 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr, struct ac_addr_info *info)
1390 {
1391    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1392 
1393    memset(info, 0, sizeof(struct ac_addr_info));
1394 
1395    if (cs->ws->debug_log_bos) {
1396       u_rwlock_rdlock(&cs->ws->log_bo_list_lock);
1397       list_for_each_entry_rev (struct radv_amdgpu_winsys_bo_log, bo_log, &cs->ws->log_bo_list, list) {
1398          if (addr >= bo_log->va && addr - bo_log->va < bo_log->size) {
1399             info->use_after_free = bo_log->destroyed;
1400             break;
1401          }
1402       }
1403       u_rwlock_rdunlock(&cs->ws->log_bo_list_lock);
1404    }
1405 
1406    if (info->use_after_free)
1407       return;
1408 
1409    info->valid = !cs->ws->debug_all_bos;
1410 
1411    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
1412       struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1413       struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)ib->bo;
1414 
1415       if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1416          void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1417          if (map) {
1418             info->cpu_addr = (char *)map + (addr - bo->base.va);
1419             info->valid = true;
1420             return;
1421          }
1422       }
1423    }
1424    u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1425    for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1426       struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1427       if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1428          void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1429          if (map) {
1430             u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1431             info->valid = true;
1432             info->cpu_addr = (char *)map + (addr - bo->base.va);
1433             return;
1434          }
1435       }
1436    }
1437    u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1438 
1439    return;
1440 }
1441 
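/* Dump this CS for debugging: either parse the IB contents
 * (RADV_CS_DUMP_TYPE_IBS) or gather context-roll statistics
 * (RADV_CS_DUMP_TYPE_CTX_ROLLS). With IB BOs, parsing starts from
 * ib_buffers[0] and further addresses are resolved through the CPU-address
 * callback above; without IB BOs every IB chunk is mapped and processed
 * individually.
 */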
1442 static void
1443 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids, int trace_id_count,
1444                            enum radv_cs_dump_type type)
1445 {
1446    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1447    struct radv_amdgpu_winsys *ws = cs->ws;
1448 
1449    if (cs->use_ib) {
1450       struct radv_amdgpu_cs_ib_info ib_info = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1451 
1452       struct ac_addr_info addr_info;
1453       radv_amdgpu_winsys_get_cpu_addr(cs, ib_info.ib_mc_address, &addr_info);
1454       assert(addr_info.cpu_addr);
1455 
1456       if (type == RADV_CS_DUMP_TYPE_IBS) {
1457          struct ac_ib_parser ib_parser = {
1458             .f = file,
1459             .ib = addr_info.cpu_addr,
1460             .num_dw = cs->ib_buffers[0].cdw,
1461             .trace_ids = trace_ids,
1462             .trace_id_count = trace_id_count,
1463             .gfx_level = ws->info.gfx_level,
1464             .vcn_version = ws->info.vcn_ip_version,
1465             .family = ws->info.family,
1466             .ip_type = cs->hw_ip,
1467             .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1468             .addr_callback_data = cs,
1469             .annotations = cs->annotations,
1470          };
1471 
1472          ac_parse_ib(&ib_parser, "main IB");
1473       } else {
1474          uint32_t *ib_dw = addr_info.cpu_addr;
1475          ac_gather_context_rolls(file, &ib_dw, &cs->ib_buffers[0].cdw, 1, cs->annotations, &ws->info);
1476       }
1477    } else {
1478       uint32_t **ibs = type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t *)) : NULL;
1479       uint32_t *ib_dw_sizes =
1480          type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t)) : NULL;

      /* Skip the dump if the context-roll arrays could not be allocated. */
      if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS && (!ibs || !ib_dw_sizes)) {
         free(ibs);
         free(ib_dw_sizes);
         return;
      }
1481 
1482       for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1483          struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1484          char name[64];
1485          void *mapped;
1486 
1487          mapped = radv_buffer_map(&ws->base, ib->bo);
1488          if (!mapped)
1489             continue;
1490 
1491          if (cs->num_ib_buffers > 1) {
1492             snprintf(name, sizeof(name), "main IB (chunk %u)", i);
1493          } else {
1494             snprintf(name, sizeof(name), "main IB");
1495          }
1496 
1497          if (type == RADV_CS_DUMP_TYPE_IBS) {
1498             struct ac_ib_parser ib_parser = {
1499                .f = file,
1500                .ib = mapped,
1501                .num_dw = ib->cdw,
1502                .trace_ids = trace_ids,
1503                .trace_id_count = trace_id_count,
1504                .gfx_level = ws->info.gfx_level,
1505                .vcn_version = ws->info.vcn_ip_version,
1506                .family = ws->info.family,
1507                .ip_type = cs->hw_ip,
1508                .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1509                .addr_callback_data = cs,
1510                .annotations = cs->annotations,
1511             };
1512 
1513             ac_parse_ib(&ib_parser, name);
1514          } else {
1515             ibs[i] = mapped;
1516             ib_dw_sizes[i] = ib->cdw;
1517          }
1518       }
1519 
1520       if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS) {
1521          ac_gather_context_rolls(file, ibs, ib_dw_sizes, cs->num_ib_buffers, cs->annotations, &ws->info);
1522 
1523          free(ibs);
1524          free(ib_dw_sizes);
1525       }
1526    }
1527 }
1528 
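/* Attach a human-readable annotation to the current write position of the CS.
 * Annotations are keyed on the CPU address of the next dword and are consumed
 * by the IB parser / context-roll gathering above. If the same dword is
 * annotated twice, the strings are joined as "old -> new".
 *
 * Hypothetical usage from the driver (names illustrative only):
 *
 *    ws->base.cs_annotate(cs, "draw #42");
 *    ... emit the packets for that draw ...
 */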
1529 static void
1530 radv_amdgpu_winsys_cs_annotate(struct radeon_cmdbuf *_cs, const char *annotation)
1531 {
1532    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1533 
1534    if (!cs->annotations) {
1535       cs->annotations = _mesa_pointer_hash_table_create(NULL);
1536       if (!cs->annotations)
1537          return;
1538    }
1539 
1540    struct hash_entry *entry = _mesa_hash_table_search(cs->annotations, _cs->buf + _cs->cdw);
1541    if (entry) {
1542       char *old_annotation = entry->data;
1543       char *new_annotation = calloc(strlen(old_annotation) + strlen(annotation) + 5, 1);
      if (!new_annotation)
         return;
1544       sprintf(new_annotation, "%s -> %s", old_annotation, annotation);
1545       free(old_annotation);
1546       _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, new_annotation);
1547    } else {
1548       _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, strdup(annotation));
1549    }
1550 }
1551 
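/* Map the winsys context priority to the AMDGPU kernel context priority.
 * Note that RADEON_CTX_PRIORITY_REALTIME is submitted as
 * AMDGPU_CTX_PRIORITY_VERY_HIGH.
 */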
1552 static uint32_t
1553 radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1554 {
1555    switch (radv_priority) {
1556    case RADEON_CTX_PRIORITY_REALTIME:
1557       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1558    case RADEON_CTX_PRIORITY_HIGH:
1559       return AMDGPU_CTX_PRIORITY_HIGH;
1560    case RADEON_CTX_PRIORITY_MEDIUM:
1561       return AMDGPU_CTX_PRIORITY_NORMAL;
1562    case RADEON_CTX_PRIORITY_LOW:
1563       return AMDGPU_CTX_PRIORITY_LOW;
1564    default:
1565       unreachable("Invalid context priority");
1566    }
1567 }
1568 
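/* Create a kernel context and the small GTT BO used for user fences
 * (4 QWORDs per IP/ring combination, see radv_amdgpu_cs_submit).
 * -EACCES, e.g. when the requested priority is not permitted, is reported
 * as VK_ERROR_NOT_PERMITTED.
 */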
1569 static VkResult
1570 radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority, struct radeon_winsys_ctx **rctx)
1571 {
1572    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1573    struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1574    uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1575    VkResult result;
1576    int r;
1577 
1578    if (!ctx)
1579       return VK_ERROR_OUT_OF_HOST_MEMORY;
1580 
1581    r = ac_drm_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx_handle);
1582    if (r && r == -EACCES) {
1583       result = VK_ERROR_NOT_PERMITTED;
1584       goto fail_create;
1585    } else if (r) {
1586       fprintf(stderr, "radv/amdgpu: ac_drm_cs_ctx_create2 failed. (%i)\n", r);
1587       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1588       goto fail_create;
1589    }
1590    ctx->ws = ws;
1591 
1592    assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * 4 * sizeof(uint64_t) <= 4096);
1593    result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
1594                                    RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_CS, 0,
1595                                    &ctx->fence_bo);
1596    if (result != VK_SUCCESS) {
1597       goto fail_alloc;
1598    }
1599 
1600    *rctx = (struct radeon_winsys_ctx *)ctx;
1601    return VK_SUCCESS;
1602 
1603 fail_alloc:
1604    ac_drm_cs_ctx_free(ws->dev, ctx->ctx_handle);
1605 fail_create:
1606    FREE(ctx);
1607    return result;
1608 }
1609 
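/* Destroy a context: release the lazily created per-queue syncobjs, the user
 * fence BO and the kernel context.
 */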
1610 static void
1611 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1612 {
1613    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1614 
1615    for (unsigned ip = 0; ip < AMDGPU_HW_IP_NUM; ++ip) {
1616       for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
1617          if (ctx->queue_syncobj[ip][ring])
1618             ac_drm_cs_destroy_syncobj(ctx->ws->fd, ctx->queue_syncobj[ip][ring]);
1619       }
1620    }
1621 
1622    ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
1623    ac_drm_cs_ctx_free(ctx->ws->dev, ctx->ctx_handle);
1624    FREE(ctx);
1625 }
1626 
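/* Return the per-queue syncobj for (ip, ring), creating it in the signaled
 * state on first use. A return value of 0 means creation failed and is
 * treated as out-of-host-memory by the submit path.
 */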
1627 static uint32_t
1628 radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
1629 {
1630    uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
1631    if (!*syncobj) {
1632       ac_drm_cs_create_syncobj2(ctx->ws->fd, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
1633    }
1634    return *syncobj;
1635 }
1636 
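/* Wait up to one second for the last submission on the given queue to
 * complete. Returns false if the fence has not signaled (or the fence status
 * query failed), true when the queue is idle or nothing was ever submitted.
 */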
1637 static bool
1638 radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, int ring_index)
1639 {
1640    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1641 
1642    if (ctx->last_submission[ip_type][ring_index].fence.fence) {
1643       uint32_t expired;
1644       int ret = ac_drm_cs_query_fence_status(
1645          ctx->ws->dev, ctx->ctx_handle, ctx->last_submission[ip_type][ring_index].fence.ip_type,
1646          ctx->last_submission[ip_type][ring_index].fence.ip_instance,
1647          ctx->last_submission[ip_type][ring_index].fence.ring, ctx->last_submission[ip_type][ring_index].fence.fence,
1648          1000000000ull, 0, &expired);
1649 
1650       if (ret || !expired)
1651          return false;
1652    }
1653 
1654    return true;
1655 }
1656 
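/* Map the winsys stable-pstate request to the AMDGPU kernel pstate. */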
1657 static uint32_t
1658 radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
1659 {
1660    switch (radv_pstate) {
1661    case RADEON_CTX_PSTATE_NONE:
1662       return AMDGPU_CTX_STABLE_PSTATE_NONE;
1663    case RADEON_CTX_PSTATE_STANDARD:
1664       return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
1665    case RADEON_CTX_PSTATE_MIN_SCLK:
1666       return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
1667    case RADEON_CTX_PSTATE_MIN_MCLK:
1668       return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
1669    case RADEON_CTX_PSTATE_PEAK:
1670       return AMDGPU_CTX_STABLE_PSTATE_PEAK;
1671    default:
1672       unreachable("Invalid pstate");
1673    }
1674 }
1675 
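/* Set a stable power state on the context (typically requested for
 * profiling). The current pstate is queried first so that redundant SET
 * requests are skipped, see the comment below regarding -EBUSY.
 */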
1676 static int
1677 radv_amdgpu_ctx_set_pstate(struct radeon_winsys_ctx *rwctx, enum radeon_ctx_pstate pstate)
1678 {
1679    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1680    uint32_t new_pstate = radv_to_amdgpu_pstate(pstate);
1681    uint32_t current_pstate = 0;
1682    int r;
1683 
1684    r = ac_drm_cs_ctx_stable_pstate(ctx->ws->dev, ctx->ctx_handle, AMDGPU_CTX_OP_GET_STABLE_PSTATE, 0, &current_pstate);
1685    if (r) {
1686       fprintf(stderr, "radv/amdgpu: failed to get current pstate\n");
1687       return r;
1688    }
1689 
1690    /* Do not try to set a new pstate when the current one is already what we want. Otherwise, the
1691     * kernel might return -EBUSY if we have multiple AMDGPU contexts in flight.
1692     */
1693    if (current_pstate == new_pstate)
1694       return 0;
1695 
1696    r = ac_drm_cs_ctx_stable_pstate(ctx->ws->dev, ctx->ctx_handle, AMDGPU_CTX_OP_SET_STABLE_PSTATE, new_pstate, NULL);
1697    if (r) {
1698       fprintf(stderr, "radv/amdgpu: failed to set new pstate\n");
1699       return r;
1700    }
1701 
1702    return 0;
1703 }
1704 
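/* Build a binary syncobj chunk (AMDGPU_CHUNK_ID_SYNCOBJ_IN/OUT) from the
 * given semaphore counts, appending the optional per-queue syncobj at the
 * end. The returned allocation backs chunk->chunk_data and must be freed by
 * the caller after the submit ioctl.
 */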
1705 static void *
1706 radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1707                                    struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1708 {
1709    unsigned count = counts->syncobj_count + (queue_syncobj ? 1 : 0);
1710    struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
1711    if (!syncobj)
1712       return NULL;
1713 
1714    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1715       struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
1716       sem->handle = counts->syncobj[i];
1717    }
1718 
1719    if (queue_syncobj)
1720       syncobj[counts->syncobj_count].handle = queue_syncobj;
1721 
1722    chunk->chunk_id = chunk_id;
1723    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
1724    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1725    return syncobj;
1726 }
1727 
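/* Timeline-capable variant: binary syncobjs come first with point 0, followed
 * by timeline syncobjs carrying their point and
 * DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, and finally the optional per-queue
 * syncobj. Used for both the wait and signal chunks.
 */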
1728 static void *
1729 radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1730                                             struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1731 {
1732    uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + (queue_syncobj ? 1 : 0);
1733    struct drm_amdgpu_cs_chunk_syncobj *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
1734    if (!syncobj)
1735       return NULL;
1736 
1737    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1738       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
1739       sem->handle = counts->syncobj[i];
1740       sem->flags = 0;
1741       sem->point = 0;
1742    }
1743 
1744    for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
1745       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
1746       sem->handle = counts->syncobj[i + counts->syncobj_count];
1747       sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
1748       sem->point = counts->points[i];
1749    }
1750 
1751    if (queue_syncobj) {
1752       syncobj[count - 1].handle = queue_syncobj;
1753       syncobj[count - 1].flags = 0;
1754       syncobj[count - 1].point = 0;
1755    }
1756 
1757    chunk->chunk_id = chunk_id;
1758    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
1759    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1760    return syncobj;
1761 }
1762 
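/* A user fence chunk is only emitted for non-multimedia IPs, i.e. everything
 * except UVD/VCE/VCN/JPEG.
 */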
1763 static bool
1764 radv_amdgpu_cs_has_user_fence(struct radv_amdgpu_cs_request *request)
1765 {
1766    return request->ip_type != AMDGPU_HW_IP_UVD && request->ip_type != AMDGPU_HW_IP_VCE &&
1767           request->ip_type != AMDGPU_HW_IP_UVD_ENC && request->ip_type != AMDGPU_HW_IP_VCN_DEC &&
1768           request->ip_type != AMDGPU_HW_IP_VCN_ENC && request->ip_type != AMDGPU_HW_IP_VCN_JPEG;
1769 }
1770 
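/* Low-level submission: translate the request and semaphore info into a
 * drm_amdgpu_cs_chunk array and submit it with ac_drm_cs_submit_raw2. The
 * chunk layout is roughly:
 *
 *    [0..n-1]  AMDGPU_CHUNK_ID_IB             one per IB
 *    [n]       AMDGPU_CHUNK_ID_FENCE          only when a user fence is used
 *    [..]      SYNCOBJ(_TIMELINE) wait chunk  only when there is something to wait on
 *    [..]      SYNCOBJ(_TIMELINE) signal chunk when signaling is requested
 *    [last]    AMDGPU_CHUNK_ID_BO_HANDLES     the BO list
 *
 * -ENOMEM is retried for up to one second and context-lost errors are mapped
 * to VK_ERROR_DEVICE_LOST.
 */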
1771 static VkResult
1772 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
1773                       struct radv_winsys_sem_info *sem_info)
1774 {
1775    int r;
1776    int num_chunks;
1777    int size;
1778    struct drm_amdgpu_cs_chunk *chunks;
1779    struct drm_amdgpu_cs_chunk_data *chunk_data;
1780    struct drm_amdgpu_bo_list_in bo_list_in;
1781    void *wait_syncobj = NULL, *signal_syncobj = NULL;
1782    int i;
1783    VkResult result = VK_SUCCESS;
1784    bool has_user_fence = radv_amdgpu_cs_has_user_fence(request);
1785    uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
1786    bool *queue_syncobj_wait = &ctx->queue_syncobj_wait[request->ip_type][request->ring];
1787 
1788    if (!queue_syncobj)
1789       return VK_ERROR_OUT_OF_HOST_MEMORY;
1790 
1791    size = request->number_of_ibs + 1 + (has_user_fence ? 1 : 0) + 1 /* bo list */ + 3;
1792 
1793    chunks = malloc(sizeof(chunks[0]) * size);
1794    if (!chunks)
1795       return VK_ERROR_OUT_OF_HOST_MEMORY;
1796 
1797    size = request->number_of_ibs + (has_user_fence ? 1 : 0);
1798 
1799    chunk_data = malloc(sizeof(chunk_data[0]) * size);
1800    if (!chunk_data) {
1801       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1802       goto error_out;
1803    }
1804 
1805    num_chunks = request->number_of_ibs;
1806    for (i = 0; i < request->number_of_ibs; i++) {
1807       struct radv_amdgpu_cs_ib_info *ib;
1808       chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
1809       chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1810       chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1811 
1812       ib = &request->ibs[i];
1813       assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
1814       assert(ib->size);
1815 
1816       chunk_data[i].ib_data._pad = 0;
1817       chunk_data[i].ib_data.va_start = ib->ib_mc_address;
1818       chunk_data[i].ib_data.ib_bytes = ib->size * 4;
1819       chunk_data[i].ib_data.ip_type = ib->ip_type;
1820       chunk_data[i].ib_data.ip_instance = request->ip_instance;
1821       chunk_data[i].ib_data.ring = request->ring;
1822       chunk_data[i].ib_data.flags = ib->flags;
1823    }
1824 
1825    assert(chunk_data[request->number_of_ibs - 1].ib_data.ip_type == request->ip_type);
1826 
1827    if (has_user_fence) {
1828       i = num_chunks++;
1829       chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1830       chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1831       chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1832 
1833       /* Need to reserve 4 QWORDs for the user fence:
1834        *   QWORD[0]: completed fence
1835        *   QWORD[1]: preempted fence
1836        *   QWORD[2]: reset fence
1837        *   QWORD[3]: preempted then reset
1838        */
1839       uint32_t offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * 4;
1840       ac_drm_cs_chunk_fence_info_to_data(radv_amdgpu_winsys_bo(ctx->fence_bo)->bo_handle, offset, &chunk_data[i]);
1841    }
1842 
1843    if (sem_info->cs_emit_wait &&
1844        (sem_info->wait.timeline_syncobj_count || sem_info->wait.syncobj_count || *queue_syncobj_wait)) {
1845 
1846       if (ctx->ws->info.has_timeline_syncobj) {
1847          wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1848                                                                     AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
1849       } else {
1850          wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1851                                                            AMDGPU_CHUNK_ID_SYNCOBJ_IN);
1852       }
1853       if (!wait_syncobj) {
1854          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1855          goto error_out;
1856       }
1857       num_chunks++;
1858 
1859       sem_info->cs_emit_wait = false;
1860       *queue_syncobj_wait = false;
1861    }
1862 
1863    if (sem_info->cs_emit_signal) {
1864       if (ctx->ws->info.has_timeline_syncobj) {
1865          signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
1866             &sem_info->signal, queue_syncobj, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
1867       } else {
1868          signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, queue_syncobj, &chunks[num_chunks],
1869                                                              AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
1870       }
1871       if (!signal_syncobj) {
1872          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1873          goto error_out;
1874       }
1875       num_chunks++;
1876    }
1877 
1878    bo_list_in.operation = ~0;
1879    bo_list_in.list_handle = ~0;
1880    bo_list_in.bo_number = request->num_handles;
1881    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1882    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;
1883 
1884    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1885    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1886    chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1887    num_chunks++;
1888 
1889    /* The kernel quite often returns -ENOMEM when many parallel processes use GDS (e.g. test
1890     * suites), but the submission eventually succeeds after enough attempts. This happens
1891     * frequently with dEQP using NGG streamout.
1892     */
1893    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(1000000000ull); /* 1s */
1894 
1895    r = 0;
1896    do {
1897       /* Wait 1 ms and try again. */
1898       if (r == -ENOMEM)
1899          os_time_sleep(1000);
1900 
1901       r = ac_drm_cs_submit_raw2(ctx->ws->dev, ctx->ctx_handle, 0, num_chunks, chunks, &request->seq_no);
1902    } while (r == -ENOMEM && os_time_get_nano() < abs_timeout_ns);
1903 
1904    if (r) {
1905       if (r == -ENOMEM) {
1906          fprintf(stderr, "radv/amdgpu: Not enough memory for command submission.\n");
1907          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1908       } else if (r == -ECANCELED) {
1909          fprintf(stderr,
1910                  "radv/amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1911          result = VK_ERROR_DEVICE_LOST;
1912       } else if (r == -ENODATA) {
1913          fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1914                          "of a soft recovery.\n");
1915          result = VK_ERROR_DEVICE_LOST;
1916       } else if (r == -ETIME) {
1917          fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1918                          "of a hard recovery.\n");
1919          result = VK_ERROR_DEVICE_LOST;
1920       } else {
1921          fprintf(stderr,
1922                  "radv/amdgpu: The CS has been rejected, "
1923                  "see dmesg for more information (%i).\n",
1924                  r);
1925          result = VK_ERROR_UNKNOWN;
1926       }
1927    }
1928 
1929 error_out:
1930    free(chunks);
1931    free(chunk_data);
1932    free(wait_syncobj);
1933    free(signal_syncobj);
1934    return result;
1935 }
1936 
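/* Plug the amdgpu CS/context implementation into the base winsys vtable. */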
1937 void
1938 radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
1939 {
1940    ws->base.ctx_create = radv_amdgpu_ctx_create;
1941    ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
1942    ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
1943    ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
1944    ws->base.cs_domain = radv_amdgpu_cs_domain;
1945    ws->base.cs_create = radv_amdgpu_cs_create;
1946    ws->base.cs_destroy = radv_amdgpu_cs_destroy;
1947    ws->base.cs_grow = radv_amdgpu_cs_grow;
1948    ws->base.cs_finalize = radv_amdgpu_cs_finalize;
1949    ws->base.cs_reset = radv_amdgpu_cs_reset;
1950    ws->base.cs_chain = radv_amdgpu_cs_chain;
1951    ws->base.cs_unchain = radv_amdgpu_cs_unchain;
1952    ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
1953    ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
1954    ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
1955    ws->base.cs_chain_dgc_ib = radv_amdgpu_cs_chain_dgc_ib;
1956    ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
1957    ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
1958    ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
1959    ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
1960 }
1961