1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #include <amdgpu.h>
26 #include <assert.h>
27 #include <libsync.h>
28 #include <pthread.h>
29 #include <stdlib.h>
30 #include "drm-uapi/amdgpu_drm.h"
31 
32 #include "util/detect_os.h"
33 #include "util/os_time.h"
34 #include "util/u_memory.h"
35 #include "ac_debug.h"
36 #include "radv_amdgpu_bo.h"
37 #include "radv_amdgpu_cs.h"
38 #include "radv_amdgpu_winsys.h"
39 #include "radv_debug.h"
40 #include "radv_radeon_winsys.h"
41 #include "sid.h"
42 #include "vk_alloc.h"
43 #include "vk_drm_syncobj.h"
44 #include "vk_sync.h"
45 #include "vk_sync_dummy.h"
46 
47 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
48  * codes in the kernel).
49  */
50 #if DETECT_OS_OPENBSD
51 #define ENODATA ENOTSUP
52 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
53 #define ENODATA ECONNREFUSED
54 #endif
55 
56 /* Maximum allowed total number of IBs in a single submission. */
57 #define RADV_MAX_IBS_PER_SUBMIT 192
58 
59 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
60 
61 struct radv_amdgpu_ib {
62    struct radeon_winsys_bo *bo;
63    unsigned cdw;
64    unsigned offset;  /* VA offset */
65    bool is_external; /* Not owned by the current CS object. */
66 };
67 
68 struct radv_amdgpu_cs_ib_info {
69    int64_t flags;
70    uint64_t ib_mc_address;
71    uint32_t size;
72    enum amd_ip_type ip_type;
73 };
74 
75 struct radv_amdgpu_cs {
76    struct radeon_cmdbuf base;
77    struct radv_amdgpu_winsys *ws;
78 
79    struct radv_amdgpu_cs_ib_info ib;
80 
81    struct radeon_winsys_bo *ib_buffer;
82    uint8_t *ib_mapped;
83    unsigned max_num_buffers;
84    unsigned num_buffers;
85    struct drm_amdgpu_bo_list_entry *handles;
86 
87    struct radv_amdgpu_ib *ib_buffers;
88    unsigned num_ib_buffers;
89    unsigned max_num_ib_buffers;
90    unsigned *ib_size_ptr;
91    VkResult status;
92    struct radv_amdgpu_cs *chained_to;
93    bool use_ib;
94    bool is_secondary;
95 
96    int buffer_hash_table[1024];
97    unsigned hw_ip;
98 
99    unsigned num_virtual_buffers;
100    unsigned max_num_virtual_buffers;
101    struct radeon_winsys_bo **virtual_buffers;
102    int *virtual_buffer_hash_table;
103 };
104 
105 struct radv_winsys_sem_counts {
106    uint32_t syncobj_count;
107    uint32_t timeline_syncobj_count;
108    uint32_t *syncobj;
109    uint64_t *points;
110 };
111 
112 struct radv_winsys_sem_info {
113    bool cs_emit_signal;
114    bool cs_emit_wait;
115    struct radv_winsys_sem_counts wait;
116    struct radv_winsys_sem_counts signal;
117 };
118 
119 static void
120 radeon_emit_unchecked(struct radeon_cmdbuf *cs, uint32_t value)
121 {
122    cs->buf[cs->cdw++] = value;
123 }
124 
125 static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring);
126 
127 static inline struct radv_amdgpu_cs *
128 radv_amdgpu_cs(struct radeon_cmdbuf *base)
129 {
130    return (struct radv_amdgpu_cs *)base;
131 }
132 
133 static bool
134 ring_can_use_ib_bos(const struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
135 {
136    return ws->use_ib_bos && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
137 }
138 
139 struct radv_amdgpu_cs_request {
140    /** Specify HW IP block type to which to send the IB. */
141    unsigned ip_type;
142 
143    /** IP instance index if there are several IPs of the same type. */
144    unsigned ip_instance;
145 
146    /**
147     * Ring index of the IP. There can be several rings
148     * in the same IP, e.g. 0 for SDMA0 and 1 for SDMA1.
149     */
150    uint32_t ring;
151 
152    /**
153     * BO list handles used by this request.
154     */
155    struct drm_amdgpu_bo_list_entry *handles;
156    uint32_t num_handles;
157 
158    /** Number of IBs to submit in the field ibs. */
159    uint32_t number_of_ibs;
160 
161    /**
162     * IBs to submit. These IBs are submitted together as a single entity.
163     */
164    struct radv_amdgpu_cs_ib_info *ibs;
165 
166    /**
167     * The sequence number returned for the command submission.
168     */
169    uint64_t seq_no;
170 };
171 
172 static VkResult radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
173                                       struct radv_winsys_sem_info *sem_info);
174 
175 static void
176 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
177                              struct radv_amdgpu_cs_request *req)
178 {
179    fence->fence.context = ctx->ctx;
180    fence->fence.ip_type = req->ip_type;
181    fence->fence.ip_instance = req->ip_instance;
182    fence->fence.ring = req->ring;
183    fence->fence.fence = req->seq_no;
184 }
185 
186 static struct radv_amdgpu_cs_ib_info
187 radv_amdgpu_cs_ib_to_info(struct radv_amdgpu_cs *cs, struct radv_amdgpu_ib ib)
188 {
189    struct radv_amdgpu_cs_ib_info info = {
190       .flags = 0,
191       .ip_type = cs->hw_ip,
192       .ib_mc_address = radv_amdgpu_winsys_bo(ib.bo)->base.va + ib.offset,
193       .size = ib.cdw,
194    };
195    return info;
196 }
197 
198 static void
199 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
200 {
201    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
202 
203    if (cs->ib_buffer)
204       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
205 
206    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
207       if (cs->ib_buffers[i].is_external)
208          continue;
209 
210       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
211    }
212 
213    free(cs->ib_buffers);
214    free(cs->virtual_buffers);
215    free(cs->virtual_buffer_hash_table);
216    free(cs->handles);
217    free(cs);
218 }
219 
220 static void
221 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum amd_ip_type ip_type)
222 {
223    for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
224       cs->buffer_hash_table[i] = -1;
225 
226    cs->hw_ip = ip_type;
227 }
228 
229 static enum radeon_bo_domain
230 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
231 {
232    const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
233 
234    bool enough_vram = ws->info.all_vram_visible ||
235                       p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= (uint64_t)ws->info.vram_vis_size_kb * 1024;
236 
237    /* Bandwidth should be equivalent to at least PCIe 3.0 x8.
238     * If there is no PCIe info, assume there is enough bandwidth.
239     */
240    bool enough_bandwidth = !ws->info.has_pcie_bandwidth_info || ws->info.pcie_bandwidth_mbps >= 8 * 0.985 * 1024;
241 
242    bool use_sam =
243       (enough_vram && enough_bandwidth && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
244       (ws->perftest & RADV_PERFTEST_SAM);
245    return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
246 }
247 
248 static VkResult
249 radv_amdgpu_cs_bo_create(struct radv_amdgpu_cs *cs, uint32_t ib_size)
250 {
251    struct radeon_winsys *ws = &cs->ws->base;
252 
253    /* Avoid memcpy from VRAM when a secondary cmdbuf can't always rely on IB2. */
254    const bool can_always_use_ib2 = cs->ws->info.gfx_level >= GFX8 && cs->hw_ip == AMD_IP_GFX;
255    const bool avoid_vram = cs->is_secondary && !can_always_use_ib2;
256    const enum radeon_bo_domain domain = avoid_vram ? RADEON_DOMAIN_GTT : radv_amdgpu_cs_domain(ws);
257    const enum radeon_bo_flag gtt_wc_flag = avoid_vram ? 0 : RADEON_FLAG_GTT_WC;
258    const enum radeon_bo_flag flags =
259       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
260 
261    return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
262                             &cs->ib_buffer);
263 }
264 
265 static VkResult
266 radv_amdgpu_cs_get_new_ib(struct radeon_cmdbuf *_cs, uint32_t ib_size)
267 {
268    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
269    VkResult result;
270 
271    result = radv_amdgpu_cs_bo_create(cs, ib_size);
272    if (result != VK_SUCCESS)
273       return result;
274 
275    cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
276    if (!cs->ib_mapped) {
277       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
278       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
279    }
280 
281    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
282    cs->base.buf = (uint32_t *)cs->ib_mapped;
283    cs->base.cdw = 0;
284    cs->base.reserved_dw = 0;
285    cs->base.max_dw = ib_size / 4 - 4;
286    cs->ib.size = 0;
287    cs->ib.ip_type = cs->hw_ip;
288 
289    if (cs->use_ib)
290       cs->ib_size_ptr = &cs->ib.size;
291 
292    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
293 
294    return VK_SUCCESS;
295 }
296 
297 static unsigned
298 radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
299 {
300    const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
301    assert(util_is_power_of_two_nonzero(ib_alignment));
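   /* Note: the initial IB is 20K dwords (80 KiB), rounded up to the HW IP's required IB alignment. */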
302    return align(20 * 1024 * 4, ib_alignment);
303 }
304 
305 static struct radeon_cmdbuf *
306 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, bool is_secondary)
307 {
308    struct radv_amdgpu_cs *cs;
309    uint32_t ib_size = radv_amdgpu_cs_get_initial_size(radv_amdgpu_winsys(ws), ip_type);
310 
311    cs = calloc(1, sizeof(struct radv_amdgpu_cs));
312    if (!cs)
313       return NULL;
314 
315    cs->is_secondary = is_secondary;
316    cs->ws = radv_amdgpu_winsys(ws);
317    radv_amdgpu_init_cs(cs, ip_type);
318 
319    cs->use_ib = ring_can_use_ib_bos(cs->ws, ip_type);
320 
321    VkResult result = radv_amdgpu_cs_get_new_ib(&cs->base, ib_size);
322    if (result != VK_SUCCESS) {
323       free(cs);
324       return NULL;
325    }
326 
327    return &cs->base;
328 }
329 
330 static uint32_t
331 get_nop_packet(struct radv_amdgpu_cs *cs)
332 {
333    switch (cs->hw_ip) {
334    case AMDGPU_HW_IP_GFX:
335    case AMDGPU_HW_IP_COMPUTE:
336       return cs->ws->info.gfx_ib_pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
337    case AMDGPU_HW_IP_DMA:
338       return cs->ws->info.gfx_level == GFX6 ? 0xF0000000 : SDMA_NOP_PAD;
339    case AMDGPU_HW_IP_UVD:
340    case AMDGPU_HW_IP_UVD_ENC:
341       return PKT2_NOP_PAD;
342    case AMDGPU_HW_IP_VCN_DEC:
343       return 0x81FF;
344    case AMDGPU_HW_IP_VCN_ENC:
345       return 0; /* NOPs are illegal in encode, so don't pad */
346    default:
347       unreachable("Unknown IP type");
348    }
349 }
350 
351 static void
352 radv_amdgpu_cs_add_ib_buffer(struct radv_amdgpu_cs *cs, struct radeon_winsys_bo *bo, uint32_t offset, uint32_t cdw,
353                              bool is_external)
354 {
355    if (cs->num_ib_buffers == cs->max_num_ib_buffers) {
356       unsigned max_num_ib_buffers = MAX2(1, cs->max_num_ib_buffers * 2);
357       struct radv_amdgpu_ib *ib_buffers = realloc(cs->ib_buffers, max_num_ib_buffers * sizeof(*ib_buffers));
358       if (!ib_buffers) {
359          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
360          return;
361       }
362       cs->max_num_ib_buffers = max_num_ib_buffers;
363       cs->ib_buffers = ib_buffers;
364    }
365 
366    cs->ib_buffers[cs->num_ib_buffers].bo = bo;
367    cs->ib_buffers[cs->num_ib_buffers].offset = offset;
368    cs->ib_buffers[cs->num_ib_buffers].is_external = is_external;
369    cs->ib_buffers[cs->num_ib_buffers++].cdw = cdw;
370 }
371 
372 static void
373 radv_amdgpu_restore_last_ib(struct radv_amdgpu_cs *cs)
374 {
375    struct radv_amdgpu_ib *ib = &cs->ib_buffers[--cs->num_ib_buffers];
376    assert(!ib->is_external);
377    cs->ib_buffer = ib->bo;
378 }
379 
380 static void
381 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
382 {
383    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
384 
385    if (cs->status != VK_SUCCESS) {
386       cs->base.cdw = 0;
387       return;
388    }
389 
390    const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
391 
392    cs->ws->base.cs_finalize(_cs);
393 
394    uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
395 
396    /* Clamp to the maximum size that fits in the chain's IB size field. */
397    ib_size = align(MIN2(ib_size, 0xfffff), ib_alignment);
398 
399    VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
400 
401    if (result != VK_SUCCESS) {
402       cs->base.cdw = 0;
403       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
404       radv_amdgpu_restore_last_ib(cs);
405    }
406 
407    cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
408    if (!cs->ib_mapped) {
409       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
410       cs->base.cdw = 0;
411 
412       /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
413       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
414       radv_amdgpu_restore_last_ib(cs);
415    }
416 
417    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
418 
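   /* The previous IB was just finalized with 4 trailing NOPs reserved for chaining; overwrite
    * them with an INDIRECT_BUFFER packet that jumps to the new IB, and point ib_size_ptr at
    * the new packet's size dword so it can be patched when this IB is finalized. */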
419    if (cs->use_ib) {
420       cs->base.buf[cs->base.cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
421       cs->base.buf[cs->base.cdw - 3] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
422       cs->base.buf[cs->base.cdw - 2] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
423       cs->base.buf[cs->base.cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
424 
425       cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
426    }
427 
428    cs->base.buf = (uint32_t *)cs->ib_mapped;
429    cs->base.cdw = 0;
430    cs->base.reserved_dw = 0;
431    cs->base.max_dw = ib_size / 4 - 4;
432 }
433 
434 static VkResult
435 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
436 {
437    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
438    enum amd_ip_type ip_type = cs->hw_ip;
439 
440    assert(cs->base.cdw <= cs->base.reserved_dw);
441 
442    uint32_t ib_pad_dw_mask = MAX2(3, cs->ws->info.ip[ip_type].ib_pad_dw_mask);
443    uint32_t nop_packet = get_nop_packet(cs);
444 
445    if (cs->use_ib) {
446       /* Together with the 4-dword reservation subtracted from max_dw, this ensures
447        * that there are always 4 NOPs at the end that can be overwritten for chaining.
448        */
449       while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
450          radeon_emit_unchecked(&cs->base, nop_packet);
451 
452       radeon_emit_unchecked(&cs->base, nop_packet);
453       radeon_emit_unchecked(&cs->base, nop_packet);
454       radeon_emit_unchecked(&cs->base, nop_packet);
455       radeon_emit_unchecked(&cs->base, nop_packet);
456 
457       *cs->ib_size_ptr |= cs->base.cdw;
458    } else {
459       /* Pad the CS with NOP packets. */
460       bool pad = true;
461 
462       /* Don't pad VCN encode/unified queues because they have no NOP packet. */
463       if (ip_type == AMDGPU_HW_IP_VCN_ENC)
464          pad = false;
465 
466       /* Don't pad zero-length UVD IBs because of a kernel limitation. */
467       if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
468          pad = false;
469 
470       if (pad) {
471          while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask))
472             radeon_emit_unchecked(&cs->base, nop_packet);
473       }
474    }
475 
476    /* Append the current (last) IB to the array of IB buffers. */
477    radv_amdgpu_cs_add_ib_buffer(cs, cs->ib_buffer, 0, cs->use_ib ? G_3F2_IB_SIZE(*cs->ib_size_ptr) : cs->base.cdw,
478                                 false);
479 
480    /* Prevent freeing this BO twice. */
481    cs->ib_buffer = NULL;
482 
483    cs->chained_to = NULL;
484 
485    assert(cs->base.cdw <= cs->base.max_dw + 4);
486 
487    return cs->status;
488 }
489 
490 static void
491 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
492 {
493    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
494    cs->base.cdw = 0;
495    cs->base.reserved_dw = 0;
496    cs->status = VK_SUCCESS;
497 
498    for (unsigned i = 0; i < cs->num_buffers; ++i) {
499       unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
500       cs->buffer_hash_table[hash] = -1;
501    }
502 
503    for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
504       unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
505       cs->virtual_buffer_hash_table[hash] = -1;
506    }
507 
508    cs->num_buffers = 0;
509    cs->num_virtual_buffers = 0;
510 
511    /* If the CS was finalized (ib_buffer is NULL), reuse the last IB. */
512    assert(cs->ib_buffer || cs->num_ib_buffers);
513    if (!cs->ib_buffer)
514       radv_amdgpu_restore_last_ib(cs);
515 
516    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
517 
518    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
519       if (cs->ib_buffers[i].is_external)
520          continue;
521 
522       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
523    }
524 
525    cs->num_ib_buffers = 0;
526    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
527 
528    cs->ib.size = 0;
529 
530    if (cs->use_ib)
531       cs->ib_size_ptr = &cs->ib.size;
532 }
533 
534 static void
535 radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
536 {
537    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
538 
539    if (!acs->chained_to)
540       return;
541 
542    assert(cs->cdw <= cs->max_dw + 4);
543 
544    acs->chained_to = NULL;
545    cs->buf[cs->cdw - 4] = PKT3_NOP_PAD;
546    cs->buf[cs->cdw - 3] = PKT3_NOP_PAD;
547    cs->buf[cs->cdw - 2] = PKT3_NOP_PAD;
548    cs->buf[cs->cdw - 1] = PKT3_NOP_PAD;
549 }
550 
551 static bool
552 radv_amdgpu_cs_chain(struct radeon_cmdbuf *cs, struct radeon_cmdbuf *next_cs, bool pre_ena)
553 {
554    /* Chains together two CS (command stream) objects by editing
555     * the end of the first CS to add a command that jumps to the
556     * second CS.
557     *
558     * After this, it is enough to submit the first CS to the GPU;
559     * the second CS does not need to be submitted separately because
560     * the first CS already chains into it.
561     */
562 
563    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
564    struct radv_amdgpu_cs *next_acs = radv_amdgpu_cs(next_cs);
565 
566    /* Only some HW IP types have packets that we can use for chaining. */
567    if (!acs->use_ib)
568       return false;
569 
570    assert(cs->cdw <= cs->max_dw + 4);
571 
572    acs->chained_to = next_acs;
573 
574    cs->buf[cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
575    cs->buf[cs->cdw - 3] = next_acs->ib.ib_mc_address;
576    cs->buf[cs->cdw - 2] = next_acs->ib.ib_mc_address >> 32;
577    cs->buf[cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(pre_ena) | next_acs->ib.size;
578 
579    return true;
580 }
581 
582 static int
583 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
584 {
585    unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
586    int index = cs->buffer_hash_table[hash];
587 
588    if (index == -1)
589       return -1;
590 
591    if (cs->handles[index].bo_handle == bo)
592       return index;
593 
594    for (unsigned i = 0; i < cs->num_buffers; ++i) {
595       if (cs->handles[i].bo_handle == bo) {
596          cs->buffer_hash_table[hash] = i;
597          return i;
598       }
599    }
600 
601    return -1;
602 }
603 
604 static void
605 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
606 {
607    unsigned hash;
608    int index = radv_amdgpu_cs_find_buffer(cs, bo);
609 
610    if (index != -1)
611       return;
612 
613    if (cs->num_buffers == cs->max_num_buffers) {
614       unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
615       struct drm_amdgpu_bo_list_entry *new_entries =
616          realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
617       if (new_entries) {
618          cs->max_num_buffers = new_count;
619          cs->handles = new_entries;
620       } else {
621          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
622          return;
623       }
624    }
625 
626    cs->handles[cs->num_buffers].bo_handle = bo;
627    cs->handles[cs->num_buffers].bo_priority = priority;
628 
629    hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
630    cs->buffer_hash_table[hash] = cs->num_buffers;
631 
632    ++cs->num_buffers;
633 }
634 
635 static void
636 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
637 {
638    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
639    unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
640 
641    if (!cs->virtual_buffer_hash_table) {
642       int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
643       if (!virtual_buffer_hash_table) {
644          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
645          return;
646       }
647       cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
648 
649       for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
650          cs->virtual_buffer_hash_table[i] = -1;
651    }
652 
653    if (cs->virtual_buffer_hash_table[hash] >= 0) {
654       int idx = cs->virtual_buffer_hash_table[hash];
655       if (cs->virtual_buffers[idx] == bo) {
656          return;
657       }
658       for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
659          if (cs->virtual_buffers[i] == bo) {
660             cs->virtual_buffer_hash_table[hash] = i;
661             return;
662          }
663       }
664    }
665 
666    if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
667       unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
668       struct radeon_winsys_bo **virtual_buffers =
669          realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
670       if (!virtual_buffers) {
671          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
672          return;
673       }
674       cs->max_num_virtual_buffers = max_num_virtual_buffers;
675       cs->virtual_buffers = virtual_buffers;
676    }
677 
678    cs->virtual_buffers[cs->num_virtual_buffers] = bo;
679 
680    cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
681    ++cs->num_virtual_buffers;
682 }
683 
684 static void
685 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
686 {
687    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
688    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
689 
690    if (cs->status != VK_SUCCESS)
691       return;
692 
693    if (bo->is_virtual) {
694       radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
695       return;
696    }
697 
698    radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
699 }
700 
701 static void
702 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, bool allow_ib2)
703 {
704    struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
705    struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
706    struct radv_amdgpu_winsys *ws = parent->ws;
707    const bool use_ib2 = parent->use_ib && allow_ib2 && parent->hw_ip == AMD_IP_GFX;
708 
709    if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
710       return;
711 
712    for (unsigned i = 0; i < child->num_buffers; ++i) {
713       radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle, child->handles[i].bo_priority);
714    }
715 
716    for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
717       radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
718    }
719 
720    if (use_ib2) {
721       if (parent->base.cdw + 4 > parent->base.max_dw)
722          radv_amdgpu_cs_grow(&parent->base, 4);
723 
724       parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + 4);
725 
726       /* Not setting the CHAIN bit will launch an IB2. */
727       radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
728       radeon_emit(&parent->base, child->ib.ib_mc_address);
729       radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
730       radeon_emit(&parent->base, child->ib.size);
731    } else {
732       assert(parent->use_ib == child->use_ib);
733 
734       /* Grow the current CS and copy the contents of the secondary CS. */
735       for (unsigned i = 0; i < child->num_ib_buffers; i++) {
736          struct radv_amdgpu_ib *ib = &child->ib_buffers[i];
737          uint32_t cdw = ib->cdw;
738          uint8_t *mapped;
739 
740          /* Do not copy the chaining packet at the end of the IB. */
741          if (child->use_ib)
742             cdw -= 4;
743 
744          assert(!ib->is_external);
745 
746          if (parent->base.cdw + cdw > parent->base.max_dw)
747             radv_amdgpu_cs_grow(&parent->base, cdw);
748 
749          parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + cdw);
750 
751          mapped = ws->base.buffer_map(ib->bo);
752          if (!mapped) {
753             parent->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
754             return;
755          }
756 
757          memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * cdw);
758          parent->base.cdw += cdw;
759       }
760    }
761 }
762 
763 static void
764 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, const uint64_t offset,
765                           const uint32_t cdw, const bool predicate)
766 {
767    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
768    const uint64_t va = bo->va + offset;
769 
770    if (cs->status != VK_SUCCESS)
771       return;
772 
773    if (cs->hw_ip == AMD_IP_GFX && cs->use_ib) {
774       radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
775       radeon_emit(&cs->base, va);
776       radeon_emit(&cs->base, va >> 32);
777       radeon_emit(&cs->base, cdw);
778    } else {
779       const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
780       VkResult result;
781 
782       /* Finalize the current CS without chaining to execute the external IB. */
783       radv_amdgpu_cs_finalize(_cs);
784 
785       radv_amdgpu_cs_add_ib_buffer(cs, bo, offset, cdw, true);
786 
787       /* Start a new CS which isn't chained to any previous CS. */
788       result = radv_amdgpu_cs_get_new_ib(_cs, ib_size);
789       if (result != VK_SUCCESS) {
790          cs->base.cdw = 0;
791          cs->status = result;
792       }
793    }
794 }
795 
796 static unsigned
797 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
798 {
799    unsigned num_bo = 0;
800 
801    for (struct radv_amdgpu_cs *cs = start_cs; cs; cs = cs->chained_to) {
802       num_bo += cs->num_buffers;
803       for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
804          num_bo += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
805    }
806 
807    return num_bo;
808 }
809 
810 static unsigned
811 radv_amdgpu_count_cs_array_bo(struct radeon_cmdbuf **cs_array, unsigned num_cs)
812 {
813    unsigned num_bo = 0;
814 
815    for (unsigned i = 0; i < num_cs; ++i) {
816       num_bo += radv_amdgpu_count_cs_bo(radv_amdgpu_cs(cs_array[i]));
817    }
818 
819    return num_bo;
820 }
821 
822 static unsigned
823 radv_amdgpu_add_cs_to_bo_list(struct radv_amdgpu_cs *cs, struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
824 {
825    if (!cs->num_buffers)
826       return num_handles;
827 
828    if (num_handles == 0 && !cs->num_virtual_buffers) {
829       memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
830       return cs->num_buffers;
831    }
832 
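   /* Otherwise, merge while de-duplicating: only append BO handles that are not already in the list. */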
833    int unique_bo_so_far = num_handles;
834    for (unsigned j = 0; j < cs->num_buffers; ++j) {
835       bool found = false;
836       for (unsigned k = 0; k < unique_bo_so_far; ++k) {
837          if (handles[k].bo_handle == cs->handles[j].bo_handle) {
838             found = true;
839             break;
840          }
841       }
842       if (!found) {
843          handles[num_handles] = cs->handles[j];
844          ++num_handles;
845       }
846    }
847    for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
848       struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
849       u_rwlock_rdlock(&virtual_bo->lock);
850       for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
851          struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
852          bool found = false;
853          for (unsigned m = 0; m < num_handles; ++m) {
854             if (handles[m].bo_handle == bo->bo_handle) {
855                found = true;
856                break;
857             }
858          }
859          if (!found) {
860             handles[num_handles].bo_handle = bo->bo_handle;
861             handles[num_handles].bo_priority = bo->priority;
862             ++num_handles;
863          }
864       }
865       u_rwlock_rdunlock(&virtual_bo->lock);
866    }
867 
868    return num_handles;
869 }
870 
871 static unsigned
872 radv_amdgpu_add_cs_array_to_bo_list(struct radeon_cmdbuf **cs_array, unsigned num_cs,
873                                     struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
874 {
875    for (unsigned i = 0; i < num_cs; ++i) {
876       for (struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); cs; cs = cs->chained_to) {
877          num_handles = radv_amdgpu_add_cs_to_bo_list(cs, handles, num_handles);
878       }
879    }
880 
881    return num_handles;
882 }
883 
884 static unsigned
885 radv_amdgpu_copy_global_bo_list(struct radv_amdgpu_winsys *ws, struct drm_amdgpu_bo_list_entry *handles)
886 {
887    for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
888       handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
889       handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
890    }
891 
892    return ws->global_bo_list.count;
893 }
894 
895 static VkResult
896 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array, unsigned count,
897                         struct radeon_cmdbuf **initial_preamble_array, unsigned num_initial_preambles,
898                         struct radeon_cmdbuf **continue_preamble_array, unsigned num_continue_preambles,
899                         struct radeon_cmdbuf **postamble_array, unsigned num_postambles, unsigned *rnum_handles,
900                         struct drm_amdgpu_bo_list_entry **rhandles)
901 {
902    struct drm_amdgpu_bo_list_entry *handles = NULL;
903    unsigned num_handles = 0;
904 
905    if (ws->debug_all_bos) {
906       handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
907       if (!handles)
908          return VK_ERROR_OUT_OF_HOST_MEMORY;
909 
910       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
911    } else if (count == 1 && !num_initial_preambles && !num_continue_preambles && !num_postambles &&
912               !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !radv_amdgpu_cs(cs_array[0])->chained_to &&
913               !ws->global_bo_list.count) {
914       struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
915       if (cs->num_buffers == 0)
916          return VK_SUCCESS;
917 
918       handles = malloc(sizeof(handles[0]) * cs->num_buffers);
919       if (!handles)
920          return VK_ERROR_OUT_OF_HOST_MEMORY;
921 
922       memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
923       num_handles = cs->num_buffers;
924    } else {
925       unsigned total_buffer_count = ws->global_bo_list.count;
926       total_buffer_count += radv_amdgpu_count_cs_array_bo(cs_array, count);
927       total_buffer_count += radv_amdgpu_count_cs_array_bo(initial_preamble_array, num_initial_preambles);
928       total_buffer_count += radv_amdgpu_count_cs_array_bo(continue_preamble_array, num_continue_preambles);
929       total_buffer_count += radv_amdgpu_count_cs_array_bo(postamble_array, num_postambles);
930 
931       if (total_buffer_count == 0)
932          return VK_SUCCESS;
933 
934       handles = malloc(sizeof(handles[0]) * total_buffer_count);
935       if (!handles)
936          return VK_ERROR_OUT_OF_HOST_MEMORY;
937 
938       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
939       num_handles = radv_amdgpu_add_cs_array_to_bo_list(cs_array, count, handles, num_handles);
940       num_handles =
941          radv_amdgpu_add_cs_array_to_bo_list(initial_preamble_array, num_initial_preambles, handles, num_handles);
942       num_handles =
943          radv_amdgpu_add_cs_array_to_bo_list(continue_preamble_array, num_continue_preambles, handles, num_handles);
944       num_handles = radv_amdgpu_add_cs_array_to_bo_list(postamble_array, num_postambles, handles, num_handles);
945    }
946 
947    *rhandles = handles;
948    *rnum_handles = num_handles;
949 
950    return VK_SUCCESS;
951 }
952 
953 static void
954 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
955 {
956    radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring], request);
957 }
958 
959 static bool
960 radv_amdgpu_cs_has_external_ib(const struct radv_amdgpu_cs *cs)
961 {
962    for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
963       if (cs->ib_buffers[i].is_external)
964          return true;
965    }
966 
967    return false;
968 }
969 
970 static unsigned
971 radv_amdgpu_get_num_ibs_per_cs(const struct radv_amdgpu_cs *cs)
972 {
973    unsigned num_ibs = 0;
974 
975    if (cs->use_ib) {
976       unsigned num_external_ibs = 0;
977 
978       for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
979          if (cs->ib_buffers[i].is_external)
980             num_external_ibs++;
981       }
982 
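      /* Each external IB is submitted separately and is followed by a fresh internal IB
       * (see radv_amdgpu_cs_execute_ib), plus the main IB: 2 * N + 1 entries in total. */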
983       num_ibs = num_external_ibs * 2 + 1;
984    } else {
985       num_ibs = cs->num_ib_buffers;
986    }
987 
988    return num_ibs;
989 }
990 
991 static unsigned
992 radv_amdgpu_count_ibs(struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned initial_preamble_count,
993                       unsigned continue_preamble_count, unsigned postamble_count)
994 {
995    unsigned num_ibs = 0;
996 
997    for (unsigned i = 0; i < cs_count; i++) {
998       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
999 
1000       num_ibs += radv_amdgpu_get_num_ibs_per_cs(cs);
1001    }
1002 
1003    return MAX2(initial_preamble_count, continue_preamble_count) + num_ibs + postamble_count;
1004 }
1005 
1006 static VkResult
1007 radv_amdgpu_winsys_cs_submit_internal(struct radv_amdgpu_ctx *ctx, int queue_idx, struct radv_winsys_sem_info *sem_info,
1008                                       struct radeon_cmdbuf **cs_array, unsigned cs_count,
1009                                       struct radeon_cmdbuf **initial_preamble_cs, unsigned initial_preamble_count,
1010                                       struct radeon_cmdbuf **continue_preamble_cs, unsigned continue_preamble_count,
1011                                       struct radeon_cmdbuf **postamble_cs, unsigned postamble_count,
1012                                       bool uses_shadow_regs)
1013 {
1014    VkResult result;
1015 
1016    /* The last CS is the "gang leader"; its IP type determines which fence to signal. */
1017    struct radv_amdgpu_cs *last_cs = radv_amdgpu_cs(cs_array[cs_count - 1]);
1018    struct radv_amdgpu_winsys *ws = last_cs->ws;
1019 
1020    const unsigned num_ibs =
1021       radv_amdgpu_count_ibs(cs_array, cs_count, initial_preamble_count, continue_preamble_count, postamble_count);
1022    const unsigned ib_array_size = MIN2(RADV_MAX_IBS_PER_SUBMIT, num_ibs);
1023 
1024    STACK_ARRAY(struct radv_amdgpu_cs_ib_info, ibs, ib_array_size);
1025 
1026    struct drm_amdgpu_bo_list_entry *handles = NULL;
1027    unsigned num_handles = 0;
1028 
1029    u_rwlock_rdlock(&ws->global_bo_list.lock);
1030 
1031    result = radv_amdgpu_get_bo_list(ws, &cs_array[0], cs_count, initial_preamble_cs, initial_preamble_count,
1032                                     continue_preamble_cs, continue_preamble_count, postamble_cs, postamble_count,
1033                                     &num_handles, &handles);
1034    if (result != VK_SUCCESS)
1035       goto fail;
1036 
1037    /* Configure the CS request. */
1038    const uint32_t *max_ib_per_ip = ws->info.max_submitted_ibs;
1039    struct radv_amdgpu_cs_request request = {
1040       .ip_type = last_cs->hw_ip,
1041       .ip_instance = 0,
1042       .ring = queue_idx,
1043       .handles = handles,
1044       .num_handles = num_handles,
1045       .ibs = ibs,
1046       .number_of_ibs = 0, /* set below */
1047    };
1048 
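   /* Split the work into as many submissions as needed: each one carries its preambles,
    * as many CS IBs as fit under RADV_MAX_IBS_PER_SUBMIT and the per-IP limits, and the
    * postambles. */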
1049    for (unsigned cs_idx = 0, cs_ib_idx = 0; cs_idx < cs_count;) {
1050       struct radeon_cmdbuf **preambles = cs_idx ? continue_preamble_cs : initial_preamble_cs;
1051       const unsigned preamble_count = cs_idx ? continue_preamble_count : initial_preamble_count;
1052       const unsigned ib_per_submit = RADV_MAX_IBS_PER_SUBMIT - preamble_count - postamble_count;
1053       unsigned num_submitted_ibs = 0;
1054       unsigned ibs_per_ip[AMD_NUM_IP_TYPES] = {0};
1055 
1056       /* Copy preambles to the submission. */
1057       for (unsigned i = 0; i < preamble_count; ++i) {
1058          /* Assume that the full preamble fits into 1 IB. */
1059          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(preambles[i]);
1060          struct radv_amdgpu_cs_ib_info ib;
1061 
1062          assert(cs->num_ib_buffers == 1);
1063          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1064 
1065          ibs[num_submitted_ibs++] = ib;
1066          ibs_per_ip[cs->hw_ip]++;
1067       }
1068 
1069       for (unsigned i = 0; i < ib_per_submit && cs_idx < cs_count; ++i) {
1070          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[cs_idx]);
1071          struct radv_amdgpu_cs_ib_info ib;
1072 
1073          if (cs_ib_idx == 0) {
1074             /* Make sure the whole CS fits into the same submission. */
1075             unsigned cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(cs);
1076             if (i + cs_num_ib > ib_per_submit || ibs_per_ip[cs->hw_ip] + cs_num_ib > max_ib_per_ip[cs->hw_ip])
1077                break;
1078 
1079             if (cs->hw_ip != request.ip_type) {
1080                /* Found a "follower" CS in a gang submission.
1081                 * Make sure to submit this together with its "leader", the next CS.
1082                 * We rely on the caller to order each "follower" before its "leader."
1083                 */
1084                assert(cs_idx != cs_count - 1);
1085                struct radv_amdgpu_cs *next_cs = radv_amdgpu_cs(cs_array[cs_idx + 1]);
1086                assert(next_cs->hw_ip == request.ip_type);
1087                unsigned next_cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(next_cs);
1088                if (i + cs_num_ib + next_cs_num_ib > ib_per_submit ||
1089                    ibs_per_ip[next_cs->hw_ip] + next_cs_num_ib > max_ib_per_ip[next_cs->hw_ip])
1090                   break;
1091             }
1092          }
1093 
1094          /* When IBs are used, we only need to submit the main IB of this CS, because everything
1095           * else is chained to the first IB. Except when the CS has external IBs because they need
1096           * to be submitted separately. Otherwise we must submit all IBs in the ib_buffers array.
1097           */
1098          if (cs->use_ib) {
1099             if (radv_amdgpu_cs_has_external_ib(cs)) {
1100                const unsigned cur_ib_idx = cs_ib_idx;
1101 
1102                ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1103 
1104                /* Loop until the next external IB is found. */
1105                while (cs_ib_idx < cs->num_ib_buffers && !cs->ib_buffers[cur_ib_idx].is_external &&
1106                       !cs->ib_buffers[cs_ib_idx].is_external) {
1107                   cs_ib_idx++;
1108                }
1109 
1110                if (cs_ib_idx == cs->num_ib_buffers) {
1111                   cs_idx++;
1112                   cs_ib_idx = 0;
1113                }
1114             } else {
1115                ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1116                cs_idx++;
1117             }
1118          } else {
1119             assert(cs_ib_idx < cs->num_ib_buffers);
1120             ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1121 
1122             if (cs_ib_idx == cs->num_ib_buffers) {
1123                cs_idx++;
1124                cs_ib_idx = 0;
1125             }
1126          }
1127 
1128          if (uses_shadow_regs && ib.ip_type == AMDGPU_HW_IP_GFX)
1129             ib.flags |= AMDGPU_IB_FLAG_PREEMPT;
1130 
1131          assert(num_submitted_ibs < ib_array_size);
1132          ibs[num_submitted_ibs++] = ib;
1133          ibs_per_ip[cs->hw_ip]++;
1134       }
1135 
1136       assert(num_submitted_ibs > preamble_count);
1137 
1138       /* Copy postambles to the submission. */
1139       for (unsigned i = 0; i < postamble_count; ++i) {
1140          /* Assume that the full postamble fits into 1 IB. */
1141          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(postamble_cs[i]);
1142          struct radv_amdgpu_cs_ib_info ib;
1143 
1144          assert(cs->num_ib_buffers == 1);
1145          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1146 
1147          ibs[num_submitted_ibs++] = ib;
1148          ibs_per_ip[cs->hw_ip]++;
1149       }
1150 
1151       /* Submit the CS. */
1152       request.number_of_ibs = num_submitted_ibs;
1153       result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1154       if (result != VK_SUCCESS)
1155          goto fail;
1156    }
1157 
1158    free(request.handles);
1159 
1160    if (result != VK_SUCCESS)
1161       goto fail;
1162 
1163    radv_assign_last_submit(ctx, &request);
1164 
1165 fail:
1166    u_rwlock_rdunlock(&ws->global_bo_list.lock);
1167    STACK_ARRAY_FINISH(ibs);
1168    return result;
1169 }
1170 
1171 static VkResult
1172 radv_amdgpu_cs_submit_zero(struct radv_amdgpu_ctx *ctx, enum amd_ip_type ip_type, int queue_idx,
1173                            struct radv_winsys_sem_info *sem_info)
1174 {
1175    unsigned hw_ip = ip_type;
1176    unsigned queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, hw_ip, queue_idx);
1177    int ret;
1178 
1179    if (!queue_syncobj)
1180       return VK_ERROR_OUT_OF_HOST_MEMORY;
1181 
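   /* Nothing to execute: merge all waits into the queue syncobj via sync_file, then
    * forward that syncobj to every signal operation below. */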
1182    if (sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) {
1183       int fd;
1184       ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, queue_syncobj, &fd);
1185       if (ret < 0)
1186          return VK_ERROR_DEVICE_LOST;
1187 
1188       for (unsigned i = 0; i < sem_info->wait.syncobj_count; ++i) {
1189          int fd2;
1190          ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, sem_info->wait.syncobj[i], &fd2);
1191          if (ret < 0) {
1192             close(fd);
1193             return VK_ERROR_DEVICE_LOST;
1194          }
1195 
1196          sync_accumulate("radv", &fd, fd2);
1197          close(fd2);
1198       }
1199       for (unsigned i = 0; i < sem_info->wait.timeline_syncobj_count; ++i) {
1200          int fd2;
1201          ret = amdgpu_cs_syncobj_export_sync_file2(
1202             ctx->ws->dev, sem_info->wait.syncobj[i + sem_info->wait.syncobj_count], sem_info->wait.points[i], 0, &fd2);
1203          if (ret < 0) {
1204             /* This works around a kernel bug where the fence isn't copied if it is already
1205              * signalled. Since it is already signalled it is totally fine to not wait on it.
1206              *
1207              * kernel patch: https://patchwork.freedesktop.org/patch/465583/ */
1208             uint64_t point;
1209             ret = amdgpu_cs_syncobj_query2(ctx->ws->dev, &sem_info->wait.syncobj[i + sem_info->wait.syncobj_count],
1210                                            &point, 1, 0);
1211             if (!ret && point >= sem_info->wait.points[i])
1212                continue;
1213 
1214             close(fd);
1215             return VK_ERROR_DEVICE_LOST;
1216          }
1217 
1218          sync_accumulate("radv", &fd, fd2);
1219          close(fd2);
1220       }
1221       ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, queue_syncobj, fd);
1222       close(fd);
1223       if (ret < 0)
1224          return VK_ERROR_DEVICE_LOST;
1225 
1226       ctx->queue_syncobj_wait[hw_ip][queue_idx] = true;
1227    }
1228 
1229    for (unsigned i = 0; i < sem_info->signal.syncobj_count; ++i) {
1230       uint32_t dst_handle = sem_info->signal.syncobj[i];
1231       uint32_t src_handle = queue_syncobj;
1232 
1233       if (ctx->ws->info.has_timeline_syncobj) {
1234          ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, dst_handle, 0, src_handle, 0, 0);
1235          if (ret < 0)
1236             return VK_ERROR_DEVICE_LOST;
1237       } else {
1238          int fd;
1239          ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, src_handle, &fd);
1240          if (ret < 0)
1241             return VK_ERROR_DEVICE_LOST;
1242 
1243          ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, dst_handle, fd);
1244          close(fd);
1245          if (ret < 0)
1246             return VK_ERROR_DEVICE_LOST;
1247       }
1248    }
1249    for (unsigned i = 0; i < sem_info->signal.timeline_syncobj_count; ++i) {
1250       ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, sem_info->signal.syncobj[i + sem_info->signal.syncobj_count],
1251                                        sem_info->signal.points[i], queue_syncobj, 0, 0);
1252       if (ret < 0)
1253          return VK_ERROR_DEVICE_LOST;
1254    }
1255    return VK_SUCCESS;
1256 }
1257 
1258 static VkResult
1259 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, const struct radv_winsys_submit_info *submit,
1260                              uint32_t wait_count, const struct vk_sync_wait *waits, uint32_t signal_count,
1261                              const struct vk_sync_signal *signals)
1262 {
1263    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1264    struct radv_amdgpu_winsys *ws = ctx->ws;
1265    VkResult result;
1266    unsigned wait_idx = 0, signal_idx = 0;
1267 
1268    STACK_ARRAY(uint64_t, wait_points, wait_count);
1269    STACK_ARRAY(uint32_t, wait_syncobj, wait_count);
1270    STACK_ARRAY(uint64_t, signal_points, signal_count);
1271    STACK_ARRAY(uint32_t, signal_syncobj, signal_count);
1272 
1273    if (!wait_points || !wait_syncobj || !signal_points || !signal_syncobj) {
1274       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1275       goto out;
1276    }
1277 
1278    for (uint32_t i = 0; i < wait_count; ++i) {
1279       if (waits[i].sync->type == &vk_sync_dummy_type)
1280          continue;
1281 
1282       assert(waits[i].sync->type == &ws->syncobj_sync_type);
1283       wait_syncobj[wait_idx] = ((struct vk_drm_syncobj *)waits[i].sync)->syncobj;
1284       wait_points[wait_idx] = waits[i].wait_value;
1285       ++wait_idx;
1286    }
1287 
1288    for (uint32_t i = 0; i < signal_count; ++i) {
1289       if (signals[i].sync->type == &vk_sync_dummy_type)
1290          continue;
1291 
1292       assert(signals[i].sync->type == &ws->syncobj_sync_type);
1293       signal_syncobj[signal_idx] = ((struct vk_drm_syncobj *)signals[i].sync)->syncobj;
1294       signal_points[signal_idx] = signals[i].signal_value;
1295       ++signal_idx;
1296    }
1297 
1298    assert(signal_idx <= signal_count);
1299    assert(wait_idx <= wait_count);
1300 
1301    const uint32_t wait_timeline_syncobj_count =
1302       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? wait_idx : 0;
1303    const uint32_t signal_timeline_syncobj_count =
1304       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? signal_idx : 0;
1305 
1306    struct radv_winsys_sem_info sem_info = {
1307       .wait =
1308          {
1309             .points = wait_points,
1310             .syncobj = wait_syncobj,
1311             .timeline_syncobj_count = wait_timeline_syncobj_count,
1312             .syncobj_count = wait_idx - wait_timeline_syncobj_count,
1313          },
1314       .signal =
1315          {
1316             .points = signal_points,
1317             .syncobj = signal_syncobj,
1318             .timeline_syncobj_count = signal_timeline_syncobj_count,
1319             .syncobj_count = signal_idx - signal_timeline_syncobj_count,
1320          },
1321       .cs_emit_wait = true,
1322       .cs_emit_signal = true,
1323    };
1324 
1325    if (!submit->cs_count) {
1326       result = radv_amdgpu_cs_submit_zero(ctx, submit->ip_type, submit->queue_index, &sem_info);
1327    } else {
1328       result = radv_amdgpu_winsys_cs_submit_internal(
1329          ctx, submit->queue_index, &sem_info, submit->cs_array, submit->cs_count, submit->initial_preamble_cs,
1330          submit->initial_preamble_count, submit->continue_preamble_cs, submit->continue_preamble_count,
1331          submit->postamble_cs, submit->postamble_count, submit->uses_shadow_regs);
1332    }
1333 
1334 out:
1335    STACK_ARRAY_FINISH(wait_points);
1336    STACK_ARRAY_FINISH(wait_syncobj);
1337    STACK_ARRAY_FINISH(signal_points);
1338    STACK_ARRAY_FINISH(signal_syncobj);
1339    return result;
1340 }
1341 
1342 static void
1343 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr, struct ac_addr_info *info)
1344 {
1345    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1346    void *ret = NULL;
1347 
1348    memset(info, 0, sizeof(struct ac_addr_info));
1349 
1350    if (cs->ws->debug_log_bos) {
1351       u_rwlock_rdlock(&cs->ws->log_bo_list_lock);
1352       list_for_each_entry_rev (struct radv_amdgpu_winsys_bo_log, bo_log, &cs->ws->log_bo_list, list) {
1353          if (addr >= bo_log->va && addr - bo_log->va < bo_log->size) {
1354             info->use_after_free = bo_log->destroyed;
1355             break;
1356          }
1357       }
1358       u_rwlock_rdunlock(&cs->ws->log_bo_list_lock);
1359    }
1360 
1361    if (info->use_after_free)
1362       return;
1363 
1364    info->valid = !cs->ws->debug_all_bos;
1365 
1366    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
1367       struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1368       struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)ib->bo;
1369 
1370       if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1371          if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0) {
1372             info->cpu_addr = (char *)ret + (addr - bo->base.va);
1373             info->valid = true;
1374             return;
1375          }
1376       }
1377    }
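   /* The address was not found in this CS's IBs; fall back to scanning the global BO list. */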
1378    u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1379    for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1380       struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1381       if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1382          if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0) {
1383             u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1384             info->valid = true;
1385             info->cpu_addr = (char *)ret + (addr - bo->base.va);
1386             return;
1387          }
1388       }
1389    }
1390    u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1391 
1392    return;
1393 }
1394 
1395 static void
1396 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids, int trace_id_count,
1397                            enum radv_cs_dump_type type)
1398 {
1399    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1400    struct radv_amdgpu_winsys *ws = cs->ws;
1401 
1402    if (cs->use_ib) {
1403       struct radv_amdgpu_cs_ib_info ib_info = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1404 
1405       struct ac_addr_info addr_info;
1406       radv_amdgpu_winsys_get_cpu_addr(cs, ib_info.ib_mc_address, &addr_info);
1407       assert(addr_info.cpu_addr);
1408 
1409       if (type == RADV_CS_DUMP_TYPE_IBS) {
1410          ac_parse_ib(file, addr_info.cpu_addr, cs->ib_buffers[0].cdw, trace_ids, trace_id_count, "main IB",
1411                      ws->info.gfx_level, ws->info.family, cs->hw_ip, radv_amdgpu_winsys_get_cpu_addr, cs);
1412       } else {
1413          uint32_t *ib_dw = addr_info.cpu_addr;
1414          ac_gather_context_rolls(file, &ib_dw, &cs->ib_buffers[0].cdw, 1, &ws->info);
1415       }
1416    } else {
1417       uint32_t **ibs = type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t *)) : NULL;
1418       uint32_t *ib_dw_sizes =
1419          type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t)) : NULL;
1420 
1421       for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1422          struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1423          char name[64];
1424          void *mapped;
1425 
1426          mapped = ws->base.buffer_map(ib->bo);
1427          if (!mapped)
1428             continue;
1429 
1430          if (cs->num_ib_buffers > 1) {
1431             snprintf(name, sizeof(name), "main IB (chunk %d)", i);
1432          } else {
1433             snprintf(name, sizeof(name), "main IB");
1434          }
1435 
1436          if (type == RADV_CS_DUMP_TYPE_IBS) {
1437             ac_parse_ib(file, mapped, ib->cdw, trace_ids, trace_id_count, name, ws->info.gfx_level, ws->info.family,
1438                         cs->hw_ip, NULL, NULL);
1439          } else {
1440             ibs[i] = mapped;
1441             ib_dw_sizes[i] = ib->cdw;
1442          }
1443       }
1444 
1445       if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS) {
1446          ac_gather_context_rolls(file, ibs, ib_dw_sizes, cs->num_ib_buffers, &ws->info);
1447 
1448          free(ibs);
1449          free(ib_dw_sizes);
1450       }
1451    }
1452 }
1453 
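/* Translate a radeon_ctx_priority into the AMDGPU context priority passed to
 * amdgpu_cs_ctx_create2().
 */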
static uint32_t
radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
{
   switch (radv_priority) {
   case RADEON_CTX_PRIORITY_REALTIME:
      return AMDGPU_CTX_PRIORITY_VERY_HIGH;
   case RADEON_CTX_PRIORITY_HIGH:
      return AMDGPU_CTX_PRIORITY_HIGH;
   case RADEON_CTX_PRIORITY_MEDIUM:
      return AMDGPU_CTX_PRIORITY_NORMAL;
   case RADEON_CTX_PRIORITY_LOW:
      return AMDGPU_CTX_PRIORITY_LOW;
   default:
      unreachable("Invalid context priority");
   }
}

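/* Create a winsys context: an amdgpu context with the requested priority plus
 * a small GTT buffer used to store the per-queue user fences.
 */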
static VkResult
radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority, struct radeon_winsys_ctx **rctx)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
   uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
   VkResult result;
   int r;

   if (!ctx)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx);
   if (r && r == -EACCES) {
      result = VK_ERROR_NOT_PERMITTED_KHR;
      goto fail_create;
   } else if (r) {
      fprintf(stderr, "radv/amdgpu: radv_amdgpu_cs_ctx_create2 failed. (%i)\n", r);
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_create;
   }
   ctx->ws = ws;

   assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * 4 * sizeof(uint64_t) <= 4096);
   result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
                                   RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_CS, 0,
                                   &ctx->fence_bo);
   if (result != VK_SUCCESS) {
      goto fail_alloc;
   }

   *rctx = (struct radeon_winsys_ctx *)ctx;
   return VK_SUCCESS;

fail_alloc:
   amdgpu_cs_ctx_free(ctx->ctx);
fail_create:
   FREE(ctx);
   return result;
}

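/* Destroy a winsys context: release the per-queue syncobjs, the user fence
 * buffer and the amdgpu context itself.
 */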
static void
radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;

   for (unsigned ip = 0; ip <= AMDGPU_HW_IP_NUM; ++ip) {
      for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
         if (ctx->queue_syncobj[ip][ring])
            amdgpu_cs_destroy_syncobj(ctx->ws->dev, ctx->queue_syncobj[ip][ring]);
      }
   }

   ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
   amdgpu_cs_ctx_free(ctx->ctx);
   FREE(ctx);
}

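/* Return the binary syncobj tracking the given (ip, ring) queue, creating it
 * in the signaled state on first use.
 */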
static uint32_t
radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
{
   uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
   if (!*syncobj) {
      amdgpu_cs_create_syncobj2(ctx->ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
   }
   return *syncobj;
}

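/* Check whether the last submission on (ip_type, ring_index) has completed,
 * waiting up to 1 second for its fence.
 */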
static bool
radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, int ring_index)
{
   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;

   if (ctx->last_submission[ip_type][ring_index].fence.fence) {
      uint32_t expired;
      int ret =
         amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence, 1000000000ull, 0, &expired);

      if (ret || !expired)
         return false;
   }

   return true;
}

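/* Translate a radeon_ctx_pstate into the matching AMDGPU stable pstate. */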
static uint32_t
radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
{
   switch (radv_pstate) {
   case RADEON_CTX_PSTATE_NONE:
      return AMDGPU_CTX_STABLE_PSTATE_NONE;
   case RADEON_CTX_PSTATE_STANDARD:
      return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
   case RADEON_CTX_PSTATE_MIN_SCLK:
      return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
   case RADEON_CTX_PSTATE_MIN_MCLK:
      return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
   case RADEON_CTX_PSTATE_PEAK:
      return AMDGPU_CTX_STABLE_PSTATE_PEAK;
   default:
      unreachable("Invalid pstate");
   }
}

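/* Select a stable power state for the context. The current pstate is queried
 * first so that redundant SET operations are skipped (see the comment below
 * about -EBUSY).
 */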
static int
radv_amdgpu_ctx_set_pstate(struct radeon_winsys_ctx *rwctx, enum radeon_ctx_pstate pstate)
{
   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
   uint32_t new_pstate = radv_to_amdgpu_pstate(pstate);
   uint32_t current_pstate = 0;
   int r;

   r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_GET_STABLE_PSTATE, 0, &current_pstate);
   if (r) {
      fprintf(stderr, "radv/amdgpu: failed to get current pstate\n");
      return r;
   }

   /* Do not try to set a new pstate when the current one is already what we want. Otherwise, the
    * kernel might return -EBUSY if we have multiple AMDGPU contexts in flight.
    */
   if (current_pstate == new_pstate)
      return 0;

   r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_SET_STABLE_PSTATE, new_pstate, NULL);
   if (r) {
      fprintf(stderr, "radv/amdgpu: failed to set new pstate\n");
      return r;
   }

   return 0;
}

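/* Build a binary syncobj chunk (AMDGPU_CHUNK_ID_SYNCOBJ_IN/OUT) from the
 * given syncobj handles, optionally appending the per-queue syncobj. The
 * returned allocation must be freed by the caller once the submission is
 * done.
 */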
static void *
radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
                                   struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
{
   unsigned count = counts->syncobj_count + (queue_syncobj ? 1 : 0);
   struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
   if (!syncobj)
      return NULL;

   for (unsigned i = 0; i < counts->syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
      sem->handle = counts->syncobj[i];
   }

   if (queue_syncobj)
      syncobj[counts->syncobj_count].handle = queue_syncobj;

   chunk->chunk_id = chunk_id;
   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
   chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
   return syncobj;
}

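/* Same as above, but using drm_amdgpu_cs_chunk_syncobj entries for kernels
 * with timeline syncobj support: binary syncobjs get point 0, timeline
 * syncobjs carry their point and DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT.
 */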
static void *
radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
                                            struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
{
   uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + (queue_syncobj ? 1 : 0);
   struct drm_amdgpu_cs_chunk_syncobj *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
   if (!syncobj)
      return NULL;

   for (unsigned i = 0; i < counts->syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
      sem->handle = counts->syncobj[i];
      sem->flags = 0;
      sem->point = 0;
   }

   for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
      sem->handle = counts->syncobj[i + counts->syncobj_count];
      sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
      sem->point = counts->points[i];
   }

   if (queue_syncobj) {
      syncobj[count - 1].handle = queue_syncobj;
      syncobj[count - 1].flags = 0;
      syncobj[count - 1].point = 0;
   }

   chunk->chunk_id = chunk_id;
   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
   chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
   return syncobj;
}

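/* Whether the submission needs a user fence chunk; the multimedia IPs
 * (UVD/VCE/VCN/JPEG) do not use one.
 */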
static bool
radv_amdgpu_cs_has_user_fence(struct radv_amdgpu_cs_request *request)
{
   return request->ip_type != AMDGPU_HW_IP_UVD && request->ip_type != AMDGPU_HW_IP_VCE &&
          request->ip_type != AMDGPU_HW_IP_UVD_ENC && request->ip_type != AMDGPU_HW_IP_VCN_DEC &&
          request->ip_type != AMDGPU_HW_IP_VCN_ENC && request->ip_type != AMDGPU_HW_IP_VCN_JPEG;
}

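/* Build the chunk array (IBs, optional user fence, wait/signal syncobjs, BO
 * list) and submit it with amdgpu_cs_submit_raw2(), translating kernel errors
 * into VkResults.
 */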
static VkResult
radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
                      struct radv_winsys_sem_info *sem_info)
{
   int r;
   int num_chunks;
   int size;
   struct drm_amdgpu_cs_chunk *chunks;
   struct drm_amdgpu_cs_chunk_data *chunk_data;
   struct drm_amdgpu_bo_list_in bo_list_in;
   void *wait_syncobj = NULL, *signal_syncobj = NULL;
   int i;
   VkResult result = VK_SUCCESS;
   bool has_user_fence = radv_amdgpu_cs_has_user_fence(request);
   uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
   bool *queue_syncobj_wait = &ctx->queue_syncobj_wait[request->ip_type][request->ring];

   if (!queue_syncobj)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   size = request->number_of_ibs + 1 + (has_user_fence ? 1 : 0) + 1 /* bo list */ + 3;

   chunks = malloc(sizeof(chunks[0]) * size);
   if (!chunks)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   size = request->number_of_ibs + (has_user_fence ? 1 : 0);

   chunk_data = malloc(sizeof(chunk_data[0]) * size);
   if (!chunk_data) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto error_out;
   }

   num_chunks = request->number_of_ibs;
   for (i = 0; i < request->number_of_ibs; i++) {
      struct radv_amdgpu_cs_ib_info *ib;
      chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
      chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
      chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];

      ib = &request->ibs[i];
      assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
      assert(ib->size);

      chunk_data[i].ib_data._pad = 0;
      chunk_data[i].ib_data.va_start = ib->ib_mc_address;
      chunk_data[i].ib_data.ib_bytes = ib->size * 4;
      chunk_data[i].ib_data.ip_type = ib->ip_type;
      chunk_data[i].ib_data.ip_instance = request->ip_instance;
      chunk_data[i].ib_data.ring = request->ring;
      chunk_data[i].ib_data.flags = ib->flags;
   }

   assert(chunk_data[request->number_of_ibs - 1].ib_data.ip_type == request->ip_type);

   if (has_user_fence) {
      i = num_chunks++;
      chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
      chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
      chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];

      struct amdgpu_cs_fence_info fence_info;
      fence_info.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
      /* Need to reserve 4 QWORDs for the user fence:
       *   QWORD[0]: completed fence
       *   QWORD[1]: preempted fence
       *   QWORD[2]: reset fence
       *   QWORD[3]: preempted then reset
       */
      fence_info.offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * 4;
      amdgpu_cs_chunk_fence_info_to_data(&fence_info, &chunk_data[i]);
   }

   if (sem_info->cs_emit_wait &&
       (sem_info->wait.timeline_syncobj_count || sem_info->wait.syncobj_count || *queue_syncobj_wait)) {

      if (ctx->ws->info.has_timeline_syncobj) {
         wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
                                                                    AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
      } else {
         wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
                                                           AMDGPU_CHUNK_ID_SYNCOBJ_IN);
      }
      if (!wait_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_out;
      }
      num_chunks++;

      sem_info->cs_emit_wait = false;
      *queue_syncobj_wait = false;
   }

   if (sem_info->cs_emit_signal) {
      if (ctx->ws->info.has_timeline_syncobj) {
         signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
            &sem_info->signal, queue_syncobj, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
      } else {
         signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, queue_syncobj, &chunks[num_chunks],
                                                             AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
      }
      if (!signal_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_out;
      }
      num_chunks++;
   }

   bo_list_in.operation = ~0;
   bo_list_in.list_handle = ~0;
   bo_list_in.bo_number = request->num_handles;
   bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
   bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;

   chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
   chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
   chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
   num_chunks++;

   /* The kernel quite often returns -ENOMEM when many processes that use GDS submit in parallel
    * (e.g. test suites), but the submission eventually succeeds after enough attempts. This
    * happens frequently with dEQP using NGG streamout.
    */
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(1000000000ull); /* 1s */

   r = 0;
   do {
      /* Wait 1 ms and try again. */
      if (r == -ENOMEM)
         os_time_sleep(1000);

      r = amdgpu_cs_submit_raw2(ctx->ws->dev, ctx->ctx, 0, num_chunks, chunks, &request->seq_no);
   } while (r == -ENOMEM && os_time_get_nano() < abs_timeout_ns);

   if (r) {
      if (r == -ENOMEM) {
         fprintf(stderr, "radv/amdgpu: Not enough memory for command submission.\n");
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
      } else if (r == -ECANCELED) {
         fprintf(stderr,
                 "radv/amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else if (r == -ENODATA) {
         fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
                         "of a soft recovery.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else if (r == -ETIME) {
         fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
                         "of a hard recovery.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else {
         fprintf(stderr,
                 "radv/amdgpu: The CS has been rejected, "
                 "see dmesg for more information (%i).\n",
                 r);
         result = VK_ERROR_UNKNOWN;
      }
   }

error_out:
   free(chunks);
   free(chunk_data);
   free(wait_syncobj);
   free(signal_syncobj);
   return result;
}

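/* Plug the amdgpu backend implementations into the winsys function table. */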
void
radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
{
   ws->base.ctx_create = radv_amdgpu_ctx_create;
   ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
   ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
   ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
   ws->base.cs_domain = radv_amdgpu_cs_domain;
   ws->base.cs_create = radv_amdgpu_cs_create;
   ws->base.cs_destroy = radv_amdgpu_cs_destroy;
   ws->base.cs_grow = radv_amdgpu_cs_grow;
   ws->base.cs_finalize = radv_amdgpu_cs_finalize;
   ws->base.cs_reset = radv_amdgpu_cs_reset;
   ws->base.cs_chain = radv_amdgpu_cs_chain;
   ws->base.cs_unchain = radv_amdgpu_cs_unchain;
   ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
   ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
   ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
   ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
   ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
}