1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * Copyright © 2015 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "amdgpu_winsys.h"
12 #include "util/os_time.h"
13 #include <inttypes.h>
14 #include <stdio.h>
15 
16 #include "amd/common/sid.h"
17 
18 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
19  * codes in the kernel).
20  */
21 #if DETECT_OS_OPENBSD
22 #define ENODATA ENOTSUP
23 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
24 #define ENODATA ECONNREFUSED
25 #endif
26 
27 /* FENCES */
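/* Each fence wraps a DRM syncobj. For IPs with a user fence (see
 * amdgpu_cs_has_user_fence), amdgpu_fence_wait() first checks the CPU-visible
 * sequence number, so the syncobj wait ioctl can often be skipped. */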
28 
29 void amdgpu_fence_destroy(struct amdgpu_fence *fence)
30 {
31    ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);
32 
33    if (fence->ctx)
34       amdgpu_ctx_reference(&fence->ctx, NULL);
35 
36    util_queue_fence_destroy(&fence->submitted);
37    FREE(fence);
38 }
39 
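/* Create a syncobj-backed fence for this CS. The fence starts in the
 * "not submitted" state; amdgpu_fence_submitted() stores the sequence number
 * and signals it once the IB has actually been submitted. */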
40 static struct pipe_fence_handle *
41 amdgpu_fence_create(struct amdgpu_cs *cs)
42 {
43    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
44    struct amdgpu_ctx *ctx = cs->ctx;
45 
46    fence->reference.count = 1;
47    fence->aws = ctx->aws;
48    amdgpu_ctx_reference(&fence->ctx, ctx);
49    fence->ctx = ctx;
50    fence->ip_type = cs->ip_type;
51    if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
52       free(fence);
53       return NULL;
54    }
55 
56    util_queue_fence_init(&fence->submitted);
57    util_queue_fence_reset(&fence->submitted);
58    fence->queue_index = cs->queue_index;
59    return (struct pipe_fence_handle *)fence;
60 }
61 
62 static struct pipe_fence_handle *
63 amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
64 {
65    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
66    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
67    int r;
68 
69    if (!fence)
70       return NULL;
71 
72    pipe_reference_init(&fence->reference, 1);
73    fence->aws = aws;
74    fence->ip_type = 0xffffffff;
75 
76    r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
77    if (r) {
78       FREE(fence);
79       return NULL;
80    }
81 
82    util_queue_fence_init(&fence->submitted);
83    fence->imported = true;
84 
85    return (struct pipe_fence_handle*)fence;
86 }
87 
88 static struct pipe_fence_handle *
89 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
90 {
91    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
92    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
93 
94    if (!fence)
95       return NULL;
96 
97    pipe_reference_init(&fence->reference, 1);
98    fence->aws = aws;
99    /* fence->ctx == NULL means that the fence is syncobj-based. */
100 
101    /* Convert sync_file into syncobj. */
102    int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
103    if (r) {
104       FREE(fence);
105       return NULL;
106    }
107 
108    r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
109    if (r) {
110       ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
111       FREE(fence);
112       return NULL;
113    }
114 
115    util_queue_fence_init(&fence->submitted);
116    fence->imported = true;
117 
118    return (struct pipe_fence_handle*)fence;
119 }
120 
121 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
122                                          struct pipe_fence_handle *pfence)
123 {
124    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
125    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
126    int fd, r;
127 
128    util_queue_fence_wait(&fence->submitted);
129 
130    /* Convert syncobj into sync_file. */
131    r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
132    return r ? -1 : fd;
133 }
134 
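/* Return a sync_file fd that is already signalled, using a temporary
 * signalled syncobj that is destroyed before returning. */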
135 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
136 {
137    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
138    uint32_t syncobj;
139    int fd = -1;
140 
141    int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
142                                      &syncobj);
143    if (r) {
144       return -1;
145    }
146 
147    r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd);
148    if (r) {
149       fd = -1;
150    }
151 
152    ac_drm_cs_destroy_syncobj(aws->fd, syncobj);
153    return fd;
154 }
155 
156 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
157                                    uint64_t seq_no,
158                                    uint64_t *user_fence_cpu_address)
159 {
160    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
161 
162    afence->seq_no = seq_no;
163    afence->user_fence_cpu_address = user_fence_cpu_address;
164    util_queue_fence_signal(&afence->submitted);
165 }
166 
167 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
168 {
169    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
170 
171    afence->signalled = true;
172    util_queue_fence_signal(&afence->submitted);
173 }
174 
175 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
176                        bool absolute)
177 {
178    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
179    int64_t abs_timeout;
180    uint64_t *user_fence_cpu;
181 
182    if (afence->signalled)
183       return true;
184 
185    if (absolute)
186       abs_timeout = timeout;
187    else
188       abs_timeout = os_time_get_absolute_timeout(timeout);
189 
190    /* The fence might not have a number assigned if its IB is being
191     * submitted in the other thread right now. Wait until the submission
192     * is done. */
193    if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
194       return false;
195 
196    user_fence_cpu = afence->user_fence_cpu_address;
197    if (user_fence_cpu) {
198       if (*user_fence_cpu >= afence->seq_no) {
199          afence->signalled = true;
200          return true;
201       }
202 
203       /* No timeout, just query: no need for the ioctl. */
204       if (!absolute && !timeout)
205          return false;
206    }
207 
208    if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
209       abs_timeout = INT64_MAX;
210 
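   /* Fall back to waiting on the syncobj. This also covers imported and
    * syncobj-only fences that have no user fence address. */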
211    if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1,
212                               abs_timeout, 0, NULL))
213       return false;
214 
215    /* Check that guest-side syncobj agrees with the user fence. */
216    if (user_fence_cpu && afence->aws->info.is_virtio)
217       assert(afence->seq_no <= *user_fence_cpu);
218 
219    afence->signalled = true;
220    return true;
221 }
222 
223 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
224                                           struct pipe_fence_handle *fence,
225                                           uint64_t timeout)
226 {
227    return amdgpu_fence_wait(fence, timeout, false);
228 }
229 
230 static struct pipe_fence_handle *
231 amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
232 {
233    struct amdgpu_cs *cs = amdgpu_cs(rcs);
234    struct pipe_fence_handle *fence = NULL;
235 
236    if (cs->noop)
237       return NULL;
238 
239    if (cs->next_fence) {
240       amdgpu_fence_reference(&fence, cs->next_fence);
241       return fence;
242    }
243 
244    fence = amdgpu_fence_create(cs);
245    if (!fence)
246       return NULL;
247 
248    amdgpu_fence_reference(&cs->next_fence, fence);
249    return fence;
250 }
251 
252 /* CONTEXTS */
253 
254 static uint32_t
255 radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
256 {
257    switch (radeon_priority) {
258    case RADEON_CTX_PRIORITY_REALTIME:
259       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
260    case RADEON_CTX_PRIORITY_HIGH:
261       return AMDGPU_CTX_PRIORITY_HIGH;
262    case RADEON_CTX_PRIORITY_MEDIUM:
263       return AMDGPU_CTX_PRIORITY_NORMAL;
264    case RADEON_CTX_PRIORITY_LOW:
265       return AMDGPU_CTX_PRIORITY_LOW;
266    default:
267       unreachable("Invalid context priority");
268    }
269 }
270 
271 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
272                                                    enum radeon_ctx_priority priority,
273                                                    bool allow_context_lost)
274 {
275    struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
276    int r;
277    struct amdgpu_bo_alloc_request alloc_buffer = {};
278    uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
279    ac_drm_device *dev;
280    ac_drm_bo buf_handle;
281 
282    if (!ctx)
283       return NULL;
284 
285    ctx->aws = amdgpu_winsys(rws);
286    ctx->reference.count = 1;
287    ctx->allow_context_lost = allow_context_lost;
288 
289    dev = ctx->aws->dev;
290 
291    r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle);
292    if (r) {
293       fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
294       goto error_create;
295    }
296 
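   /* Allocate and map one GTT page used as the context's user fence buffer
    * (user_fence_cpu_address_base); it is zeroed after mapping. */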
297    alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
298    alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
299    alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
300 
301    r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle);
302    if (r) {
303       fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
304       goto error_user_fence_alloc;
305    }
306 
307    ctx->user_fence_cpu_address_base = NULL;
308    r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base);
309    if (r) {
310       fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
311       goto error_user_fence_map;
312    }
313 
314    memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
315    ctx->user_fence_bo = buf_handle;
316    ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle);
317 
318    return (struct radeon_winsys_ctx*)ctx;
319 
320 error_user_fence_map:
321    ac_drm_bo_free(dev, buf_handle);
322 
323 error_user_fence_alloc:
324    ac_drm_cs_ctx_free(dev, ctx->ctx_handle);
325 error_create:
326    FREE(ctx);
327    return NULL;
328 }
329 
330 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
331 {
332    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
333 
334    amdgpu_ctx_reference(&ctx, NULL);
335 }
336 
337 static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
338                                       uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
339 {
340    unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
341    unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
342 
343    if (unaligned_dw) {
344       int remaining = pad_dw_mask + 1 - unaligned_dw;
345 
346       /* Only pad by 1 dword with the type-2 NOP if necessary. */
347       if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
348          ib[(*num_dw)++] = PKT2_NOP_PAD;
349       } else {
350          /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
351           * packet. The size of the packet body after the header is always count + 1.
352           * If count == -1, there is no packet body. NOP is the only packet that can have
353           * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
354           */
355          ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
356          *num_dw += remaining - 1;
357       }
358    }
359    assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
360 }
361 
362 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
363 {
364    struct amdgpu_bo_alloc_request request = {0};
365    struct drm_amdgpu_bo_list_in bo_list_in;
366    struct drm_amdgpu_cs_chunk_ib ib_in = {0};
367    ac_drm_bo bo;
368    amdgpu_va_handle va_handle = NULL;
369    struct drm_amdgpu_cs_chunk chunks[2];
370    struct drm_amdgpu_bo_list_entry list;
371    unsigned noop_dw_size;
372    void *cpu = NULL;
373    uint64_t seq_no;
374    uint64_t va;
375    int r;
376 
377    /* Older amdgpu doesn't report if the reset is complete or not. Detect
378     * it by submitting a no-op job. If it reports an error, then assume
379     * that the reset is not complete.
380     */
381    uint32_t temp_ctx_handle;
382    r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle);
383    if (r)
384       return r;
385 
386    request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
387    request.alloc_size = 4096;
388    request.phys_alignment = 4096;
389    r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo);
390    if (r)
391       goto destroy_ctx;
392 
393    r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
394                              request.alloc_size, request.phys_alignment,
395                              0, &va, &va_handle,
396                              AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
397    if (r)
398       goto destroy_bo;
399 
400    uint32_t kms_handle;
401    ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle);
402 
403    r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va,
404                            AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
405                            AMDGPU_VA_OP_MAP);
406    if (r)
407       goto destroy_bo;
408 
409    r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu);
410    if (r)
411       goto destroy_bo;
412 
413    noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
414    ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
415 
416    ac_drm_bo_cpu_unmap(ctx->aws->dev, bo);
417 
418    list.bo_handle = kms_handle;
419    ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &list.bo_handle);
420    list.bo_priority = 0;
421 
422    bo_list_in.list_handle = ~0;
423    bo_list_in.bo_number = 1;
424    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
425    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
426 
427    ib_in.ip_type = AMD_IP_GFX;
428    ib_in.ib_bytes = noop_dw_size * 4;
429    ib_in.va_start = va;
430 
431    chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
432    chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
433    chunks[0].chunk_data = (uintptr_t)&bo_list_in;
434 
435    chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
436    chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
437    chunks[1].chunk_data = (uintptr_t)&ib_in;
438 
439    r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no);
440 
441 destroy_bo:
442    if (va_handle)
443       ac_drm_va_range_free(va_handle);
444    ac_drm_bo_free(ctx->aws->dev, bo);
445 destroy_ctx:
446    ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle);
447 
448    return r;
449 }
450 
451 static void
452 amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
453                                const char *format, ...)
454 {
455    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
456 
457    /* Don't overwrite the last reset status. */
458    if (ctx->sw_status != PIPE_NO_RESET)
459       return;
460 
461    ctx->sw_status = status;
462 
463    if (!ctx->allow_context_lost) {
464       va_list args;
465 
466       va_start(args, format);
467       vfprintf(stderr, format, args);
468       va_end(args);
469 
470       /* Non-robust contexts are allowed to terminate the process. The only alternative is
471        * to skip command submission, which would look like a freeze because nothing is drawn,
472        * which looks like a hang without any reset.
473        */
474       abort();
475    }
476 }
477 
478 static enum pipe_reset_status
479 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
480                               bool *needs_reset, bool *reset_completed)
481 {
482    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
483 
484    if (needs_reset)
485       *needs_reset = false;
486    if (reset_completed)
487       *reset_completed = false;
488 
489    /* Return a failure due to a GPU hang. */
490    uint64_t flags;
491 
492    if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
493       /* If the caller is only interested in full reset (= wants to ignore soft
494        * recoveries), we can use the rejected cs count as a quick first check.
495        */
496       return PIPE_NO_RESET;
497    }
498 
499    /*
500     * ctx->sw_status is updated on alloc/ioctl failures.
501     *
502     * We only rely on amdgpu_cs_query_reset_state2 to tell us
503     * that the context reset is complete.
504     */
505    if (ctx->sw_status != PIPE_NO_RESET) {
506       int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags);
507       if (!r) {
508          if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
509             if (reset_completed) {
510                /* The ARB_robustness spec says:
511                *
512                *    If a reset status other than NO_ERROR is returned and subsequent
513                *    calls return NO_ERROR, the context reset was encountered and
514                *    completed. If a reset status is repeatedly returned, the context may
515                *    be in the process of resetting.
516                *
517                * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
518                * so don't do anything special. On older kernels, submit a no-op cs. If it
519                * succeeds then assume the reset is complete.
520                */
521                if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
522                   *reset_completed = true;
523 
524                if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
525                   *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
526             }
527          }
528       } else {
529          fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
530       }
531 
532       /* Return a failure due to SW issues. */
533       if (needs_reset)
534          *needs_reset = true;
535       return ctx->sw_status;
536    }
537 
538    if (needs_reset)
539       *needs_reset = false;
540    return PIPE_NO_RESET;
541 }
542 
543 /* COMMAND SUBMISSION */
544 
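/* Only GFX, compute and SDMA submissions carry a user fence; for these a
 * FENCE chunk is added in amdgpu_cs_submit_ib_kernelq(). */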
545 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
546 {
547    return acs->ip_type == AMD_IP_GFX ||
548           acs->ip_type == AMD_IP_COMPUTE ||
549           acs->ip_type == AMD_IP_SDMA;
550 }
551 
552 static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
553 {
554    if (cs->has_chaining)
555       return 4; /* for chaining */
556 
557    return 0;
558 }
559 
560 static struct amdgpu_cs_buffer *
561 amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
562                      struct amdgpu_buffer_list *list)
563 {
564    int num_buffers = list->num_buffers;
565    struct amdgpu_cs_buffer *buffers = list->buffers;
566    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
567    int i = cs->buffer_indices_hashlist[hash];
568 
569    /* i < 0 means the buffer is not in the list; otherwise i is a candidate index. */
570    if (i < 0)
571       return NULL;
572 
573    if (i < num_buffers && buffers[i].bo == bo)
574       return &buffers[i];
575 
576    /* Hash collision, look for the BO in the list of buffers linearly. */
577    for (int i = num_buffers - 1; i >= 0; i--) {
578       if (buffers[i].bo == bo) {
579          /* Put this buffer in the hash list.
580           * This will prevent additional hash collisions if there are
581           * several consecutive lookup_buffer calls for the same buffer.
582           *
583           * Example: Assuming buffers A,B,C collide in the hash list,
584           * the following sequence of buffers:
585           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
586           * will collide here: ^ and here:   ^,
587           * meaning that we should get very few collisions in the end. */
588          cs->buffer_indices_hashlist[hash] = i & 0x7fff;
589          return &buffers[i];
590       }
591    }
592    return NULL;
593 }
594 
595 struct amdgpu_cs_buffer *
596 amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
597 {
598    return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
599 }
600 
601 static struct amdgpu_cs_buffer *
602 amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
603                      struct amdgpu_buffer_list *list, bool add_ref)
604 {
605    /* New buffer, check if the backing array is large enough. */
606    if (unlikely(list->num_buffers >= list->max_buffers)) {
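      /* Grow the array geometrically: by about 1.3x and at least 16 entries. */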
607       unsigned new_max =
608          MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
609       struct amdgpu_cs_buffer *new_buffers;
610 
611       new_buffers = (struct amdgpu_cs_buffer *)
612                     REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
613                             new_max * sizeof(*new_buffers));
614       if (!new_buffers) {
615          fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
616          return NULL;
617       }
618 
619       list->max_buffers = new_max;
620       list->buffers = new_buffers;
621    }
622 
623    unsigned idx = list->num_buffers++;
624    struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
625    if (add_ref)
626       p_atomic_inc(&bo->base.reference.count);
627    buffer->bo = bo;
628    buffer->usage = 0;
629 
630    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
631    cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
632    return buffer;
633 }
634 
635 static struct amdgpu_cs_buffer *
636 amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
637                             struct amdgpu_buffer_list *list, bool add_ref)
638 {
639    struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
640 
641    return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
642 }
643 
644 static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
645                                     struct pb_buffer_lean *buf,
646                                     unsigned usage,
647                                     enum radeon_bo_domain domains)
648 {
649    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
650     * the buffer placement during command submission.
651     */
652    struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
653    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
654    struct amdgpu_cs_buffer *buffer;
655 
656    /* Fast exit for no-op calls.
657     * This is very effective with suballocators and linear uploaders that
658     * are outside of the winsys.
659     */
660    if (bo == cs->last_added_bo &&
661        (usage & cs->last_added_bo_usage) == usage)
662       return 0;
663 
664    buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
665    if (!buffer)
666       return 0;
667 
668    buffer->usage |= usage;
669 
670    cs->last_added_bo_usage = buffer->usage;
671    cs->last_added_bo = bo;
672    return 0;
673 }
674 
675 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
676                                  struct amdgpu_ib *main_ib,
677                                  struct amdgpu_cs *cs)
678 {
679    struct pb_buffer_lean *pb;
680    uint8_t *mapped;
681    unsigned buffer_size;
682 
683    /* Always create a buffer that is at least as large as the maximum seen IB size,
684     * aligned to a power of two.
685     */
686    buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
687 
688    /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
689    if (!cs->has_chaining)
690       buffer_size *= 4;
691 
692    const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
693    /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
694    const unsigned max_size = 2 * 1024 * 1024;
695 
696    buffer_size = MIN2(buffer_size, max_size);
697    buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
698 
699    /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
700     * The speed of writing to GTT WC is somewhere between no difference and very slow, while
701     * VRAM is very slow a lot more often.
702     *
703     * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
704     * and doesn't have to wait for cached GL2 requests to be processed.
705     */
706    enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
707    unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
708                     RADEON_FLAG_GL2_BYPASS;
709 
710    if (cs->ip_type == AMD_IP_GFX ||
711        cs->ip_type == AMD_IP_COMPUTE ||
712        cs->ip_type == AMD_IP_SDMA) {
713       /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
714        * on Navi 14
715        */
716       flags |= RADEON_FLAG_32BIT;
717    }
718 
719    pb = amdgpu_bo_create(aws, buffer_size,
720                          aws->info.gart_page_size,
721                          domain, (radeon_bo_flag)flags);
722    if (!pb)
723       return false;
724 
725    mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
726    if (!mapped) {
727       radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
728       return false;
729    }
730 
731    radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
732    radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
733 
734    main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
735    main_ib->big_buffer_cpu_ptr = mapped;
736    main_ib->used_ib_space = 0;
737 
738    return true;
739 }
740 
741 static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
742                               struct radeon_cmdbuf *rcs,
743                               struct amdgpu_ib *main_ib,
744                               struct amdgpu_cs *cs)
745 {
746    struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
747    /* This is the minimum size of a contiguous IB. */
748    unsigned ib_size = 16 * 1024;
749 
750    /* Always allocate at least the size of the biggest cs_check_space call,
751     * because precisely the last call might have requested this size.
752     */
753    ib_size = MAX2(ib_size, main_ib->max_check_space_size);
754 
755    if (!cs->has_chaining) {
756       ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
757                                    IB_MAX_SUBMIT_BYTES));
758    }
759 
760    /* Decay the IB buffer size over time, so that memory usage decreases after
761     * a temporary peak.
762     */
763    main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
764 
765    rcs->prev_dw = 0;
766    rcs->num_prev = 0;
767    rcs->current.cdw = 0;
768    rcs->current.buf = NULL;
769 
770    /* Allocate a new buffer for IBs if the current buffer is all used. */
771    if (!main_ib->big_buffer ||
772        main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
773       if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
774          return false;
775    }
776 
777    chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
778    chunk_ib->ib_bytes = 0;
779    /* ib_bytes is in dwords and the conversion to bytes will be done before
780     * the CS ioctl. */
781    main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
782    main_ib->is_chained_ib = false;
783 
784    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
785                         (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
786                         (radeon_bo_domain)0);
787 
788    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
789 
790    cs->csc->ib_main_addr = rcs->current.buf;
791 
792    ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
793    rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
794    return true;
795 }
796 
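/* Patch the size of the current IB: either the ib_bytes field of the CS chunk,
 * or the size dword of the chaining INDIRECT_BUFFER packet reserved in
 * amdgpu_cs_check_space(). */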
797 static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
798 {
799    if (ib->is_chained_ib) {
800       *ib->ptr_ib_size = rcs->current.cdw |
801                          S_3F2_CHAIN(1) | S_3F2_VALID(1) |
802                          S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
803    } else {
804       *ib->ptr_ib_size = rcs->current.cdw;
805    }
806 }
807 
808 static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
809                                struct amdgpu_ib *ib, enum amd_ip_type ip_type)
810 {
811    amdgpu_set_ib_size(rcs, ib);
812    ib->used_ib_space += rcs->current.cdw * 4;
813    ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
814    ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
815 }
816 
817 static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
818                                    struct amdgpu_cs_context *cs,
819                                    enum amd_ip_type ip_type)
820 {
821    for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
822       cs->chunk_ib[i].ip_type = ip_type;
823       cs->chunk_ib[i].flags = 0;
824 
825       if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
826          /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
827           * is the beginning of IBs because completion of an IB doesn't care about the state of
828           * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
829           * executed in parallel, so draw calls from the current IB can finish after the next IB
830           * starts drawing, and so the cache flush at the end of IBs is usually late and thus
831           * useless.
832           */
833          cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
834       }
835    }
836 
837    cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
838    cs->last_added_bo = NULL;
839    return true;
840 }
841 
842 static void cleanup_fence_list(struct amdgpu_fence_list *fences)
843 {
844    for (unsigned i = 0; i < fences->num; i++)
845       amdgpu_fence_drop_reference(fences->list[i]);
846    fences->num = 0;
847 }
848 
849 static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
850 {
851    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
852       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
853       unsigned num_buffers = cs->buffer_lists[i].num_buffers;
854 
855       for (unsigned j = 0; j < num_buffers; j++)
856          amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
857 
858       cs->buffer_lists[i].num_buffers = 0;
859    }
860 }
861 
862 static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
863 {
864    cs->seq_no_dependencies.valid_fence_mask = 0;
865    cleanup_fence_list(&cs->syncobj_dependencies);
866    cleanup_fence_list(&cs->syncobj_to_signal);
867    amdgpu_fence_reference(&cs->fence, NULL);
868    cs->last_added_bo = NULL;
869 }
870 
871 static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
872 {
873    amdgpu_cs_context_cleanup_buffers(aws, cs);
874    amdgpu_cs_context_cleanup(aws, cs);
875    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
876       FREE(cs->buffer_lists[i].buffers);
877    FREE(cs->syncobj_dependencies.list);
878    FREE(cs->syncobj_to_signal.list);
879 }
880 
881 
882 static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
883 {
884    struct amdgpu_cs *cs = amdgpu_cs(rcs);
885    return cs->ip_type;
886 }
887 
888 static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
889 {
890    /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
891    return ip_type == AMD_IP_VCN_DEC ||
892           ip_type == AMD_IP_VCN_ENC ||
893           ip_type == AMD_IP_VCN_JPEG;
894 }
895 
896 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
897 {
898    struct amdgpu_cs *cs = amdgpu_cs(rcs);
899 
900    if (!cs)
901       return;
902 
903    amdgpu_cs_sync_flush(rcs);
904    util_queue_fence_destroy(&cs->flush_completed);
905    p_atomic_dec(&cs->aws->num_cs);
906    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
907    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
908    FREE(rcs->prev);
909    amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
910    amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
911    amdgpu_fence_reference(&cs->next_fence, NULL);
912    FREE(cs);
913 }
914 
915 static bool
916 amdgpu_cs_create(struct radeon_cmdbuf *rcs,
917                  struct radeon_winsys_ctx *rwctx,
918                  enum amd_ip_type ip_type,
919                  void (*flush)(void *ctx, unsigned flags,
920                                struct pipe_fence_handle **fence),
921                  void *flush_ctx)
922 {
923    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
924    struct amdgpu_cs *cs;
925 
926    cs = CALLOC_STRUCT(amdgpu_cs);
927    if (!cs) {
928       return false;
929    }
930 
931    util_queue_fence_init(&cs->flush_completed);
932 
933    cs->aws = ctx->aws;
934    cs->ctx = ctx;
935    cs->flush_cs = flush;
936    cs->flush_data = flush_ctx;
937    cs->ip_type = ip_type;
938    cs->noop = ctx->aws->noop_cs;
939    cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
940                       (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
941 
942    /* Compute the queue index by counting the IPs that have queues. */
943    assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
944    assert(ctx->aws->info.ip[ip_type].num_queues);
945 
946    if (ip_uses_alt_fence(ip_type)) {
947       cs->queue_index = INT_MAX;
948       cs->uses_alt_fence = true;
949    } else {
950       cs->queue_index = 0;
951 
952       for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
953          if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
954             continue;
955 
956          if (i == ip_type)
957             break;
958 
959          cs->queue_index++;
960       }
961       assert(cs->queue_index < AMDGPU_MAX_QUEUES);
962    }
963 
964    ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4,
965                                       (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
966 
967    if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
968       FREE(cs);
969       return false;
970    }
971 
972    if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
973       amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
974       FREE(cs);
975       return false;
976    }
977 
978    memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
979 
980    /* Set the first submission context as current. */
981    rcs->csc = cs->csc = &cs->csc1;
982    cs->cst = &cs->csc2;
983 
984    /* Assign to both amdgpu_cs_context; only csc will use it. */
985    cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
986    cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
987 
988    cs->csc1.aws = ctx->aws;
989    cs->csc2.aws = ctx->aws;
990 
991    p_atomic_inc(&ctx->aws->num_cs);
992 
993    if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs))
994       goto fail;
995 
996    /* Currently only the gfx, compute and sdma queues support user queues. */
997    if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) {
998       if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type))
999          goto fail;
1000    }
1001 
1002    rcs->priv = cs;
1003    return true;
1004 fail:
1005    amdgpu_cs_destroy(rcs);
1006    return false;
1007 }
1008 
1009 static bool
1010 amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
1011                            unsigned preamble_num_dw)
1012 {
1013    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1014    struct amdgpu_winsys *aws = cs->aws;
1015    struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
1016    unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
1017    struct pb_buffer_lean *preamble_bo;
1018    uint32_t *map;
1019 
1020    /* Create the preamble IB buffer. */
1021    preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
1022                                   RADEON_DOMAIN_VRAM,
1023                                   (radeon_bo_flag)
1024                                   (RADEON_FLAG_NO_INTERPROCESS_SHARING |
1025                                    RADEON_FLAG_GTT_WC));
1026    if (!preamble_bo)
1027       return false;
1028 
1029    map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
1030                                   (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
1031    if (!map) {
1032       radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
1033       return false;
1034    }
1035 
1036    /* Upload the preamble IB. */
1037    memcpy(map, preamble_ib, preamble_num_dw * 4);
1038 
1039    /* Pad the IB. */
1040    amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
1041    amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
1042 
1043    for (unsigned i = 0; i < 2; i++) {
1044       csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1045       csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1046 
1047       csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1048    }
1049 
1050    assert(!cs->preamble_ib_bo);
1051    cs->preamble_ib_bo = preamble_bo;
1052 
1053    amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1054                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1055    return true;
1056 }
1057 
1058 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1059 {
1060    return true;
1061 }
1062 
1063 static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1064 {
1065    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1066    struct amdgpu_ib *main_ib = &cs->main_ib;
1067 
1068    assert(rcs->current.cdw <= rcs->current.max_dw);
1069 
1070    unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1071 
1072    if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1073       return false;
1074 
1075    if (rcs->current.max_dw - rcs->current.cdw >= dw)
1076       return true;
1077 
1078    unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1079    unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1080    /* 125% of the size for IB epilog. */
1081    unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1082    main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1083    main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1084 
1085    if (!cs->has_chaining)
1086       return false;
1087 
1088    /* Allocate a new chunk */
1089    if (rcs->num_prev >= rcs->max_prev) {
1090       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1091       struct radeon_cmdbuf_chunk *new_prev;
1092 
1093       new_prev = (struct radeon_cmdbuf_chunk*)
1094                  REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1095                          sizeof(*new_prev) * new_max_prev);
1096       if (!new_prev)
1097          return false;
1098 
1099       rcs->prev = new_prev;
1100       rcs->max_prev = new_max_prev;
1101    }
1102 
1103    if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
1104       return false;
1105 
1106    assert(main_ib->used_ib_space == 0);
1107    uint64_t va = main_ib->gpu_address;
1108 
1109    /* This space was originally reserved. */
1110    rcs->current.max_dw += cs_epilog_dw;
1111 
1112    /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1113    amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1114 
1115    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1116    radeon_emit(rcs, va);
1117    radeon_emit(rcs, va >> 32);
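   /* Reserve a dword that amdgpu_set_ib_size() later fills with the size of the
    * chained IB (plus the chain/valid bits). */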
1118    uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1119 
1120    assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1121    assert(rcs->current.cdw <= rcs->current.max_dw);
1122 
1123    amdgpu_set_ib_size(rcs, main_ib);
1124    main_ib->ptr_ib_size = new_ptr_ib_size;
1125    main_ib->is_chained_ib = true;
1126 
1127    /* Hook up the new chunk */
1128    rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1129    rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1130    rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1131    rcs->num_prev++;
1132 
1133    rcs->prev_dw += rcs->current.cdw;
1134    rcs->current.cdw = 0;
1135 
1136    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1137    rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1138 
1139    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1140                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1141 
1142    return true;
1143 }
1144 
1145 static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1146 {
1147    unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1148    struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1149 
1150    for (unsigned i = 0; i < num_buffers; i++) {
1151       struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1152       struct amdgpu_cs_buffer *real_buffer =
1153          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1154                                      &cs->buffer_lists[AMDGPU_BO_REAL], true);
1155 
1156       /* We need to set the usage because it determines the BO priority.
1157        *
1158        * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1159        * BO fences to fence dependencies. Only the slab entries should do that.
1160        */
1161       real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1162    }
1163 }
1164 
1165 static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1166                                           struct radeon_bo_list_item *list)
1167 {
1168     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1169 
1170     /* We do this in the CS thread, but since we need to return the final usage of all buffers
1171      * here, do it here too. There is no harm in doing it again in the CS thread.
1172      */
1173     amdgpu_add_slab_backing_buffers(cs);
1174 
1175     struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1176     unsigned num_real_buffers = real_buffers->num_buffers;
1177 
1178 #if HAVE_AMDGPU_VIRTIO
1179     assert(!cs->aws->info.is_virtio);
1180 #endif
1181 
1182     if (list) {
1183         for (unsigned i = 0; i < num_real_buffers; i++) {
1184             list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1185             list[i].vm_address =
1186                amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1187             list[i].priority_usage = real_buffers->buffers[i].usage;
1188         }
1189     }
1190     return num_real_buffers;
1191 }
1192 
1193 static void add_fence_to_list(struct amdgpu_fence_list *fences,
1194                               struct amdgpu_fence *fence)
1195 {
1196    unsigned idx = fences->num++;
1197 
1198    if (idx >= fences->max) {
1199       unsigned size;
1200       const unsigned increment = 8;
1201 
1202       fences->max = idx + increment;
1203       size = fences->max * sizeof(fences->list[0]);
1204       fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1205    }
1206    amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1207 }
1208 
1209 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
1210                                            struct pipe_fence_handle *pfence)
1211 {
1212    struct amdgpu_cs *acs = amdgpu_cs(rcs);
1213    struct amdgpu_cs_context *cs = acs->csc;
1214    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1215 
1216    util_queue_fence_wait(&fence->submitted);
1217 
1218    if (!fence->imported) {
1219       /* Ignore idle fences. This will only check the user fence in memory. */
1220       if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1221          add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
1222                             fence->queue_seq_no);
1223       }
1224    }
1225    else
1226       add_fence_to_list(&cs->syncobj_dependencies, fence);
1227 }
1228 
1229 static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
1230                                               struct amdgpu_cs_context *cs,
1231                                               unsigned queue_index_bit,
1232                                               struct amdgpu_seq_no_fences *dependencies,
1233                                               struct amdgpu_winsys_bo *bo, unsigned usage)
1234 {
1235    if (usage & RADEON_USAGE_SYNCHRONIZED) {
1236       /* Add BO fences from queues other than 'queue_index' to dependencies. */
1237       u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1238          add_seq_no_to_list(ws, dependencies, other_queue_idx,
1239                             bo->fences.seq_no[other_queue_idx]);
1240       }
1241 
1242       if (bo->alt_fence)
1243          add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1244    }
1245 }
1246 
1247 static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
1248                                  uint_seq_no new_queue_seq_no)
1249 {
1250    bo->fences.seq_no[queue_index] = new_queue_seq_no;
1251    bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1252 }
1253 
1254 static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
1255                                          struct amdgpu_winsys_bo *bo, unsigned usage)
1256 {
1257    bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1258    bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1259 }
1260 
1261 static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1262                                          struct pipe_fence_handle *fence)
1263 {
1264    struct amdgpu_cs *acs = amdgpu_cs(rws);
1265    struct amdgpu_cs_context *cs = acs->csc;
1266 
1267    add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1268 }
1269 
1270 static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
1271                                        unsigned num_real_buffers,
1272                                        struct drm_amdgpu_bo_list_entry *bo_list_real,
1273                                        uint64_t *seq_no)
1274 {
1275    struct amdgpu_winsys *aws = acs->aws;
1276    struct amdgpu_cs_context *cs = acs->cst;
1277    struct drm_amdgpu_bo_list_in bo_list_in;
1278    struct drm_amdgpu_cs_chunk chunks[8];
1279    unsigned num_chunks = 0;
1280 
1281    /* BO list */
1282    bo_list_in.operation = ~0;
1283    bo_list_in.list_handle = ~0;
1284    bo_list_in.bo_number = num_real_buffers;
1285    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1286    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real;
1287 
1288    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1289    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1290    chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1291    num_chunks++;
1292 
1293    /* Syncobj dependencies. */
1294    unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1295    if (num_syncobj_dependencies) {
1296       struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1297          (struct drm_amdgpu_cs_chunk_sem *)
1298          alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1299 
1300       for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1301          struct amdgpu_fence *fence =
1302             (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1303 
1304          assert(util_queue_fence_is_signalled(&fence->submitted));
1305          sem_chunk[i].handle = fence->syncobj;
1306       }
1307 
1308       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1309       chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1310       chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1311       num_chunks++;
1312    }
1313 
1314    /* Syncobj signals. */
1315    unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1316    struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1317       (struct drm_amdgpu_cs_chunk_sem *)
1318       alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1319 
1320    for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1321       struct amdgpu_fence *fence =
1322          (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1323 
1324       sem_chunk[i].handle = fence->syncobj;
1325    }
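   /* The last signal entry is the syncobj of the fence created for this submission. */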
1326    sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1327 
1328    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1329    chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1330    chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1331    num_chunks++;
1332 
1333    if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1334       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1335       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1336       chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1337       num_chunks++;
1338    }
1339 
1340    /* Fence */
1341    if (amdgpu_cs_has_user_fence(acs)) {
1342       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1343       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1344       chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1345       num_chunks++;
1346    }
1347 
1348    /* IB */
1349    if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1350       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1351       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1352       chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1353       num_chunks++;
1354    }
1355 
1356    /* IB */
1357    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1358    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1359    chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1360    num_chunks++;
1361 
1362    if (cs->secure) {
1363       cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1364       cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1365    } else {
1366       cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1367       cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1368    }
1369 
1370    assert(num_chunks <= 8);
1371 
1372    /* Submit the command buffer.
1373     *
1374     * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
1375     * quite often, but it eventually succeeds after enough attempts. This happens frequently
1376     * with dEQP using NGG streamout.
1377     */
1378    int r = 0;
1379 
1380    do {
1381       /* Wait 1 ms and try again. */
1382       if (r == -ENOMEM)
1383          os_time_sleep(1000);
1384 
1385       r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no);
1386    } while (r == -ENOMEM);
1387 
1388    return r;
1389 }
1390 
1391 static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq,
1392                                         struct amdgpu_cs_context *cs,
1393                                         uint64_t num_fences,
1394                                         struct drm_amdgpu_userq_fence_info *fence_info)
1395 {
1396    amdgpu_pkt_begin();
1397 
1398    if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
1399       if (num_fences) {
1400          unsigned num_fences_in_iter;
1401          /* The FENCE_WAIT_MULTI packet supports at most 32 fences. */
1402          for (unsigned i = 0; i < num_fences; i = i + 32) {
1403             num_fences_in_iter = (i + 32 > num_fences) ? num_fences - i : 32;
1404             amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0));
1405             amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1));
1406             for (unsigned j = 0; j < num_fences_in_iter; j++) {
1407                amdgpu_pkt_add_dw(fence_info[i + j].va);
1408                amdgpu_pkt_add_dw(fence_info[i + j].va >> 32);
1409                amdgpu_pkt_add_dw(fence_info[i + j].value);
1410                amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
1411             }
1412          }
1413       }
1414 
1415       amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
1416       amdgpu_pkt_add_dw(0x0);
1417 
1418       amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1419       amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start);
1420       amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32);
1421       if (userq->ip_type == AMD_IP_GFX)
1422          amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1));
1423       else
1424          amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) |
1425                               S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
1426 
1427       /* Add 8 for the release mem packet and 2 for the protected fence signal packet.
1428        * userq_fence_seq_num is calculated this way to match the kernel fence that is
1429        * returned by the userq_wait ioctl.
1430        */
1431       userq->user_fence_seq_num = __next_wptr + 8 + 2;
1432 
1433       /* add release mem for user fence */
1434       amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0));
1435       amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) |
1436                            S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) |
1437                            S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3));
1438       amdgpu_pkt_add_dw(S_030358_DATA_SEL(2));
1439       amdgpu_pkt_add_dw(userq->user_fence_va);
1440       amdgpu_pkt_add_dw(userq->user_fence_va >> 32);
1441       amdgpu_pkt_add_dw(userq->user_fence_seq_num);
1442       amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
1443       amdgpu_pkt_add_dw(0);
1444 
1445       /* Protected fence signal packet. This is a trusted RELEASE_MEM packet, i.e. the fence
1446        * buffer is only accessible from the kernel through VMID 0.
1447        */
1448       amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
1449       amdgpu_pkt_add_dw(0);
1450    } else {
1451       fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
1452    }
1453 
1454    amdgpu_pkt_end();
1455 }
1456 
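/* Submit the current CS context on a user-mode queue. In broad strokes:
 *   1) collect syncobj and VM-timeline dependencies and translate them into (va, value)
 *      fence pairs with the userq_wait ioctl,
 *   2) append the wait/IB/fence packets to the queue's ring buffer,
 *   3) publish the new write pointer and ring the doorbell,
 *   4) call the userq_signal ioctl so the kernel attaches the resulting fence to the
 *      signal syncobjs and the shared BOs.
 * The shared_buf_kms_handles_* arrays only contain handles of cross-process shared buffers.
 */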
1457 static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
1458                                      struct amdgpu_cs *acs,
1459                                      uint32_t *shared_buf_kms_handles_write,
1460                                      unsigned num_shared_buf_write,
1461                                      uint32_t *shared_buf_kms_handles_read,
1462                                      unsigned num_shared_buf_read,
1463                                      uint64_t *seq_no,
1464                                      uint64_t vm_timeline_point)
1465 {
1466    int r = 0;
1467    struct amdgpu_winsys *aws = acs->aws;
1468    struct amdgpu_cs_context *cs = acs->cst;
1469 
1470    /* Syncobj dependencies. */
1471    unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1472    uint32_t *syncobj_dependencies_list =
1473       (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
1474 
1475    /* Currently only 1 vm timeline syncobj can be a dependency. */
1476    uint16_t num_syncobj_timeline_dependencies = 1;
1477    uint32_t syncobj_timeline_dependency;
1478    uint64_t syncobj_timeline_dependency_point;
1479 
1480    if (num_syncobj_dependencies) {
1481       for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1482          struct amdgpu_fence *fence =
1483             (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1484 
1485          assert(util_queue_fence_is_signalled(&fence->submitted));
1486          syncobj_dependencies_list[i] = fence->syncobj;
1487       }
1488    }
1489    syncobj_timeline_dependency = aws->vm_timeline_syncobj;
1490    syncobj_timeline_dependency_point = vm_timeline_point;
1491 
1492    /* Syncobj signals. Adding 1 for cs submission fence. */
1493    unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1;
1494    uint32_t *syncobj_signal_list =
1495       (uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));
1496 
1497    for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) {
1498       struct amdgpu_fence *fence =
1499          (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1500 
1501       syncobj_signal_list[i] = fence->syncobj;
1502    }
1503    syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj;
1504 
1505    struct drm_amdgpu_userq_fence_info *fence_info;
1506    struct drm_amdgpu_userq_wait userq_wait_data = {
1507       .syncobj_handles = (uintptr_t)syncobj_dependencies_list,
1508       .syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
1509       .syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
1510       .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1511       .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1512       .num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
1513       .num_fences = 0,
1514       .num_syncobj_handles = num_syncobj_dependencies,
1515       .num_bo_read_handles = num_shared_buf_read,
1516       .num_bo_write_handles = num_shared_buf_write,
1517       .out_fences = (uintptr_t)NULL,
1518    };
1519 
1520    /*
1521     * Synchronization of shared buffers follows these rules:
1522     *   - read-only buffers wait for all previous writes to complete
1523     *   - write-only (and read-write) buffers wait for all previous reads to complete
1524     * To implement this strategy, we use amdgpu_userq_wait() before submitting
1525     * a job, and amdgpu_userq_signal() afterwards to indicate completion.
1526     */
1527    r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1528    if (r)
1529       fprintf(stderr, "amdgpu: getting wait num_fences failed\n");
1530 
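   /* The wait ioctl is used in two passes: the first call above (out_fences == NULL) only
    * returns num_fences, which sizes the fence_info array; the second call below fills in
    * the actual (va, value) pairs that the packets have to wait on.
    */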
1531    fence_info = (struct drm_amdgpu_userq_fence_info*)
1532       alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
1533    userq_wait_data.out_fences = (uintptr_t)fence_info;
1534 
1535    r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1536    if (r)
1537       fprintf(stderr, "amdgpu: getting wait fences failed\n");
1538 
1539    simple_mtx_lock(&userq->lock);
1540    amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info);
1541    struct drm_amdgpu_userq_signal userq_signal_data = {
1542       .queue_id = userq->userq_handle,
1543       .syncobj_handles = (uintptr_t)syncobj_signal_list,
1544       .num_syncobj_handles = num_syncobj_to_signal,
1545       .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1546       .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1547       .num_bo_read_handles = num_shared_buf_read,
1548       .num_bo_write_handles = num_shared_buf_write,
1549    };
1550 
1551 #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
1552    asm volatile ("mfence" : : : "memory");
1553 #endif
1554    /* Writing to *userq->wptr_bo_map writes into the MQD data. Before writing the wptr into
1555     * the MQD data, we must ensure that the new packets added to the user queue ring buffer
1556     * have reached memory, which is what the mfence above guarantees.
1557     */
1558    *userq->wptr_bo_map = userq->next_wptr;
1559    /* Ringing the doorbell makes the GPU execute the new packets that were added to the user
1560     * queue ring buffer. Before ringing the doorbell, we must ensure that the MQD data has
1561     * reached memory, which is what the mfence below guarantees.
1562     */
1563 #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
1564    asm volatile ("mfence" : : : "memory");
1565 #endif
1566    userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = userq->next_wptr;
1567    r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
1568 
1569    *seq_no = userq->user_fence_seq_num;
1570    simple_mtx_unlock(&userq->lock);
1571 
1572    return r;
1573 }
1574 
1575 enum queue_type {
1576    KERNELQ,           /* kernel CS ioctl, BO fences tracked with per-queue sequence numbers */
1577    KERNELQ_ALT_FENCE, /* kernel CS ioctl, BO fences tracked via amdgpu_winsys_bo::alt_fence */
1578    USERQ,             /* user-mode queue: ring buffer + doorbell + userq_wait/userq_signal */
1579 };
1580 
1581 /* The template parameter determines whether the queue should skip code used by the default queue
1582  * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1583  * for all BOs.
1584  */
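/* High-level flow of the submit thread (a summary of the code below):
 *   - recycle the oldest fence in the queue's fence ring and pick the next sequence number,
 *   - walk the slab-entry, sparse, and real buffer lists to gather fence dependencies and
 *     update each BO's sequence number (or alt_fence for KERNELQ_ALT_FENCE),
 *   - convert the gathered sequence numbers into syncobj dependencies,
 *   - submit through the kernel CS ioctl (KERNELQ*) or the user-queue path (USERQ),
 *   - record reset/error status and release the buffer lists.
 */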
1585 template<enum queue_type queue_type>
1586 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1587 {
1588    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1589    struct amdgpu_winsys *aws = acs->aws;
1590    struct amdgpu_cs_context *cs = acs->cst;
1591    int r;
1592    uint64_t seq_no = 0;
1593    bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1594    /* The maximum timeline point of VM updates for all BOs used in this submit. */
1595    uint64_t vm_timeline_point = 0;
1596 
1597    simple_mtx_lock(&aws->bo_fence_lock);
1598    unsigned queue_index;
1599    struct amdgpu_queue *queue;
1600    uint_seq_no prev_seq_no, next_seq_no;
1601 
1602    if (queue_type != KERNELQ_ALT_FENCE) {
1603       queue_index = acs->queue_index;
1604       queue = &aws->queues[queue_index];
1605       prev_seq_no = queue->latest_seq_no;
1606 
1607       /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1608        * but the values aren't related.
1609        */
1610       next_seq_no = prev_seq_no + 1;
1611 
1612       /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1613        * via the ioctl. We have to do this because we are going to release the oldest fence and
1614        * replace it with the latest fence in the ring.
1615        */
1616       struct pipe_fence_handle **oldest_fence =
1617          &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1618 
1619       if (*oldest_fence) {
1620          if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1621             /* Take the reference because the fence can be released by other threads after we
1622              * unlock the mutex.
1623              */
1624             struct pipe_fence_handle *tmp_fence = NULL;
1625             amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1626 
1627             /* Unlock the mutex before waiting. */
1628             simple_mtx_unlock(&aws->bo_fence_lock);
1629             amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1630             amdgpu_fence_reference(&tmp_fence, NULL);
1631             simple_mtx_lock(&aws->bo_fence_lock);
1632          }
1633 
1634          /* Remove the idle fence from the ring. */
1635          amdgpu_fence_reference(oldest_fence, NULL);
1636       }
1637    }
1638 
1639    /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1640     * sequence number per queue and removes all older ones.
1641     */
1642    struct amdgpu_seq_no_fences seq_no_dependencies;
1643    memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1644 
1645    if (queue_type != KERNELQ_ALT_FENCE) {
1646       /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1647        * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1648        * context. The reasons are:
1649        * - Our BO fence tracking only supports 1 queue per IP.
1650        * - IBs from different contexts must wait for each other and can't execute in a random order.
1651        */
1652       struct amdgpu_fence *prev_fence =
1653          (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1654 
1655       /* Add a dependency on a previous fence, unless we can determine that
1656        * it's useless because the execution order is guaranteed.
1657        */
1658       if (prev_fence) {
1659          bool same_ctx = queue->last_ctx == acs->ctx;
1660          /* userqueue submission mode uses a single queue per process. */
1661          bool same_queue = queue_type == USERQ || aws->info.ip[acs->ip_type].num_queues == 1;
1662 
1663          if (!same_ctx || !same_queue)
1664             add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
1665       }
1666    }
1667 
1668    /* Since the kernel driver doesn't synchronize execution between different
1669     * rings automatically, we have to add fence dependencies manually. This gathers sequence
1670     * numbers from BOs and sets the next sequence number in the BOs.
1671     */
1672 
1673    /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1674    struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1675    unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1676    unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1677    unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ?
1678       0 : BITFIELD_BIT(queue_index);
1679 
1680    for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1681       struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1682       struct amdgpu_winsys_bo *bo = buffer->bo;
1683 
1684       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1685                                         buffer->usage);
1686       if (queue_type == KERNELQ_ALT_FENCE)
1687          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1688       else
1689          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1690 
1691       /* We didn't add any slab entries into the real buffer list that will be submitted
1692        * to the kernel. Do it now.
1693        */
1694       struct amdgpu_cs_buffer *real_buffer =
1695          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1696                                      &cs->buffer_lists[AMDGPU_BO_REAL], false);
1697 
1698       /* We need to set the usage because it determines the BO priority. */
1699       real_buffer->usage |= buffer->usage;
1700    }
1701 
1702    /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1703    unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1704    struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1705    unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1706    bool out_of_memory = false;
1707 
1708    for (unsigned i = 0; i < num_sparse_buffers; i++) {
1709       struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1710       struct amdgpu_winsys_bo *bo = buffer->bo;
1711 
1712       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1713                                         buffer->usage);
1714       if (queue_type == KERNELQ_ALT_FENCE)
1715          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1716       else
1717          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1718 
1719       /* Add backing buffers of sparse buffers to the buffer list.
1720        *
1721        * This is done late, during submission, to keep the buffer list short before
1722        * submit, and to avoid managing fences for the backing buffers.
1723        */
1724       struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1725 
1726       if (queue_type == USERQ) {
1727          uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point);
1728          vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point);
1729       }
1730 
1731       simple_mtx_lock(&sparse_bo->commit_lock);
1732       list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1733          /* We can directly add the buffer here, because we know that each
1734           * backing buffer occurs only once.
1735           */
1736          struct amdgpu_cs_buffer *real_buffer =
1737             amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
1738          if (!real_buffer) {
1739             fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
1740             r = -ENOMEM;
1741             out_of_memory = true;
1742             break; /* don't dereference a NULL real_buffer; commit_lock is unlocked below */
1743          }
1744 
1745          real_buffer->usage = buffer->usage;
1746       }
1747       simple_mtx_unlock(&sparse_bo->commit_lock);
1748    }
1749 
1750    /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1751    unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1752    struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1753    struct drm_amdgpu_bo_list_entry *bo_list;
1754    /* BO dependency management depends on the queue mode:
1755     * - kernel queue: BOs used by the submit are passed to the kernel in a
1756     *   drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled
1757     *   automatically by the kernel; intra-process sync is handled by Mesa.
1758     * - user queue: intra-process sync is similar. Inter-process sync is handled
1759     *   using timeline points, amdgpu_userq_wait (before a submit) and
1760     *   amdgpu_userq_signal (after a submit).
1761     */
1762    unsigned num_shared_buf_write;
1763    unsigned num_shared_buf_read;
1764    /* Store write handles at the beginning and read handles at the end of shared_buf_kms_handles.
1765     * If the usage is both read and write, the handle goes into the write list.
1766     */
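   /* Illustrative layout (hypothetical counts): with num_real_buffers == 6, two shared
    * writers W0,W1 and two shared readers R0,R1, the array ends up as
    *    [ W0, W1, --, --, R1, R0 ]
    * and the read handles are later passed to the user-queue path as
    * &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read].
    */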
1767    uint32_t *shared_buf_kms_handles;
1768    if (queue_type != USERQ) {
1769       bo_list = (struct drm_amdgpu_bo_list_entry *)
1770          alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1771    } else {
1772       num_shared_buf_write = 0;
1773       num_shared_buf_read = 0;
1774       shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t));
1775    }
1776    unsigned i;
1777 
1778    for (i = 0; i < initial_num_real_buffers; i++) {
1779       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1780       struct amdgpu_winsys_bo *bo = buffer->bo;
1781 
1782       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1783                                         buffer->usage);
1784       if (queue_type == KERNELQ_ALT_FENCE)
1785          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1786       else
1787          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1788 
1789       if (queue_type != USERQ) {
1790          amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1791       } else {
1792          vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1793 
1794          if (!get_real_bo(bo)->is_shared)
1795             continue;
1796 
1797          if (buffer->usage & RADEON_USAGE_WRITE) {
1798             shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1799             num_shared_buf_write++;
1800          } else {
1801             num_shared_buf_read++;
1802             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1803                get_real_bo(bo)->kms_handle;
1804          }
1805       }
1806    }
1807 
1808    /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1809    for (; i < num_real_buffers_except_sparse; i++) {
1810       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1811       struct amdgpu_winsys_bo *bo = buffer->bo;
1812 
1813       if (queue_type == KERNELQ_ALT_FENCE)
1814          get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1815       else
1816          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1817 
1818       if (queue_type != USERQ) {
1819          amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1820       } else {
1821          vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1822 
1823          if (!get_real_bo(bo)->is_shared)
1824             continue;
1825 
1826          if (buffer->usage & RADEON_USAGE_WRITE) {
1827             shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1828             num_shared_buf_write++;
1829          } else {
1830             num_shared_buf_read++;
1831             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1832                get_real_bo(bo)->kms_handle;
1833          }
1834       }
1835    }
1836 
1837    /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1838    for (; i < num_real_buffers; ++i) {
1839       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1840 
1841       if (queue_type != USERQ) {
1842          amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1843       } else {
1844          if (!get_real_bo(buffer->bo)->is_shared)
1845             continue;
1846          if (buffer->usage & RADEON_USAGE_WRITE) {
1847             shared_buf_kms_handles[num_shared_buf_write] =
1848                get_real_bo(buffer->bo)->kms_handle;
1849             num_shared_buf_write++;
1850          } else {
1851             num_shared_buf_read++;
1852             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1853                get_real_bo(buffer->bo)->kms_handle;
1854          }
1855       }
1856    }
1857 
1858 #if 0 /* Debug code. */
1859    printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1860 
1861    /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1862    for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1863       if (i == acs->queue_index)
1864          continue;
1865 
1866       struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1867       if (!fence) {
1868          if (i <= 1)
1869             printf("      queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no);
1870          continue;
1871       }
1872 
1873       bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1874       uint_seq_no old = seq_no_dependencies.seq_no[i];
1875       add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
1876       uint_seq_no new = seq_no_dependencies.seq_no[i];
1877 
1878       if (!valid)
1879          printf("   missing dependency on queue=%u, seq_no=%u\n", i, new);
1880       else if (old != new)
1881          printf("   too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1882       else
1883          printf("   has dependency on queue=%u, seq_no=%u\n", i, old);
1884    }
1885 #endif
1886 
1887    /* Convert the sequence numbers we gathered to fence dependencies. */
1888    u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1889       struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
1890 
1891       if (fence) {
1892          /* If it's idle, don't add it to the list of dependencies. */
1893          if (amdgpu_fence_wait(*fence, 0, false))
1894             amdgpu_fence_reference(fence, NULL);
1895          else
1896             add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1897       }
1898    }
1899 
1900    if (queue_type != KERNELQ_ALT_FENCE) {
1901       /* Finally, add the IB fence into the fence ring of the queue. */
1902       amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1903       queue->latest_seq_no = next_seq_no;
1904       ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1905 
1906       /* Update the last used context in the queue. */
1907       amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1908    }
1909    simple_mtx_unlock(&aws->bo_fence_lock);
1910 
1911 #if MESA_DEBUG
1912    /* Prepare the buffer list. */
1913    if (aws->debug_all_bos) {
1914       /* The buffer list contains all buffers. This is a slow path that
1915        * ensures that no buffer is missing in the BO list.
1916        */
1917       simple_mtx_lock(&aws->global_bo_list_lock);
1918       if (queue_type != USERQ) {
1919          bo_list = (struct drm_amdgpu_bo_list_entry *)
1920                    alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1921          num_real_buffers = 0;
1922          list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1923             bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1924             bo_list[num_real_buffers].bo_priority = 0;
1925             ++num_real_buffers;
1926          }
1927       } else {
1928          shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t));
1929          num_shared_buf_write = 0;
1930          num_shared_buf_read = 0;
1931          list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1932             shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle;
1933             num_shared_buf_write++;
1934          }
1935       }
1936       simple_mtx_unlock(&aws->global_bo_list_lock);
1937    }
1938 #endif
1939 
1940    if (acs->ip_type == AMD_IP_GFX)
1941       aws->gfx_bo_list_counter += num_real_buffers;
1942 
1943    if (out_of_memory) {
1944       r = -ENOMEM;
1945    } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1946       r = -ECANCELED;
1947    } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
1948       r = 0;
1949    } else {
1950       if (queue_type != USERQ) {
1951          /* Submit the command buffer.
1952           *
1953           * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
1954           * quite often, but it eventually succeeds after enough attempts. This happens frequently
1955           * with dEQP using NGG streamout.
1956           */
1957          r = 0;
1958 
1959          do {
1960             /* Wait 1 ms and try again. */
1961             if (r == -ENOMEM)
1962                os_time_sleep(1000);
1963 
1964             r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no);
1965          } while (r == -ENOMEM);
1966 
1967          if (!r) {
1968             /* Success. */
1969             uint64_t *user_fence = NULL;
1970 
1971             /* Need to reserve 4 QWORD for user fence:
1972              *   QWORD[0]: completed fence
1973              *   QWORD[1]: preempted fence
1974              *   QWORD[2]: reset fence
1975              *   QWORD[3]: preempted then reset
1976              */
1977             if (has_user_fence)
1978                user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1979             amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1980          }
1981       } else {
1982          struct amdgpu_userq *userq = &queue->userq;
1983          r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
1984                                        &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
1985                                        num_shared_buf_read, &seq_no, vm_timeline_point);
1986          if (!r) {
1987             /* Success. */
1988             amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr);
1989          }
1990       }
1991    }
1992 
1993    if (unlikely(r)) {
1994       if (r == -ECANCELED) {
1995          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1996                                         "amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1997       } else if (r == -ENODATA) {
1998          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1999                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a soft recovery.\n");
2000       } else if (r == -ETIME) {
2001          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
2002                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery.\n");
2003       } else {
2004          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
2005                                         PIPE_UNKNOWN_CONTEXT_RESET,
2006                                         "amdgpu: The CS has been rejected, "
2007                                         "see dmesg for more information (%i).\n",
2008                                         r);
2009       }
2010    }
2011 
2012    /* If there was an error, signal the fence, because it won't be signalled
2013     * by the hardware. */
2014    if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
2015       amdgpu_fence_signalled(cs->fence);
2016 
2017    if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
2018       acs->mcbp_fw_shadow_chunk.flags = 0;
2019 
2020    cs->error_code = r;
2021 
2022    /* Clear the buffer lists. */
2023    for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
2024       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
2025       unsigned num_buffers = cs->buffer_lists[list].num_buffers;
2026 
2027       if (list == AMDGPU_BO_REAL) {
2028          /* Only decrement num_active_ioctls and unref where we incremented them.
2029           * We did both for regular real BOs. We only incremented the refcount for sparse
2030           * backing BOs.
2031           */
2032          /* Regular real BOs. */
2033          for (unsigned i = 0; i < initial_num_real_buffers; i++) {
2034             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2035             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2036          }
2037 
2038          /* Do nothing for slab BOs. */
2039 
2040          /* Sparse backing BOs. */
2041          for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
2042             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2043       } else {
2044          for (unsigned i = 0; i < num_buffers; i++) {
2045             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2046             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2047          }
2048       }
2049 
2050       cs->buffer_lists[list].num_buffers = 0;
2051    }
2052 
2053    amdgpu_cs_context_cleanup(aws, cs);
2054 }
2055 
2056 /* Make sure the previous submission is completed. */
2057 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
2058 {
2059    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2060 
2061    /* Wait for any pending ioctl of this CS to complete. */
2062    util_queue_fence_wait(&cs->flush_completed);
2063 }
2064 
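/* Flush the current CS: pad the IB for the target IP, create or attach a fence, hand the
 * filled context over to the submit thread (amdgpu_cs_submit_ib), and start a fresh IB.
 * Returns the submission's error code when the flush is synchronous, 0 otherwise.
 */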
2065 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
2066                            unsigned flags,
2067                            struct pipe_fence_handle **fence)
2068 {
2069    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2070    struct amdgpu_winsys *aws = cs->aws;
2071    int error_code = 0;
2072    uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
2073 
2074    rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
2075 
2076    /* Pad the IB according to the mask. */
2077    switch (cs->ip_type) {
2078    case AMD_IP_SDMA:
2079       if (aws->info.gfx_level <= GFX6) {
2080          while (rcs->current.cdw & ib_pad_dw_mask)
2081             radeon_emit(rcs, 0xf0000000); /* NOP packet */
2082       } else {
2083          while (rcs->current.cdw & ib_pad_dw_mask)
2084             radeon_emit(rcs, SDMA_NOP_PAD);
2085       }
2086       break;
2087    case AMD_IP_GFX:
2088    case AMD_IP_COMPUTE:
2089       amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
2090       if (cs->ip_type == AMD_IP_GFX)
2091          aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
2092       break;
2093    case AMD_IP_UVD:
2094    case AMD_IP_UVD_ENC:
2095       while (rcs->current.cdw & ib_pad_dw_mask)
2096          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
2097       break;
2098    case AMD_IP_VCN_JPEG:
2099       if (rcs->current.cdw % 2)
2100          assert(0);
2101       while (rcs->current.cdw & ib_pad_dw_mask) {
2102          radeon_emit(rcs, 0x60000000); /* nop packet */
2103          radeon_emit(rcs, 0x00000000);
2104       }
2105       break;
2106    case AMD_IP_VCN_DEC:
2107       while (rcs->current.cdw & ib_pad_dw_mask)
2108          radeon_emit(rcs, 0x81ff); /* nop packet */
2109       break;
2110    default:
2111       break;
2112    }
2113 
2114    if (rcs->current.cdw > rcs->current.max_dw) {
2115       fprintf(stderr, "amdgpu: command stream overflowed\n");
2116    }
2117 
2118    /* If the CS is non-empty, hasn't overflowed, and isn't flushed as a no-op... */
2119    if (likely(radeon_emitted(rcs, 0) &&
2120        rcs->current.cdw <= rcs->current.max_dw &&
2121        !(flags & RADEON_FLUSH_NOOP))) {
2122       struct amdgpu_cs_context *cur = cs->csc;
2123 
2124       /* Set IB sizes. */
2125       amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
2126 
2127       /* Create a fence. */
2128       amdgpu_fence_reference(&cur->fence, NULL);
2129       if (cs->next_fence) {
2130          /* just move the reference */
2131          cur->fence = cs->next_fence;
2132          cs->next_fence = NULL;
2133       } else {
2134          cur->fence = amdgpu_fence_create(cs);
2135       }
2136       if (fence)
2137          amdgpu_fence_reference(fence, cur->fence);
2138 
2139       for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
2140          unsigned num_buffers = cur->buffer_lists[i].num_buffers;
2141          struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
2142 
2143          for (unsigned j = 0; j < num_buffers; j++)
2144             p_atomic_inc(&buffers[j].bo->num_active_ioctls);
2145       }
2146 
2147       amdgpu_cs_sync_flush(rcs);
2148 
2149       cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
2150       if (cs->noop && cs->ip_type == AMD_IP_GFX) {
2151          /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
2152          unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
2153          assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
2154 
2155          cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
2156          cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
2157       }
2158 
2159       /* Swap command streams. "cst" is going to be submitted. */
2160       rcs->csc = cs->csc = cs->cst;
2161       cs->cst = cur;
2162 
2163       /* only gfx, compute and sdma queues are supported in userqueues. */
2164       if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) {
2165          util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2166                             amdgpu_cs_submit_ib<USERQ>, NULL, 0);
2167       } else {
2168          util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2169                             cs->uses_alt_fence ?
2170                                amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
2171                                : amdgpu_cs_submit_ib<KERNELQ>,
2172                             NULL, 0);
2173       }
2174 
2175       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2176          cs->csc->secure = !cs->cst->secure;
2177       else
2178          cs->csc->secure = cs->cst->secure;
2179 
2180       if (!(flags & PIPE_FLUSH_ASYNC)) {
2181          amdgpu_cs_sync_flush(rcs);
2182          error_code = cur->error_code;
2183       }
2184    } else {
2185       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2186          cs->csc->secure = !cs->csc->secure;
2187 
2188       amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
2189       amdgpu_cs_context_cleanup(aws, cs->csc);
2190    }
2191 
2192    memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
2193 
2194    amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
2195 
2196    if (cs->preamble_ib_bo) {
2197       amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
2198                            RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
2199    }
2200 
2201    if (cs->ip_type == AMD_IP_GFX)
2202       aws->num_gfx_IBs++;
2203    else if (cs->ip_type == AMD_IP_SDMA)
2204       aws->num_sdma_IBs++;
2205 
2206    return error_code;
2207 }
2208 
2209 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
2210                                     struct pb_buffer_lean *_buf,
2211                                     unsigned usage)
2212 {
2213    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2214    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
2215 
2216    return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
2217 }
2218 
2219 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
2220                                                 uint64_t csa_va)
2221 {
2222    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2223    cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
2224    cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
2225    cs->mcbp_fw_shadow_chunk.gds_va = 0;
2226    cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
2227 }
2228 
2229 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
2230                                           struct pipe_fence_handle **dst,
2231                                           struct pipe_fence_handle *src)
2232 {
2233    amdgpu_fence_reference(dst, src);
2234 }
2235 
2236 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
2237 {
2238    sws->base.ctx_create = amdgpu_ctx_create;
2239    sws->base.ctx_destroy = amdgpu_ctx_destroy;
2240    sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
2241    sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
2242    sws->base.cs_create = amdgpu_cs_create;
2243    sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
2244    sws->base.cs_destroy = amdgpu_cs_destroy;
2245    sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
2246    sws->base.cs_validate = amdgpu_cs_validate;
2247    sws->base.cs_check_space = amdgpu_cs_check_space;
2248    sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
2249    sws->base.cs_flush = amdgpu_cs_flush;
2250    sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
2251    sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
2252    sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
2253    sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
2254    sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
2255    sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
2256    sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
2257    sws->base.fence_reference = amdgpu_winsys_fence_reference;
2258    sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
2259    sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
2260    sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
2261    sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
2262 
2263    if (sws->aws->info.has_fw_based_shadowing)
2264       sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
2265 }
2266