1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * Copyright © 2015 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "amdgpu_winsys.h"
12 #include "util/os_time.h"
13 #include <inttypes.h>
14 #include <stdio.h>
15 
16 #include "amd/common/sid.h"
17 
18 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
19  * codes in the kernel).
20  */
21 #if DETECT_OS_OPENBSD
22 #define ENODATA ENOTSUP
23 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
24 #define ENODATA ECONNREFUSED
25 #endif
26 
27 /* FENCES */
28 
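/* Each fence is backed by a DRM syncobj. Fences created for our own submissions
 * also carry a user fence: a sequence number that the GPU writes to
 * user_fence_cpu_address, which lets amdgpu_fence_wait() poll cheaply on the CPU
 * before falling back to the syncobj wait ioctl.
 */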
29 void amdgpu_fence_destroy(struct amdgpu_fence *fence)
30 {
31    ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);
32 
33    if (fence->ctx)
34       amdgpu_ctx_reference(&fence->ctx, NULL);
35 
36    util_queue_fence_destroy(&fence->submitted);
37    FREE(fence);
38 }
39 
40 static struct pipe_fence_handle *
41 amdgpu_fence_create(struct amdgpu_cs *cs)
42 {
43    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
44    struct amdgpu_ctx *ctx = cs->ctx;
45 
46    fence->reference.count = 1;
47    fence->aws = ctx->aws;
48    amdgpu_ctx_reference(&fence->ctx, ctx);
49    fence->ctx = ctx;
50    fence->ip_type = cs->ip_type;
51    if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
52       FREE(fence);
53       return NULL;
54    }
55 
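   /* 'submitted' starts unsignaled: amdgpu_fence_submitted() signals it once the
    * submission has been processed and a sequence number assigned, so waiters in
    * amdgpu_fence_wait() block until then.
    */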
56    util_queue_fence_init(&fence->submitted);
57    util_queue_fence_reset(&fence->submitted);
58    fence->queue_index = cs->queue_index;
59    return (struct pipe_fence_handle *)fence;
60 }
61 
62 static struct pipe_fence_handle *
63 amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
64 {
65    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
66    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
67    int r;
68 
69    if (!fence)
70       return NULL;
71 
72    pipe_reference_init(&fence->reference, 1);
73    fence->aws = aws;
74    fence->ip_type = 0xffffffff;
75 
76    r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
77    if (r) {
78       FREE(fence);
79       return NULL;
80    }
81 
82    util_queue_fence_init(&fence->submitted);
83    fence->imported = true;
84 
85    return (struct pipe_fence_handle*)fence;
86 }
87 
88 static struct pipe_fence_handle *
89 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
90 {
91    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
92    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
93 
94    if (!fence)
95       return NULL;
96 
97    pipe_reference_init(&fence->reference, 1);
98    fence->aws = aws;
99    /* fence->ctx == NULL means that the fence is syncobj-based. */
100 
101    /* Convert sync_file into syncobj. */
102    int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
103    if (r) {
104       FREE(fence);
105       return NULL;
106    }
107 
108    r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
109    if (r) {
110       ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
111       FREE(fence);
112       return NULL;
113    }
114 
115    util_queue_fence_init(&fence->submitted);
116    fence->imported = true;
117 
118    return (struct pipe_fence_handle*)fence;
119 }
120 
121 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
122                                          struct pipe_fence_handle *pfence)
123 {
124    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
125    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
126    int fd, r;
127 
128    util_queue_fence_wait(&fence->submitted);
129 
130    /* Convert syncobj into sync_file. */
131    r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
132    return r ? -1 : fd;
133 }
134 
135 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
136 {
137    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
138    uint32_t syncobj;
139    int fd = -1;
140 
141    int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
142                                      &syncobj);
143    if (r) {
144       return -1;
145    }
146 
147    r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd);
148    if (r) {
149       fd = -1;
150    }
151 
152    ac_drm_cs_destroy_syncobj(aws->fd, syncobj);
153    return fd;
154 }
155 
156 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
157                                    uint64_t seq_no,
158                                    uint64_t *user_fence_cpu_address)
159 {
160    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
161 
162    afence->seq_no = seq_no;
163    afence->user_fence_cpu_address = user_fence_cpu_address;
164    util_queue_fence_signal(&afence->submitted);
165 }
166 
167 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
168 {
169    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
170 
171    afence->signalled = true;
172    util_queue_fence_signal(&afence->submitted);
173 }
174 
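/* Wait for a fence. 'timeout' is relative unless 'absolute' is set. The user
 * fence in memory is polled first when available; the syncobj wait ioctl is the
 * fallback.
 */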
175 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
176                        bool absolute)
177 {
178    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
179    int64_t abs_timeout;
180    uint64_t *user_fence_cpu;
181 
182    if (afence->signalled)
183       return true;
184 
185    if (absolute)
186       abs_timeout = timeout;
187    else
188       abs_timeout = os_time_get_absolute_timeout(timeout);
189 
190    /* The fence might not have a number assigned if its IB is being
191     * submitted in the other thread right now. Wait until the submission
192     * is done. */
193    if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
194       return false;
195 
196    user_fence_cpu = afence->user_fence_cpu_address;
197    if (user_fence_cpu) {
198       if (*user_fence_cpu >= afence->seq_no) {
199          afence->signalled = true;
200          return true;
201       }
202 
203       /* No timeout, just query: no need for the ioctl. */
204       if (!absolute && !timeout)
205          return false;
206    }
207 
208    if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
209       abs_timeout = INT64_MAX;
210 
211    if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1,
212                               abs_timeout, 0, NULL))
213       return false;
214 
215    /* Check that guest-side syncobj agrees with the user fence. */
216    if (user_fence_cpu && afence->aws->info.is_virtio)
217       assert(afence->seq_no <= *user_fence_cpu);
218 
219    afence->signalled = true;
220    return true;
221 }
222 
223 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
224                                           struct pipe_fence_handle *fence,
225                                           uint64_t timeout)
226 {
227    return amdgpu_fence_wait(fence, timeout, false);
228 }
229 
230 static struct pipe_fence_handle *
231 amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
232 {
233    struct amdgpu_cs *cs = amdgpu_cs(rcs);
234    struct pipe_fence_handle *fence = NULL;
235 
236    if (cs->noop)
237       return NULL;
238 
239    if (cs->next_fence) {
240       amdgpu_fence_reference(&fence, cs->next_fence);
241       return fence;
242    }
243 
244    fence = amdgpu_fence_create(cs);
245    if (!fence)
246       return NULL;
247 
248    amdgpu_fence_reference(&cs->next_fence, fence);
249    return fence;
250 }
251 
252 /* CONTEXTS */
253 
254 static uint32_t
255 radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
256 {
257    switch (radeon_priority) {
258    case RADEON_CTX_PRIORITY_REALTIME:
259       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
260    case RADEON_CTX_PRIORITY_HIGH:
261       return AMDGPU_CTX_PRIORITY_HIGH;
262    case RADEON_CTX_PRIORITY_MEDIUM:
263       return AMDGPU_CTX_PRIORITY_NORMAL;
264    case RADEON_CTX_PRIORITY_LOW:
265       return AMDGPU_CTX_PRIORITY_LOW;
266    default:
267       unreachable("Invalid context priority");
268    }
269 }
270 
271 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
272                                                    enum radeon_ctx_priority priority,
273                                                    bool allow_context_lost)
274 {
275    struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
276    int r;
277    struct amdgpu_bo_alloc_request alloc_buffer = {};
278    uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
279    ac_drm_device *dev;
280    ac_drm_bo buf_handle;
281 
282    if (!ctx)
283       return NULL;
284 
285    ctx->aws = amdgpu_winsys(rws);
286    ctx->reference.count = 1;
287    ctx->allow_context_lost = allow_context_lost;
288 
289    dev = ctx->aws->dev;
290 
291    r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle);
292    if (r) {
293       fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
294       goto error_create;
295    }
296 
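   /* Allocate one GTT page to hold the user fence values written by the GPU;
    * each IP type uses its own slot inside this buffer (see the cs->ip_type * 4
    * offset in amdgpu_cs_create).
    */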
297    alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
298    alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
299    alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
300 
301    r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle);
302    if (r) {
303       fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
304       goto error_user_fence_alloc;
305    }
306 
307    ctx->user_fence_cpu_address_base = NULL;
308    r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base);
309    if (r) {
310       fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
311       goto error_user_fence_map;
312    }
313 
314    memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
315    ctx->user_fence_bo = buf_handle;
316    ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle);
317 
318    return (struct radeon_winsys_ctx*)ctx;
319 
320 error_user_fence_map:
321    ac_drm_bo_free(dev, buf_handle);
322 
323 error_user_fence_alloc:
324    ac_drm_cs_ctx_free(dev, ctx->ctx_handle);
325 error_create:
326    FREE(ctx);
327    return NULL;
328 }
329 
330 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
331 {
332    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
333 
334    amdgpu_ctx_reference(&ctx, NULL);
335 }
336 
337 static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
338                                       uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
339 {
340    unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
341    unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
342 
343    if (unaligned_dw) {
344       int remaining = pad_dw_mask + 1 - unaligned_dw;
345 
346       /* Only pad by 1 dword with the type-2 NOP if necessary. */
347       if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
348          ib[(*num_dw)++] = PKT2_NOP_PAD;
349       } else {
350          /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
351           * packet. The size of the packet body after the header is always count + 1.
352           * If count == -1, there is no packet body. NOP is the only packet that can have
353           * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
354           */
355          ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
356          *num_dw += remaining - 1;
357       }
358    }
359    assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
360 }
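/* Worked example with illustrative numbers: if ib_pad_dw_mask is 7, *num_dw is 13
 * and leave_dw_space is 0, then unaligned_dw = 5 and remaining = 3, so a single
 * PKT3_NOP with count = 1 (1 header dword + 2 body dwords) pads the IB to
 * 16 dwords.
 */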
361 
362 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
363 {
364    struct amdgpu_bo_alloc_request request = {0};
365    struct drm_amdgpu_bo_list_in bo_list_in;
366    struct drm_amdgpu_cs_chunk_ib ib_in = {0};
367    ac_drm_bo bo;
368    amdgpu_va_handle va_handle = NULL;
369    struct drm_amdgpu_cs_chunk chunks[2];
370    struct drm_amdgpu_bo_list_entry list;
371    unsigned noop_dw_size;
372    void *cpu = NULL;
373    uint64_t seq_no;
374    uint64_t va;
375    int r;
376 
377    /* Older amdgpu doesn't report if the reset is complete or not. Detect
378     * it by submitting a no-op job. If it reports an error, then assume
379     * that the reset is not complete.
380     */
381    uint32_t temp_ctx_handle;
382    r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle);
383    if (r)
384       return r;
385 
386    request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
387    request.alloc_size = 4096;
388    request.phys_alignment = 4096;
389    r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo);
390    if (r)
391       goto destroy_ctx;
392 
393    r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
394                              request.alloc_size, request.phys_alignment,
395                              0, &va, &va_handle,
396                              AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
397    if (r)
398       goto destroy_bo;
399 
400    uint32_t kms_handle;
401    ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle);
402 
403    r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va,
404                            AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
405                            AMDGPU_VA_OP_MAP);
406    if (r)
407       goto destroy_bo;
408 
409    r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu);
410    if (r)
411       goto destroy_bo;
412 
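   /* Build a single NOP packet: 1 header dword plus (noop_dw_size - 1) body
    * dwords, so the no-op IB is exactly noop_dw_size dwords long.
    */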
413    noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
414    ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
415 
416    ac_drm_bo_cpu_unmap(ctx->aws->dev, bo);
417 
418    list.bo_handle = kms_handle;
419    ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &list.bo_handle);
420    list.bo_priority = 0;
421 
422    bo_list_in.list_handle = ~0;
423    bo_list_in.bo_number = 1;
424    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
425    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
426 
427    ib_in.ip_type = AMD_IP_GFX;
428    ib_in.ib_bytes = noop_dw_size * 4;
429    ib_in.va_start = va;
430 
431    chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
432    chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
433    chunks[0].chunk_data = (uintptr_t)&bo_list_in;
434 
435    chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
436    chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
437    chunks[1].chunk_data = (uintptr_t)&ib_in;
438 
439    r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no);
440 
441 destroy_bo:
442    if (va_handle)
443       ac_drm_va_range_free(va_handle);
444    ac_drm_bo_free(ctx->aws->dev, bo);
445 destroy_ctx:
446    ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle);
447 
448    return r;
449 }
450 
451 static void
452 amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
453                                const char *format, ...)
454 {
455    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
456 
457    /* Don't overwrite the last reset status. */
458    if (ctx->sw_status != PIPE_NO_RESET)
459       return;
460 
461    ctx->sw_status = status;
462 
463    if (!ctx->allow_context_lost) {
464       va_list args;
465 
466       va_start(args, format);
467       vfprintf(stderr, format, args);
468       va_end(args);
469 
470       /* Non-robust contexts are allowed to terminate the process. The only alternative is
471        * to skip command submission, which would look like a freeze because nothing is drawn,
472        * which looks like a hang without any reset.
473        */
474       abort();
475    }
476 }
477 
478 static enum pipe_reset_status
479 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
480                               bool *needs_reset, bool *reset_completed)
481 {
482    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
483 
484    if (needs_reset)
485       *needs_reset = false;
486    if (reset_completed)
487       *reset_completed = false;
488 
489    /* Return a failure due to a GPU hang. */
490    uint64_t flags;
491 
492    if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
493       /* If the caller is only interested in full reset (= wants to ignore soft
494        * recoveries), we can use the rejected cs count as a quick first check.
495        */
496       return PIPE_NO_RESET;
497    }
498 
499    /*
500     * ctx->sw_status is updated on alloc/ioctl failures.
501     *
502     * We only rely on amdgpu_cs_query_reset_state2 to tell us
503     * that the context reset is complete.
504     */
505    if (ctx->sw_status != PIPE_NO_RESET) {
506       int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags);
507       if (!r) {
508          if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
509             if (reset_completed) {
510                /* The ARB_robustness spec says:
511                *
512                *    If a reset status other than NO_ERROR is returned and subsequent
513                *    calls return NO_ERROR, the context reset was encountered and
514                *    completed. If a reset status is repeatedly returned, the context may
515                *    be in the process of resetting.
516                *
517                * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
518                * so don't do anything special. On older kernels, submit a no-op cs. If it
519                * succeeds then assume the reset is complete.
520                */
521                if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
522                   *reset_completed = true;
523 
524                if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
525                   *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
526             }
527          }
528       } else {
529          fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
530       }
531 
532       /* Return a failure due to SW issues. */
533       if (needs_reset)
534          *needs_reset = true;
535       return ctx->sw_status;
536    }
537 
538    if (needs_reset)
539       *needs_reset = false;
540    return PIPE_NO_RESET;
541 }
542 
543 /* COMMAND SUBMISSION */
544 
545 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
546 {
547    return acs->ip_type == AMD_IP_GFX ||
548           acs->ip_type == AMD_IP_COMPUTE ||
549           acs->ip_type == AMD_IP_SDMA;
550 }
551 
552 static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
553 {
554    if (cs->has_chaining)
555       return 4; /* for chaining */
556 
557    return 0;
558 }
559 
560 static struct amdgpu_cs_buffer *
561 amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
562                      struct amdgpu_buffer_list *list)
563 {
564    int num_buffers = list->num_buffers;
565    struct amdgpu_cs_buffer *buffers = list->buffers;
566    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
567    int i = cs->buffer_indices_hashlist[hash];
568 
569    /* Fast path: the hash entry is either empty (not found) or may point at this buffer (found). */
570    if (i < 0)
571       return NULL;
572 
573    if (i < num_buffers && buffers[i].bo == bo)
574       return &buffers[i];
575 
576    /* Hash collision, look for the BO in the list of buffers linearly. */
577    for (int i = num_buffers - 1; i >= 0; i--) {
578       if (buffers[i].bo == bo) {
579          /* Put this buffer in the hash list.
580           * This will prevent additional hash collisions if there are
581           * several consecutive lookup_buffer calls for the same buffer.
582           *
583           * Example: Assuming buffers A,B,C collide in the hash list,
584           * the following sequence of buffers:
585           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
586           * will collide here: ^ and here:   ^,
587           * meaning that we should get very few collisions in the end. */
588          cs->buffer_indices_hashlist[hash] = i & 0x7fff;
589          return &buffers[i];
590       }
591    }
592    return NULL;
593 }
594 
595 struct amdgpu_cs_buffer *
596 amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
597 {
598    return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
599 }
600 
601 static struct amdgpu_cs_buffer *
602 amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
603                      struct amdgpu_buffer_list *list, bool add_ref)
604 {
605    /* New buffer, check if the backing array is large enough. */
606    if (unlikely(list->num_buffers >= list->max_buffers)) {
607       unsigned new_max =
608          MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
609       struct amdgpu_cs_buffer *new_buffers;
610 
611       new_buffers = (struct amdgpu_cs_buffer *)
612                     REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
613                             new_max * sizeof(*new_buffers));
614       if (!new_buffers) {
615          fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
616          return NULL;
617       }
618 
619       list->max_buffers = new_max;
620       list->buffers = new_buffers;
621    }
622 
623    unsigned idx = list->num_buffers++;
624    struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
625    if (add_ref)
626       p_atomic_inc(&bo->base.reference.count);
627    buffer->bo = bo;
628    buffer->usage = 0;
629 
630    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
631    cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
632    return buffer;
633 }
634 
635 static struct amdgpu_cs_buffer *
636 amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
637                             struct amdgpu_buffer_list *list, bool add_ref)
638 {
639    struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
640 
641    return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
642 }
643 
644 static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
645                                     struct pb_buffer_lean *buf,
646                                     unsigned usage,
647                                     enum radeon_bo_domain domains)
648 {
649    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
650     * the buffer placement during command submission.
651     */
652    struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
653    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
654    struct amdgpu_cs_buffer *buffer;
655 
656    /* Fast exit for no-op calls.
657     * This is very effective with suballocators and linear uploaders that
658     * are outside of the winsys.
659     */
660    if (bo == cs->last_added_bo &&
661        (usage & cs->last_added_bo_usage) == usage)
662       return 0;
663 
664    buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
665    if (!buffer)
666       return 0;
667 
668    buffer->usage |= usage;
669 
670    cs->last_added_bo_usage = buffer->usage;
671    cs->last_added_bo = bo;
672    return 0;
673 }
674 
675 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
676                                  struct amdgpu_ib *main_ib,
677                                  struct amdgpu_cs *cs)
678 {
679    struct pb_buffer_lean *pb;
680    uint8_t *mapped;
681    unsigned buffer_size;
682 
683    /* Always create a buffer that is at least as large as the maximum seen IB size,
684     * aligned to a power of two.
685     */
686    buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
687 
688    /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
689    if (!cs->has_chaining)
690       buffer_size *= 4;
691 
692    const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
693    /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
694    const unsigned max_size = 2 * 1024 * 1024;
695 
696    buffer_size = MIN2(buffer_size, max_size);
697    buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
698 
699    /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
700     * The speed of writing to GTT WC ranges from no different to very slow, while
701     * VRAM is very slow much more often.
702     *
703     * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
704     * and doesn't have to wait for cached GL2 requests to be processed.
705     */
706    enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
707    unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
708                     RADEON_FLAG_GL2_BYPASS;
709 
710    if (cs->ip_type == AMD_IP_GFX ||
711        cs->ip_type == AMD_IP_COMPUTE ||
712        cs->ip_type == AMD_IP_SDMA) {
713       /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
714        * on Navi 14
715        */
716       flags |= RADEON_FLAG_32BIT;
717    }
718 
719    pb = amdgpu_bo_create(aws, buffer_size,
720                          aws->info.gart_page_size,
721                          domain, (radeon_bo_flag)flags);
722    if (!pb)
723       return false;
724 
725    mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
726    if (!mapped) {
727       radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
728       return false;
729    }
730 
731    radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
732    radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
733 
734    main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
735    main_ib->big_buffer_cpu_ptr = mapped;
736    main_ib->used_ib_space = 0;
737 
738    return true;
739 }
740 
741 static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
742                               struct radeon_cmdbuf *rcs,
743                               struct amdgpu_ib *main_ib,
744                               struct amdgpu_cs *cs)
745 {
746    struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
747    /* This is the minimum size of a contiguous IB. */
748    unsigned ib_size = 16 * 1024;
749 
750    /* Always allocate at least the size of the biggest cs_check_space call,
751     * because precisely the last call might have requested this size.
752     */
753    ib_size = MAX2(ib_size, main_ib->max_check_space_size);
754 
755    if (!cs->has_chaining) {
756       ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
757                                    IB_MAX_SUBMIT_BYTES));
758    }
759 
760    /* Decay the IB buffer size over time, so that memory usage decreases after
761     * a temporary peak.
762     */
763    main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
764 
765    rcs->prev_dw = 0;
766    rcs->num_prev = 0;
767    rcs->current.cdw = 0;
768    rcs->current.buf = NULL;
769 
770    /* Allocate a new buffer for IBs if the current buffer is all used. */
771    if (!main_ib->big_buffer ||
772        main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
773       if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
774          return false;
775    }
776 
777    chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
778    chunk_ib->ib_bytes = 0;
779    /* ib_bytes is in dwords and the conversion to bytes will be done before
780     * the CS ioctl. */
781    main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
782    main_ib->is_chained_ib = false;
783 
784    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
785                         (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
786                         (radeon_bo_domain)0);
787 
788    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
789 
790    cs->csc->ib_main_addr = rcs->current.buf;
791 
792    ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
793    rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
794    return true;
795 }
796 
797 static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
798 {
799    if (ib->is_chained_ib) {
800       *ib->ptr_ib_size = rcs->current.cdw |
801                          S_3F2_CHAIN(1) | S_3F2_VALID(1) |
802                          S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
803    } else {
804       *ib->ptr_ib_size = rcs->current.cdw;
805    }
806 }
807 
808 static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
809                                struct amdgpu_ib *ib, enum amd_ip_type ip_type)
810 {
811    amdgpu_set_ib_size(rcs, ib);
812    ib->used_ib_space += rcs->current.cdw * 4;
813    ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
814    ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
815 }
816 
817 static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
818                                    struct amdgpu_cs_context *cs,
819                                    enum amd_ip_type ip_type)
820 {
821    for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
822       cs->chunk_ib[i].ip_type = ip_type;
823       cs->chunk_ib[i].flags = 0;
824 
825       if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
826          /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
827           * is the beginning of IBs because completion of an IB doesn't care about the state of
828           * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
829           * executed in parallel, so draw calls from the current IB can finish after the next IB
830           * starts drawing, and so the cache flush at the end of IBs is usually late and thus
831           * useless.
832           */
833          cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
834       }
835    }
836 
837    cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
838    cs->last_added_bo = NULL;
839    return true;
840 }
841 
842 static void cleanup_fence_list(struct amdgpu_fence_list *fences)
843 {
844    for (unsigned i = 0; i < fences->num; i++)
845       amdgpu_fence_drop_reference(fences->list[i]);
846    fences->num = 0;
847 }
848 
849 static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
850 {
851    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
852       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
853       unsigned num_buffers = cs->buffer_lists[i].num_buffers;
854 
855       for (unsigned j = 0; j < num_buffers; j++)
856          amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
857 
858       cs->buffer_lists[i].num_buffers = 0;
859    }
860 }
861 
862 static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
863 {
864    cs->seq_no_dependencies.valid_fence_mask = 0;
865    cleanup_fence_list(&cs->syncobj_dependencies);
866    cleanup_fence_list(&cs->syncobj_to_signal);
867    amdgpu_fence_reference(&cs->fence, NULL);
868    cs->last_added_bo = NULL;
869 }
870 
871 static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
872 {
873    amdgpu_cs_context_cleanup_buffers(aws, cs);
874    amdgpu_cs_context_cleanup(aws, cs);
875    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
876       FREE(cs->buffer_lists[i].buffers);
877    FREE(cs->syncobj_dependencies.list);
878    FREE(cs->syncobj_to_signal.list);
879 }
880 
881 
882 static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
883 {
884    struct amdgpu_cs *cs = amdgpu_cs(rcs);
885    return cs->ip_type;
886 }
887 
888 static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
889 {
890    /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
891    return ip_type == AMD_IP_VCN_DEC ||
892           ip_type == AMD_IP_VCN_ENC ||
893           ip_type == AMD_IP_VCN_JPEG;
894 }
895 
896 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
897 {
898    struct amdgpu_cs *cs = amdgpu_cs(rcs);
899 
900    if (!cs)
901       return;
902 
903    amdgpu_cs_sync_flush(rcs);
904    util_queue_fence_destroy(&cs->flush_completed);
905    p_atomic_dec(&cs->aws->num_cs);
906    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
907    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
908    FREE(rcs->prev);
909    amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
910    amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
911    amdgpu_fence_reference(&cs->next_fence, NULL);
912    FREE(cs);
913 }
914 
915 static bool
916 amdgpu_cs_create(struct radeon_cmdbuf *rcs,
917                  struct radeon_winsys_ctx *rwctx,
918                  enum amd_ip_type ip_type,
919                  void (*flush)(void *ctx, unsigned flags,
920                                struct pipe_fence_handle **fence),
921                  void *flush_ctx)
922 {
923    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
924    struct amdgpu_cs *cs;
925 
926    cs = CALLOC_STRUCT(amdgpu_cs);
927    if (!cs) {
928       return false;
929    }
930 
931    util_queue_fence_init(&cs->flush_completed);
932 
933    cs->aws = ctx->aws;
934    cs->ctx = ctx;
935    cs->flush_cs = flush;
936    cs->flush_data = flush_ctx;
937    cs->ip_type = ip_type;
938    cs->noop = ctx->aws->noop_cs;
939    cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
940                       (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
941 
942    /* Compute the queue index by counting the IPs that have queues. */
943    assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
944    assert(ctx->aws->info.ip[ip_type].num_queues);
945 
946    if (ip_uses_alt_fence(ip_type)) {
947       cs->queue_index = INT_MAX;
948       cs->uses_alt_fence = true;
949    } else {
950       cs->queue_index = 0;
951 
952       for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
953          if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
954             continue;
955 
956          if (i == ip_type)
957             break;
958 
959          cs->queue_index++;
960       }
961       assert(cs->queue_index < AMDGPU_MAX_QUEUES);
962    }
963 
964    ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4,
965                                       (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
966 
967    if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
968       FREE(cs);
969       return false;
970    }
971 
972    if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
973       amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
974       FREE(cs);
975       return false;
976    }
977 
978    memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
979 
980    /* Set the first submission context as current. */
981    rcs->csc = cs->csc = &cs->csc1;
982    cs->cst = &cs->csc2;
983 
984    /* Assign to both amdgpu_cs_context; only csc will use it. */
985    cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
986    cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
987 
988    cs->csc1.aws = ctx->aws;
989    cs->csc2.aws = ctx->aws;
990 
991    p_atomic_inc(&ctx->aws->num_cs);
992 
993    if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs))
994       goto fail;
995 
996    /* Currently only the gfx, compute and sdma queues support user queues. */
997    if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) {
998       if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type))
999          goto fail;
1000    }
1001 
1002    rcs->priv = cs;
1003    return true;
1004 fail:
1005    amdgpu_cs_destroy(rcs);
1006    return false;
1007 }
1008 
1009 static bool
1010 amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
1011                            unsigned preamble_num_dw)
1012 {
1013    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1014    struct amdgpu_winsys *aws = cs->aws;
1015    struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
1016    unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
1017    struct pb_buffer_lean *preamble_bo;
1018    uint32_t *map;
1019 
1020    /* Create the preamble IB buffer. */
1021    preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
1022                                   RADEON_DOMAIN_VRAM,
1023                                   (radeon_bo_flag)
1024                                   (RADEON_FLAG_NO_INTERPROCESS_SHARING |
1025                                    RADEON_FLAG_GTT_WC));
1026    if (!preamble_bo)
1027       return false;
1028 
1029    map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
1030                                   (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
1031    if (!map) {
1032       radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
1033       return false;
1034    }
1035 
1036    /* Upload the preamble IB. */
1037    memcpy(map, preamble_ib, preamble_num_dw * 4);
1038 
1039    /* Pad the IB. */
1040    amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
1041    amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
1042 
1043    for (unsigned i = 0; i < 2; i++) {
1044       csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1045       csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1046 
1047       csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1048    }
1049 
1050    assert(!cs->preamble_ib_bo);
1051    cs->preamble_ib_bo = preamble_bo;
1052 
1053    amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1054                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1055    return true;
1056 }
1057 
1058 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1059 {
1060    return true;
1061 }
1062 
1063 static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1064 {
1065    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1066    struct amdgpu_ib *main_ib = &cs->main_ib;
1067 
1068    assert(rcs->current.cdw <= rcs->current.max_dw);
1069 
1070    unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1071 
1072    if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1073       return false;
1074 
1075    if (rcs->current.max_dw - rcs->current.cdw >= dw)
1076       return true;
1077 
1078    unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1079    unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1080    /* 125% of the size for IB epilog. */
1081    unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1082    main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1083    main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1084 
1085    if (!cs->has_chaining)
1086       return false;
1087 
1088    /* Allocate a new chunk */
1089    if (rcs->num_prev >= rcs->max_prev) {
1090       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1091       struct radeon_cmdbuf_chunk *new_prev;
1092 
1093       new_prev = (struct radeon_cmdbuf_chunk*)
1094                  REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1095                          sizeof(*new_prev) * new_max_prev);
1096       if (!new_prev)
1097          return false;
1098 
1099       rcs->prev = new_prev;
1100       rcs->max_prev = new_max_prev;
1101    }
1102 
1103    if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
1104       return false;
1105 
1106    assert(main_ib->used_ib_space == 0);
1107    uint64_t va = main_ib->gpu_address;
1108 
1109    /* This space was originally reserved. */
1110    rcs->current.max_dw += cs_epilog_dw;
1111 
1112    /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1113    amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1114 
1115    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1116    radeon_emit(rcs, va);
1117    radeon_emit(rcs, va >> 32);
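   /* Reserve the size dword of the INDIRECT_BUFFER packet; it stays unwritten
    * here and is patched by amdgpu_set_ib_size() once the size of the chained IB
    * is known.
    */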
1118    uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1119 
1120    assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1121    assert(rcs->current.cdw <= rcs->current.max_dw);
1122 
1123    amdgpu_set_ib_size(rcs, main_ib);
1124    main_ib->ptr_ib_size = new_ptr_ib_size;
1125    main_ib->is_chained_ib = true;
1126 
1127    /* Hook up the new chunk */
1128    rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1129    rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1130    rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1131    rcs->num_prev++;
1132 
1133    rcs->prev_dw += rcs->current.cdw;
1134    rcs->current.cdw = 0;
1135 
1136    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1137    rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1138 
1139    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1140                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1141 
1142    return true;
1143 }
1144 
1145 static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1146 {
1147    unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1148    struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1149 
1150    for (unsigned i = 0; i < num_buffers; i++) {
1151       struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1152       struct amdgpu_cs_buffer *real_buffer =
1153          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1154                                      &cs->buffer_lists[AMDGPU_BO_REAL], true);
1155 
1156       /* We need to set the usage because it determines the BO priority.
1157        *
1158        * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1159        * BO fences to fence dependencies. Only the slab entries should do that.
1160        */
1161       real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1162    }
1163 }
1164 
1165 static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1166                                           struct radeon_bo_list_item *list)
1167 {
1168     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1169 
1170     /* We do this in the CS thread, but since we need to return the final usage of all buffers
1171      * here, do it here too. There is no harm in doing it again in the CS thread.
1172      */
1173     amdgpu_add_slab_backing_buffers(cs);
1174 
1175     struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1176     unsigned num_real_buffers = real_buffers->num_buffers;
1177 
1178 #if HAVE_AMDGPU_VIRTIO
1179     assert(!cs->aws->info.is_virtio);
1180 #endif
1181 
1182     if (list) {
1183         for (unsigned i = 0; i < num_real_buffers; i++) {
1184             list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1185             list[i].vm_address =
1186                amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1187             list[i].priority_usage = real_buffers->buffers[i].usage;
1188         }
1189     }
1190     return num_real_buffers;
1191 }
1192 
1193 static void add_fence_to_list(struct amdgpu_fence_list *fences,
1194                               struct amdgpu_fence *fence)
1195 {
1196    unsigned idx = fences->num++;
1197 
1198    if (idx >= fences->max) {
1199       unsigned size;
1200       const unsigned increment = 8;
1201 
1202       fences->max = idx + increment;
1203       size = fences->max * sizeof(fences->list[0]);
1204       fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1205    }
1206    amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1207 }
1208 
1209 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
1210                                            struct pipe_fence_handle *pfence)
1211 {
1212    struct amdgpu_cs *acs = amdgpu_cs(rcs);
1213    struct amdgpu_cs_context *cs = acs->csc;
1214    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1215 
1216    util_queue_fence_wait(&fence->submitted);
1217 
1218    if (!fence->imported) {
1219       /* Ignore idle fences. This will only check the user fence in memory. */
1220       if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1221          add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
1222                             fence->queue_seq_no);
1223       }
1224    }
1225    else
1226       add_fence_to_list(&cs->syncobj_dependencies, fence);
1227 }
1228 
1229 static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
1230                                               struct amdgpu_cs_context *cs,
1231                                               unsigned queue_index_bit,
1232                                               struct amdgpu_seq_no_fences *dependencies,
1233                                               struct amdgpu_winsys_bo *bo, unsigned usage)
1234 {
1235    if (usage & RADEON_USAGE_SYNCHRONIZED) {
1236       /* Add BO fences from queues other than 'queue_index' to dependencies. */
1237       u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1238          add_seq_no_to_list(ws, dependencies, other_queue_idx,
1239                             bo->fences.seq_no[other_queue_idx]);
1240       }
1241 
1242       if (bo->alt_fence)
1243          add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1244    }
1245 }
1246 
1247 static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
1248                                  uint_seq_no new_queue_seq_no)
1249 {
1250    bo->fences.seq_no[queue_index] = new_queue_seq_no;
1251    bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1252 }
1253 
1254 static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
1255                                          struct amdgpu_winsys_bo *bo, unsigned usage)
1256 {
1257    bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1258    bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1259 }
1260 
1261 static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1262                                          struct pipe_fence_handle *fence)
1263 {
1264    struct amdgpu_cs *acs = amdgpu_cs(rws);
1265    struct amdgpu_cs_context *cs = acs->csc;
1266 
1267    add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1268 }
1269 
1270 static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
1271                                        unsigned num_real_buffers,
1272                                        struct drm_amdgpu_bo_list_entry *bo_list_real,
1273                                        uint64_t *seq_no)
1274 {
1275    struct amdgpu_winsys *aws = acs->aws;
1276    struct amdgpu_cs_context *cs = acs->cst;
1277    struct drm_amdgpu_bo_list_in bo_list_in;
1278    struct drm_amdgpu_cs_chunk chunks[8];
1279    unsigned num_chunks = 0;
1280 
1281    /* BO list */
1282    bo_list_in.operation = ~0;
1283    bo_list_in.list_handle = ~0;
1284    bo_list_in.bo_number = num_real_buffers;
1285    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1286    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real;
1287 
1288    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1289    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1290    chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1291    num_chunks++;
1292 
1293    /* Syncobj dependencies. */
1294    unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1295    if (num_syncobj_dependencies) {
1296       struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1297          (struct drm_amdgpu_cs_chunk_sem *)
1298          alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1299 
1300       for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1301          struct amdgpu_fence *fence =
1302             (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1303 
1304          assert(util_queue_fence_is_signalled(&fence->submitted));
1305          sem_chunk[i].handle = fence->syncobj;
1306       }
1307 
1308       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1309       chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1310       chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1311       num_chunks++;
1312    }
1313 
1314    /* Syncobj signals. */
1315    unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1316    struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1317       (struct drm_amdgpu_cs_chunk_sem *)
1318       alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1319 
1320    for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1321       struct amdgpu_fence *fence =
1322          (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1323 
1324       sem_chunk[i].handle = fence->syncobj;
1325    }
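   /* The last signal slot is the syncobj of this submission's own fence. */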
1326    sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1327 
1328    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1329    chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1330    chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1331    num_chunks++;
1332 
1333    if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1334       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1335       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1336       chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1337       num_chunks++;
1338    }
1339 
1340    /* Fence */
1341    if (amdgpu_cs_has_user_fence(acs)) {
1342       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1343       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1344       chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1345       num_chunks++;
1346    }
1347 
1348    /* IB */
1349    if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1350       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1351       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1352       chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1353       num_chunks++;
1354    }
1355 
1356    /* IB */
1357    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1358    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1359    chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1360    num_chunks++;
1361 
1362    if (cs->secure) {
1363       cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1364       cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1365    } else {
1366       cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1367       cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1368    }
1369 
1370    assert(num_chunks <= 8);
1371 
1372    /* Submit the command buffer.
1373     *
1374     * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
1375     * quite often, but it eventually succeeds after enough attempts. This happens frequently
1376     * with dEQP using NGG streamout.
1377     */
1378    int r = 0;
1379 
1380    do {
1381       /* Wait 1 ms and try again. */
1382       if (r == -ENOMEM)
1383          os_time_sleep(1000);
1384 
1385       r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no);
1386    } while (r == -ENOMEM);
1387 
1388    return r;
1389 }
1390 
1391 static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq,
1392                                         struct amdgpu_cs_context *cs,
1393                                         uint64_t num_fences,
1394                                         struct drm_amdgpu_userq_fence_info *fence_info)
1395 {
1396    amdgpu_pkt_begin();
1397 
1398    if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
1399       if (num_fences) {
1400          unsigned num_fences_in_iter;
1401          /* The FENCE_WAIT_MULTI packet supports at most 32 fences. */
1402          for (unsigned i = 0; i < num_fences; i = i + 32) {
1403             num_fences_in_iter = (i + 32 > num_fences) ? num_fences - i : 32;
1404             amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0));
1405             amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1));
1406             for (unsigned j = 0; j < num_fences_in_iter; j++) {
1407                amdgpu_pkt_add_dw(fence_info[i + j].va);
1408                amdgpu_pkt_add_dw(fence_info[i + j].va >> 32);
1409                amdgpu_pkt_add_dw(fence_info[i + j].value);
1410                amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
1411             }
1412          }
1413       }
1414 
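      /* The packets below flush HDP and then chain to the main IB with an
       * INDIRECT_BUFFER packet; the size is given in dwords and the VMID/MQD state
       * is inherited from the user queue (see the INHERIT_VMID_MQD_* fields).
       */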
1415       amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
1416       amdgpu_pkt_add_dw(0x0);
1417 
1418       amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1419       amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start);
1420       amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32);
1421       if (userq->ip_type == AMD_IP_GFX)
1422          amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1));
1423       else
1424          amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) |
1425                               S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
1426 
1427       /* Add 8 for the release mem packet and 2 for the protected fence signal packet.
1428        * userq_fence_seq_num is calculated this way to match the kernel fence that is
1429        * returned by the userq_wait ioctl.
1430        */
1431       userq->user_fence_seq_num = *userq->wptr_bo_map + __num_dw_written + 8 + 2;
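      /* Worked example with hypothetical numbers: if *wptr_bo_map is 100 and 40 dwords
       * have been written so far, the user fence value becomes 100 + 40 + 8 + 2 = 150,
       * i.e. the ring write pointer after the RELEASE_MEM (8 dwords) and protected
       * fence signal (2 dwords) packets below are appended.
       */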
1432 
1433       /* add release mem for user fence */
1434       amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0));
1435       amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) |
1436                            S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) |
1437                            S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3));
1438       amdgpu_pkt_add_dw(S_030358_DATA_SEL(2));
1439       amdgpu_pkt_add_dw(userq->user_fence_va);
1440       amdgpu_pkt_add_dw(userq->user_fence_va >> 32);
1441       amdgpu_pkt_add_dw(userq->user_fence_seq_num);
1442       amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
1443       amdgpu_pkt_add_dw(0);
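      /* The RELEASE_MEM payload above encodes the EOP event and cache flushes, DATA_SEL
       * selecting a 64-bit fence write, the destination VA (user_fence_va, low then high)
       * and the 64-bit fence value (user_fence_seq_num), followed by a trailing zero dword.
       */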
1444 
1445       /* Protected signal packet. This is a trusted RELEASE_MEM packet, i.e. the fence
1446        * buffer is only accessible from the kernel through VMID 0.
1447        */
1448       amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
1449       amdgpu_pkt_add_dw(0);
1450    } else {
1451       fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
1452    }
1453 
1454    amdgpu_pkt_end();
1455 }
1456 
1457 static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
1458                                      struct amdgpu_cs *acs,
1459                                      uint32_t *shared_buf_kms_handles_write,
1460                                      unsigned num_shared_buf_write,
1461                                      uint32_t *shared_buf_kms_handles_read,
1462                                      unsigned num_shared_buf_read,
1463                                      uint64_t *seq_no,
1464                                      uint64_t vm_timeline_point)
1465 {
1466    int r = 0;
1467    struct amdgpu_winsys *aws = acs->aws;
1468    struct amdgpu_cs_context *cs = acs->cst;
1469 
1470    /* Syncobj dependencies. */
1471    unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1472    uint32_t *syncobj_dependencies_list =
1473       (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
1474 
1475    /* Currently only 1 vm timeline syncobj can be a dependency. */
1476    uint16_t num_syncobj_timeline_dependencies = 1;
1477    uint32_t syncobj_timeline_dependency;
1478    uint64_t syncobj_timeline_dependency_point;
1479 
1480    if (num_syncobj_dependencies) {
1481       for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1482          struct amdgpu_fence *fence =
1483             (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1484 
1485          assert(util_queue_fence_is_signalled(&fence->submitted));
1486          syncobj_dependencies_list[i] = fence->syncobj;
1487       }
1488    }
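   /* Waiting on aws->vm_timeline_syncobj at vm_timeline_point (the highest point gathered
    * from the BOs in this submit) is meant to ensure that all pending VM map/unmap updates
    * for those BOs have completed before the IB executes.
    */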
1489    syncobj_timeline_dependency = aws->vm_timeline_syncobj;
1490    syncobj_timeline_dependency_point = vm_timeline_point;
1491 
1492    /* Syncobj signals. Adding 1 for cs submission fence. */
1493    unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1;
1494    uint32_t *syncobj_signal_list =
1495       (uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));
1496 
1497    for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) {
1498       struct amdgpu_fence *fence =
1499          (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1500 
1501       syncobj_signal_list[i] = fence->syncobj;
1502    }
1503    syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj;
1504 
1505    struct drm_amdgpu_userq_fence_info *fence_info;
1506    struct drm_amdgpu_userq_wait userq_wait_data = {
1507       .syncobj_handles = (uintptr_t)syncobj_dependencies_list,
1508       .syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
1509       .syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
1510       .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1511       .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1512       .num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
1513       .num_fences = 0,
1514       .num_syncobj_handles = num_syncobj_dependencies,
1515       .num_bo_read_handles = num_shared_buf_read,
1516       .num_bo_write_handles = num_shared_buf_write,
1517       .out_fences = (uintptr_t)NULL,
1518    };
1519 
1520    /*
1521     * Synchronization of shared buffers follows these rules:
1522     *   - read-only buffers wait for all previous writes to complete
1523     *   - write (and read-write) buffers wait for all previous reads to complete
1524     * To implement this, we call amdgpu_userq_wait() before submitting a job and
1525     * amdgpu_userq_signal() afterwards to indicate completion.
1526     */
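   /* The wait ioctl is issued twice below: the first call (out_fences == NULL) only
    * reports how many fences must be waited on, and the second call fills the fence_info
    * array that is turned into FENCE_WAIT_MULTI packets by amdgpu_cs_add_userq_packets().
    */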
1527    r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1528    if (r)
1529       fprintf(stderr, "amdgpu: getting wait num_fences failed\n");
1530 
1531    fence_info = (struct drm_amdgpu_userq_fence_info*)
1532       alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
1533    userq_wait_data.out_fences = (uintptr_t)fence_info;
1534 
1535    r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1536    if (r)
1537       fprintf(stderr, "amdgpu: getting wait fences failed\n");
1538 
1539    simple_mtx_lock(&userq->lock);
1540    amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info);
1541    struct drm_amdgpu_userq_signal userq_signal_data = {
1542       .queue_id = userq->userq_handle,
1543       .syncobj_handles = (uintptr_t)syncobj_signal_list,
1544       .num_syncobj_handles = num_syncobj_to_signal,
1545       .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1546       .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1547       .num_bo_read_handles = num_shared_buf_read,
1548       .num_bo_write_handles = num_shared_buf_write,
1549    };
1550 
1551    r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
1552    if (!r)
1553       userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = *userq->wptr_bo_map;
1554 
1555    *seq_no = userq->user_fence_seq_num;
1556    simple_mtx_unlock(&userq->lock);
1557 
1558    return r;
1559 }
1560 
1561 enum queue_type {
1562    KERNELQ,
1563    KERNELQ_ALT_FENCE,
1564    USERQ,
1565 };
1566 
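/* Rough summary of the three submission modes handled below:
 *  - KERNELQ:           submit through the kernel CS ioctl with a BO list; BO busy
 *                       tracking uses per-queue sequence numbers.
 *  - KERNELQ_ALT_FENCE: same ioctl path, but BOs track completion through
 *                       amdgpu_winsys_bo::alt_fence instead of sequence numbers.
 *  - USERQ:             user-mode queue; packets are built in user space and
 *                       inter-process synchronization uses amdgpu_userq_wait/signal.
 */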
1567 /* The template parameter determines whether the queue should skip code used by the default queue
1568  * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1569  * for all BOs.
1570  */
1571 template<enum queue_type queue_type>
1572 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1573 {
1574    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1575    struct amdgpu_winsys *aws = acs->aws;
1576    struct amdgpu_cs_context *cs = acs->cst;
1577    int r;
1578    uint64_t seq_no = 0;
1579    bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1580    /* The maximum timeline point of VM updates for all BOs used in this submit. */
1581    uint64_t vm_timeline_point = 0;
1582 
1583    simple_mtx_lock(&aws->bo_fence_lock);
1584    unsigned queue_index;
1585    struct amdgpu_queue *queue;
1586    uint_seq_no prev_seq_no, next_seq_no;
1587 
1588    if (queue_type != KERNELQ_ALT_FENCE) {
1589       queue_index = acs->queue_index;
1590       queue = &aws->queues[queue_index];
1591       prev_seq_no = queue->latest_seq_no;
1592 
1593       /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1594        * but the values aren't related.
1595        */
1596       next_seq_no = prev_seq_no + 1;
1597 
1598       /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1599        * via the ioctl. We have to do this because we are going to release the oldest fence and
1600        * replace it with the latest fence in the ring.
1601        */
1602       struct pipe_fence_handle **oldest_fence =
1603          &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1604 
1605       if (*oldest_fence) {
1606          if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1607             /* Take the reference because the fence can be released by other threads after we
1608              * unlock the mutex.
1609              */
1610             struct pipe_fence_handle *tmp_fence = NULL;
1611             amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1612 
1613             /* Unlock the mutex before waiting. */
1614             simple_mtx_unlock(&aws->bo_fence_lock);
1615             amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1616             amdgpu_fence_reference(&tmp_fence, NULL);
1617             simple_mtx_lock(&aws->bo_fence_lock);
1618          }
1619 
1620          /* Remove the idle fence from the ring. */
1621          amdgpu_fence_reference(oldest_fence, NULL);
1622       }
1623    }
1624 
1625    /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1626     * sequence number per queue and removes all older ones.
1627     */
1628    struct amdgpu_seq_no_fences seq_no_dependencies;
1629    memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1630 
1631    if (queue_type != KERNELQ_ALT_FENCE) {
1632       /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1633        * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1634        * context. The reasons are:
1635        * - Our BO fence tracking only supports 1 queue per IP.
1636        * - IBs from different contexts must wait for each other and can't execute in a random order.
1637        */
1638       struct amdgpu_fence *prev_fence =
1639          (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1640 
1641       /* Add a dependency on a previous fence, unless we can determine that
1642        * it's useless because the execution order is guaranteed.
1643        */
1644       if (prev_fence) {
1645          bool same_ctx = queue->last_ctx == acs->ctx;
1646          /* userqueue submission mode uses a single queue per process. */
1647          bool same_queue = aws->info.ip[acs->ip_type].num_queues > 1 &&
1648                            queue_type != USERQ;
1649          if (!same_ctx || !same_queue)
1650             add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
1651       }
1652    }
1653 
1654    /* Since the kernel driver doesn't synchronize execution between different
1655     * rings automatically, we have to add fence dependencies manually. This gathers sequence
1656     * numbers from BOs and sets the next sequence number in the BOs.
1657     */
1658 
1659    /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1660    struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1661    unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1662    unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1663    unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ?
1664       0 : BITFIELD_BIT(queue_index);
1665 
1666    for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1667       struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1668       struct amdgpu_winsys_bo *bo = buffer->bo;
1669 
1670       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1671                                         buffer->usage);
1672       if (queue_type == KERNELQ_ALT_FENCE)
1673          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1674       else
1675          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1676 
1677       /* We didn't add any slab entries into the real buffer list that will be submitted
1678        * to the kernel. Do it now.
1679        */
1680       struct amdgpu_cs_buffer *real_buffer =
1681          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1682                                      &cs->buffer_lists[AMDGPU_BO_REAL], false);
1683 
1684       /* We need to set the usage because it determines the BO priority. */
1685       real_buffer->usage |= buffer->usage;
1686    }
1687 
1688    /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1689    unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1690    struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1691    unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1692    bool out_of_memory = false;
1693 
1694    for (unsigned i = 0; i < num_sparse_buffers; i++) {
1695       struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1696       struct amdgpu_winsys_bo *bo = buffer->bo;
1697 
1698       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1699                                         buffer->usage);
1700       if (queue_type == KERNELQ_ALT_FENCE)
1701          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1702       else
1703          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1704 
1705       /* Add backing buffers of sparse buffers to the buffer list.
1706        *
1707        * This is done late, during submission, to keep the buffer list short before
1708        * submit, and to avoid managing fences for the backing buffers.
1709        */
1710       struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1711 
1712       if (queue_type == USERQ) {
1713          uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point);
1714          vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point);
1715       }
1716 
1717       simple_mtx_lock(&sparse_bo->commit_lock);
1718       list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1719          /* We can directly add the buffer here, because we know that each
1720           * backing buffer occurs only once.
1721           */
1722          struct amdgpu_cs_buffer *real_buffer =
1723             amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
1724          if (!real_buffer) {
1725             fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
1726             r = -ENOMEM;
1727             out_of_memory = true;
1728             break; /* don't dereference NULL; commit_lock is unlocked after the loop */
1729          }
1730 
1731          real_buffer->usage = buffer->usage;
1732       }
1733       simple_mtx_unlock(&sparse_bo->commit_lock);
1734    }
1735 
1736    /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1737    unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1738    struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1739    struct drm_amdgpu_bo_list_entry *bo_list;
1740    /* BO dependency management depends on the queue mode:
1741     * - kernel queue: BO used by the submit are passed to the kernel in a
1742     *   drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled
1743     *   automatically by the kernel; intra-process sync is handled by Mesa.
1744     * - user queue: intra-process sync is similar. Inter-process sync is handled
1745     *   using timeline points, amdgpu_userq_wait (before a submit) and
1746     *   amdgpu_userq_signal (after a submit).
1747     */
1748    unsigned num_shared_buf_write;
1749    unsigned num_shared_buf_read;
1750    /* Store write handles at the beginning and read handles at the end of
1751     * shared_buf_kms_handles. If the usage is read-write, store the handle in the write list.
1752     */
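   /* Illustrative layout with hypothetical counts: with 3 write handles and 2 read handles
    * among num_real_buffers slots, the array ends up as
    *    [W0, W1, W2, <unused>..., R1, R0]
    * so the read handles passed to the wait/signal ioctls start at
    * &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read].
    */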
1753    uint32_t *shared_buf_kms_handles;
1754    if (queue_type != USERQ) {
1755       bo_list = (struct drm_amdgpu_bo_list_entry *)
1756          alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1757    } else {
1758       num_shared_buf_write = 0;
1759       num_shared_buf_read = 0;
1760       shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t));
1761    }
1762    unsigned i;
1763 
1764    for (i = 0; i < initial_num_real_buffers; i++) {
1765       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1766       struct amdgpu_winsys_bo *bo = buffer->bo;
1767 
1768       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1769                                         buffer->usage);
1770       if (queue_type == KERNELQ_ALT_FENCE)
1771          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1772       else
1773          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1774 
1775       if (queue_type != USERQ) {
1776          amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1777       } else {
1778          vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1779 
1780          if (!get_real_bo(bo)->is_shared)
1781             continue;
1782 
1783          if (buffer->usage & RADEON_USAGE_WRITE) {
1784             shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1785             num_shared_buf_write++;
1786          } else {
1787             num_shared_buf_read++;
1788             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1789                get_real_bo(bo)->kms_handle;
1790          }
1791       }
1792    }
1793 
1794    /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1795    for (; i < num_real_buffers_except_sparse; i++) {
1796       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1797       struct amdgpu_winsys_bo *bo = buffer->bo;
1798 
1799       if (queue_type == KERNELQ_ALT_FENCE)
1800          get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1801       else
1802          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1803 
1804       if (queue_type != USERQ) {
1805          amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1806       } else {
1807          vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1808 
1809          if (!get_real_bo(bo)->is_shared)
1810             continue;
1811 
1812          if (buffer->usage & RADEON_USAGE_WRITE) {
1813             shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1814             num_shared_buf_write++;
1815          } else {
1816             num_shared_buf_read++;
1817             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1818                get_real_bo(bo)->kms_handle;
1819          }
1820       }
1821    }
1822 
1823    /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1824    for (; i < num_real_buffers; ++i) {
1825       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1826 
1827       if (queue_type != USERQ) {
1828          amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1829       } else {
1830          if (!get_real_bo(buffer->bo)->is_shared)
1831             continue;
1832          if (buffer->usage & RADEON_USAGE_WRITE) {
1833             shared_buf_kms_handles[num_shared_buf_write] =
1834                get_real_bo(buffer->bo)->kms_handle;
1835             num_shared_buf_write++;
1836          } else {
1837             num_shared_buf_read++;
1838             shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1839                get_real_bo(buffer->bo)->kms_handle;
1840          }
1841       }
1842    }
1843 
1844 #if 0 /* Debug code. */
1845    printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1846 
1847    /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1848    for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1849       if (i == acs->queue_index)
1850          continue;
1851 
1852       struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1853       if (!fence) {
1854          if (i <= 1)
1855             printf("      queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no);
1856          continue;
1857       }
1858 
1859       bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1860       uint_seq_no old = seq_no_dependencies.seq_no[i];
1861       add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
1862       uint_seq_no new = seq_no_dependencies.seq_no[i];
1863 
1864       if (!valid)
1865          printf("   missing dependency on queue=%u, seq_no=%u\n", i, new);
1866       else if (old != new)
1867          printf("   too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1868       else
1869          printf("   has dependency on queue=%u, seq_no=%u\n", i, old);
1870    }
1871 #endif
1872 
1873    /* Convert the sequence numbers we gathered to fence dependencies. */
1874    u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1875       struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
1876 
1877       if (fence) {
1878          /* If it's idle, don't add it to the list of dependencies. */
1879          if (amdgpu_fence_wait(*fence, 0, false))
1880             amdgpu_fence_reference(fence, NULL);
1881          else
1882             add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1883       }
1884    }
1885 
1886    if (queue_type != KERNELQ_ALT_FENCE) {
1887       /* Finally, add the IB fence into the fence ring of the queue. */
1888       amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1889       queue->latest_seq_no = next_seq_no;
1890       ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1891 
1892       /* Update the last used context in the queue. */
1893       amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1894    }
1895    simple_mtx_unlock(&aws->bo_fence_lock);
1896 
1897 #if MESA_DEBUG
1898    /* Prepare the buffer list. */
1899    if (aws->debug_all_bos) {
1900       /* The buffer list contains all buffers. This is a slow path that
1901        * ensures that no buffer is missing in the BO list.
1902        */
1903       simple_mtx_lock(&aws->global_bo_list_lock);
1904       if (queue_type != USERQ) {
1905          bo_list = (struct drm_amdgpu_bo_list_entry *)
1906                    alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1907          num_real_buffers = 0;
1908          list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1909             bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1910             bo_list[num_real_buffers].bo_priority = 0;
1911             ++num_real_buffers;
1912          }
1913       } else {
1914          shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t));
1915          num_shared_buf_write = 0;
1916          num_shared_buf_read = 0;
1917          list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1918             shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle;
1919             num_shared_buf_write++;
1920          }
1921       }
1922       simple_mtx_unlock(&aws->global_bo_list_lock);
1923    }
1924 #endif
1925 
1926    if (acs->ip_type == AMD_IP_GFX)
1927       aws->gfx_bo_list_counter += num_real_buffers;
1928 
1929    if (out_of_memory) {
1930       r = -ENOMEM;
1931    } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1932       r = -ECANCELED;
1933    } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
1934       r = 0;
1935    } else {
1936       if (queue_type != USERQ) {
1937          /* Submit the command buffer.
1938           *
1939           * The kernel often returns -ENOMEM when many processes use GDS in parallel (e.g. test
1940           * suites), but the submission eventually succeeds after enough retries. This happens
1941           * frequently with dEQP using NGG streamout.
1942           */
1943          r = 0;
1944 
1945          do {
1946             /* Wait 1 ms and try again. */
1947             if (r == -ENOMEM)
1948                os_time_sleep(1000);
1949 
1950             r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no);
1951          } while (r == -ENOMEM);
1952 
1953          if (!r) {
1954             /* Success. */
1955             uint64_t *user_fence = NULL;
1956 
1957             /* Need to reserve 4 QWORD for user fence:
1958              *   QWORD[0]: completed fence
1959              *   QWORD[1]: preempted fence
1960              *   QWORD[2]: reset fence
1961              *   QWORD[3]: preempted then reset
1962              */
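            /* Each IP thus owns a block of four QWORDs; e.g. (hypothetically) ip_type == 1
             * would land at QWORDs 4..7, and user_fence points at the "completed fence"
             * QWORD of that block.
             */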
1963             if (has_user_fence)
1964                user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1965             amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1966          }
1967       } else {
1968          struct amdgpu_userq *userq = &queue->userq;
1969          r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
1970                                        &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
1971                                        num_shared_buf_read, &seq_no, vm_timeline_point);
1972          if (!r) {
1973             /* Success. */
1974             amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr);
1975          }
1976       }
1977    }
1978 
1979    if (unlikely(r)) {
1980       if (r == -ECANCELED) {
1981          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1982                                         "amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1983       } else if (r == -ENODATA) {
1984          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1985                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a soft recovery.\n");
1986       } else if (r == -ETIME) {
1987          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1988                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery.\n");
1989       } else {
1990          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
1991                                         PIPE_UNKNOWN_CONTEXT_RESET,
1992                                         "amdgpu: The CS has been rejected, "
1993                                         "see dmesg for more information (%i).\n",
1994                                         r);
1995       }
1996    }
1997 
1998    /* If there was an error, signal the fence, because it won't be signalled
1999     * by the hardware. */
2000    if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
2001       amdgpu_fence_signalled(cs->fence);
2002 
2003    if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
2004       acs->mcbp_fw_shadow_chunk.flags = 0;
2005 
2006    cs->error_code = r;
2007 
2008    /* Clear the buffer lists. */
2009    for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
2010       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
2011       unsigned num_buffers = cs->buffer_lists[list].num_buffers;
2012 
2013       if (list == AMDGPU_BO_REAL) {
2014          /* Only decrement num_active_ioctls and unref where we incremented them.
2015           * We did both for regular real BOs. We only incremented the refcount for sparse
2016           * backing BOs.
2017           */
2018          /* Regular real BOs. */
2019          for (unsigned i = 0; i < initial_num_real_buffers; i++) {
2020             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2021             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2022          }
2023 
2024          /* Do nothing for slab BOs. */
2025 
2026          /* Sparse backing BOs. */
2027          for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
2028             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2029       } else {
2030          for (unsigned i = 0; i < num_buffers; i++) {
2031             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2032             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2033          }
2034       }
2035 
2036       cs->buffer_lists[list].num_buffers = 0;
2037    }
2038 
2039    amdgpu_cs_context_cleanup(aws, cs);
2040 }
2041 
2042 /* Make sure the previous submission is completed. */
2043 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
2044 {
2045    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2046 
2047    /* Wait for any pending ioctl of this CS to complete. */
2048    util_queue_fence_wait(&cs->flush_completed);
2049 }
2050 
2051 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
2052                            unsigned flags,
2053                            struct pipe_fence_handle **fence)
2054 {
2055    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2056    struct amdgpu_winsys *aws = cs->aws;
2057    int error_code = 0;
2058    uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
2059 
2060    rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
2061 
2062    /* Pad the IB according to the mask. */
2063    switch (cs->ip_type) {
2064    case AMD_IP_SDMA:
2065       if (aws->info.gfx_level <= GFX6) {
2066          while (rcs->current.cdw & ib_pad_dw_mask)
2067             radeon_emit(rcs, 0xf0000000); /* NOP packet */
2068       } else {
2069          while (rcs->current.cdw & ib_pad_dw_mask)
2070             radeon_emit(rcs, SDMA_NOP_PAD);
2071       }
2072       break;
2073    case AMD_IP_GFX:
2074    case AMD_IP_COMPUTE:
2075       amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
2076       if (cs->ip_type == AMD_IP_GFX)
2077          aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
2078       break;
2079    case AMD_IP_UVD:
2080    case AMD_IP_UVD_ENC:
2081       while (rcs->current.cdw & ib_pad_dw_mask)
2082          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
2083       break;
2084    case AMD_IP_VCN_JPEG:
2085       /* The JPEG IB size must be even at this point. */
2086       assert(rcs->current.cdw % 2 == 0);
2087       while (rcs->current.cdw & ib_pad_dw_mask) {
2088          radeon_emit(rcs, 0x60000000); /* nop packet */
2089          radeon_emit(rcs, 0x00000000);
2090       }
2091       break;
2092    case AMD_IP_VCN_DEC:
2093       while (rcs->current.cdw & ib_pad_dw_mask)
2094          radeon_emit(rcs, 0x81ff); /* nop packet */
2095       break;
2096    default:
2097       break;
2098    }
2099 
2100    if (rcs->current.cdw > rcs->current.max_dw) {
2101       fprintf(stderr, "amdgpu: command stream overflowed\n");
2102    }
2103 
2104    /* If the CS is not empty, hasn't overflowed, and isn't a no-op flush... */
2105    if (likely(radeon_emitted(rcs, 0) &&
2106        rcs->current.cdw <= rcs->current.max_dw &&
2107        !(flags & RADEON_FLUSH_NOOP))) {
2108       struct amdgpu_cs_context *cur = cs->csc;
2109 
2110       /* Set IB sizes. */
2111       amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
2112 
2113       /* Create a fence. */
2114       amdgpu_fence_reference(&cur->fence, NULL);
2115       if (cs->next_fence) {
2116          /* just move the reference */
2117          cur->fence = cs->next_fence;
2118          cs->next_fence = NULL;
2119       } else {
2120          cur->fence = amdgpu_fence_create(cs);
2121       }
2122       if (fence)
2123          amdgpu_fence_reference(fence, cur->fence);
2124 
2125       for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
2126          unsigned num_buffers = cur->buffer_lists[i].num_buffers;
2127          struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
2128 
2129          for (unsigned j = 0; j < num_buffers; j++)
2130             p_atomic_inc(&buffers[j].bo->num_active_ioctls);
2131       }
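      /* The num_active_ioctls counters are decremented again in amdgpu_cs_submit_ib()
       * once the queued submission has processed and cleared these buffer lists.
       */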
2132 
2133       amdgpu_cs_sync_flush(rcs);
2134 
2135       cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
2136       if (cs->noop && cs->ip_type == AMD_IP_GFX) {
2137          /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
2138          unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
2139          assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
2140 
2141          cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
2142          cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
2143       }
2144 
2145       /* Swap command streams. "cst" is going to be submitted. */
2146       rcs->csc = cs->csc = cs->cst;
2147       cs->cst = cur;
2148 
2149       /* Only GFX, compute and SDMA queues are supported as user queues. */
2150       if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) {
2151          util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2152                             amdgpu_cs_submit_ib<USERQ>, NULL, 0);
2153       } else {
2154          util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2155                             cs->uses_alt_fence ?
2156                                amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
2157                                : amdgpu_cs_submit_ib<KERNELQ>,
2158                             NULL, 0);
2159       }
2160 
2161       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2162          cs->csc->secure = !cs->cst->secure;
2163       else
2164          cs->csc->secure = cs->cst->secure;
2165 
2166       if (!(flags & PIPE_FLUSH_ASYNC)) {
2167          amdgpu_cs_sync_flush(rcs);
2168          error_code = cur->error_code;
2169       }
2170    } else {
2171       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2172          cs->csc->secure = !cs->csc->secure;
2173 
2174       amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
2175       amdgpu_cs_context_cleanup(aws, cs->csc);
2176    }
2177 
2178    memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
2179 
2180    amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
2181 
2182    if (cs->preamble_ib_bo) {
2183       amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
2184                            RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
2185    }
2186 
2187    if (cs->ip_type == AMD_IP_GFX)
2188       aws->num_gfx_IBs++;
2189    else if (cs->ip_type == AMD_IP_SDMA)
2190       aws->num_sdma_IBs++;
2191 
2192    return error_code;
2193 }
2194 
2195 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
2196                                     struct pb_buffer_lean *_buf,
2197                                     unsigned usage)
2198 {
2199    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2200    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
2201 
2202    return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
2203 }
2204 
2205 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
2206                                                 uint64_t csa_va)
2207 {
2208    struct amdgpu_cs *cs = amdgpu_cs(rcs);
2209    cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
2210    cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
2211    cs->mcbp_fw_shadow_chunk.gds_va = 0;
2212    cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
2213 }
2214 
2215 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
2216                                           struct pipe_fence_handle **dst,
2217                                           struct pipe_fence_handle *src)
2218 {
2219    amdgpu_fence_reference(dst, src);
2220 }
2221 
2222 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
2223 {
2224    sws->base.ctx_create = amdgpu_ctx_create;
2225    sws->base.ctx_destroy = amdgpu_ctx_destroy;
2226    sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
2227    sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
2228    sws->base.cs_create = amdgpu_cs_create;
2229    sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
2230    sws->base.cs_destroy = amdgpu_cs_destroy;
2231    sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
2232    sws->base.cs_validate = amdgpu_cs_validate;
2233    sws->base.cs_check_space = amdgpu_cs_check_space;
2234    sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
2235    sws->base.cs_flush = amdgpu_cs_flush;
2236    sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
2237    sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
2238    sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
2239    sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
2240    sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
2241    sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
2242    sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
2243    sws->base.fence_reference = amdgpu_winsys_fence_reference;
2244    sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
2245    sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
2246    sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
2247    sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
2248 
2249    if (sws->aws->info.has_fw_based_shadowing)
2250       sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
2251 }
2252