1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * Copyright © 2015 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "util/os_time.h"
12 #include <inttypes.h>
13 #include <stdio.h>
14 
15 #include "amd/common/sid.h"
16 
17 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
18  * codes in the kernel).
19  */
20 #if DETECT_OS_OPENBSD
21 #define ENODATA ENOTSUP
22 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
23 #define ENODATA ECONNREFUSED
24 #endif
25 
26 /* FENCES */
27 
28 void amdgpu_fence_destroy(struct amdgpu_fence *fence)
29 {
30    amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
31 
32    if (fence->ctx)
33       amdgpu_ctx_reference(&fence->ctx, NULL);
34 
35    util_queue_fence_destroy(&fence->submitted);
36    FREE(fence);
37 }
38 
39 static struct pipe_fence_handle *
40 amdgpu_fence_create(struct amdgpu_cs *cs)
41 {
42    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
43    struct amdgpu_ctx *ctx = cs->ctx;
44 
45    fence->reference.count = 1;
46    fence->ws = ctx->ws;
47    amdgpu_ctx_reference(&fence->ctx, ctx);
48    fence->ctx = ctx;
49    fence->ip_type = cs->ip_type;
50    if (amdgpu_cs_create_syncobj2(ctx->ws->dev, 0, &fence->syncobj)) {
51       FREE(fence);
52       return NULL;
53    }
54 
55    util_queue_fence_init(&fence->submitted);
56    util_queue_fence_reset(&fence->submitted);
57    fence->queue_index = cs->queue_index;
58    return (struct pipe_fence_handle *)fence;
59 }
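/* Typical life cycle of a fence created above (rough sketch):
 *
 *    fence = amdgpu_fence_create(cs);           // unsignaled syncobj
 *    ... the CS ioctl runs in the submit thread ...
 *    amdgpu_fence_submitted(fence, seq_no, user_fence_cpu_address);
 *    amdgpu_fence_wait(fence, timeout, false);  // user fence, then syncobj
 *
 * Until amdgpu_fence_submitted() signals the "submitted" util_queue_fence,
 * amdgpu_fence_wait() blocks because no sequence number has been assigned.
 */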
60 
61 static struct pipe_fence_handle *
62 amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
63 {
64    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
65    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
66    int r;
67 
68    if (!fence)
69       return NULL;
70 
71    pipe_reference_init(&fence->reference, 1);
72    fence->ws = ws;
73    fence->ip_type = 0xffffffff;
74 
75    r = amdgpu_cs_import_syncobj(ws->dev, fd, &fence->syncobj);
76    if (r) {
77       FREE(fence);
78       return NULL;
79    }
80 
81    util_queue_fence_init(&fence->submitted);
82    fence->imported = true;
83 
84    return (struct pipe_fence_handle*)fence;
85 }
86 
87 static struct pipe_fence_handle *
88 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
89 {
90    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
91    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
92 
93    if (!fence)
94       return NULL;
95 
96    pipe_reference_init(&fence->reference, 1);
97    fence->ws = ws;
98    /* fence->ctx == NULL means that the fence is syncobj-based. */
99 
100    /* Convert sync_file into syncobj. */
101    int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj);
102    if (r) {
103       FREE(fence);
104       return NULL;
105    }
106 
107    r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
108    if (r) {
109       amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
110       FREE(fence);
111       return NULL;
112    }
113 
114    util_queue_fence_init(&fence->submitted);
115    fence->imported = true;
116 
117    return (struct pipe_fence_handle*)fence;
118 }
119 
120 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
121 					 struct pipe_fence_handle *pfence)
122 {
123    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
124    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
125    int fd, r;
126 
127    util_queue_fence_wait(&fence->submitted);
128 
129    /* Convert syncobj into sync_file. */
130    r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
131    return r ? -1 : fd;
132 }
133 
134 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
135 {
136    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
137    uint32_t syncobj;
138    int fd = -1;
139 
140    int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
141                                      &syncobj);
142    if (r) {
143       return -1;
144    }
145 
146    r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
147    if (r) {
148       fd = -1;
149    }
150 
151    amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
152    return fd;
153 }
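/* amdgpu_export_signalled_sync_file() above returns a sync_file fd backed by
 * an already-signalled syncobj; this is handy when the caller needs a fence
 * fd for work that has effectively already completed.
 */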
154 
155 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
156                                    uint64_t seq_no,
157                                    uint64_t *user_fence_cpu_address)
158 {
159    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
160 
161    afence->seq_no = seq_no;
162    afence->user_fence_cpu_address = user_fence_cpu_address;
163    util_queue_fence_signal(&afence->submitted);
164 }
165 
166 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
167 {
168    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
169 
170    afence->signalled = true;
171    util_queue_fence_signal(&afence->submitted);
172 }
173 
174 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
175                        bool absolute)
176 {
177    struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
178    int64_t abs_timeout;
179    uint64_t *user_fence_cpu;
180 
181    if (afence->signalled)
182       return true;
183 
184    if (absolute)
185       abs_timeout = timeout;
186    else
187       abs_timeout = os_time_get_absolute_timeout(timeout);
188 
189    /* The fence might not have a number assigned if its IB is being
190     * submitted in the other thread right now. Wait until the submission
191     * is done. */
192    if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
193       return false;
194 
195    user_fence_cpu = afence->user_fence_cpu_address;
196    if (user_fence_cpu) {
197       if (*user_fence_cpu >= afence->seq_no) {
198          afence->signalled = true;
199          return true;
200       }
201 
202       /* No timeout, just query: no need for the ioctl. */
203       if (!absolute && !timeout)
204          return false;
205    }
206 
207    if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
208       abs_timeout = INT64_MAX;
209 
210    if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
211                               abs_timeout, 0, NULL))
212 
213       return false;
214 
215    afence->signalled = true;
216    return true;
217 }
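/* Note on amdgpu_fence_wait(): a zero-timeout query (timeout == 0,
 * absolute == false) decides purely from *user_fence_cpu_address when a
 * user fence exists; the syncobj wait ioctl is only reached when there is
 * no user fence address (e.g. imported fences or IPs without user fences).
 */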
218 
219 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
220                                           struct pipe_fence_handle *fence,
221                                           uint64_t timeout)
222 {
223    return amdgpu_fence_wait(fence, timeout, false);
224 }
225 
226 static struct pipe_fence_handle *
227 amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
228 {
229    struct amdgpu_cs *cs = amdgpu_cs(rcs);
230    struct pipe_fence_handle *fence = NULL;
231 
232    if (cs->noop)
233       return NULL;
234 
235    if (cs->next_fence) {
236       amdgpu_fence_reference(&fence, cs->next_fence);
237       return fence;
238    }
239 
240    fence = amdgpu_fence_create(cs);
241    if (!fence)
242       return NULL;
243 
244    amdgpu_fence_reference(&cs->next_fence, fence);
245    return fence;
246 }
247 
248 /* CONTEXTS */
249 
250 static uint32_t
251 radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
252 {
253    switch (radeon_priority) {
254    case RADEON_CTX_PRIORITY_REALTIME:
255       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
256    case RADEON_CTX_PRIORITY_HIGH:
257       return AMDGPU_CTX_PRIORITY_HIGH;
258    case RADEON_CTX_PRIORITY_MEDIUM:
259       return AMDGPU_CTX_PRIORITY_NORMAL;
260    case RADEON_CTX_PRIORITY_LOW:
261       return AMDGPU_CTX_PRIORITY_LOW;
262    default:
263       unreachable("Invalid context priority");
264    }
265 }
266 
267 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws,
268                                                    enum radeon_ctx_priority priority,
269                                                    bool allow_context_lost)
270 {
271    struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
272    int r;
273    struct amdgpu_bo_alloc_request alloc_buffer = {};
274    uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
275    amdgpu_bo_handle buf_handle;
276 
277    if (!ctx)
278       return NULL;
279 
280    ctx->ws = amdgpu_winsys(ws);
281    ctx->reference.count = 1;
282    ctx->allow_context_lost = allow_context_lost;
283 
284    r = amdgpu_cs_ctx_create2(ctx->ws->dev, amdgpu_priority, &ctx->ctx);
285    if (r) {
286       fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
287       goto error_create;
288    }
289 
290    alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
291    alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
292    alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
293 
294    r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
295    if (r) {
296       fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
297       goto error_user_fence_alloc;
298    }
299 
300    r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
301    if (r) {
302       fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
303       goto error_user_fence_map;
304    }
305 
306    memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
307    ctx->user_fence_bo = buf_handle;
308 
309    return (struct radeon_winsys_ctx*)ctx;
310 
311 error_user_fence_map:
312    amdgpu_bo_free(buf_handle);
313 error_user_fence_alloc:
314    amdgpu_cs_ctx_free(ctx->ctx);
315 error_create:
316    FREE(ctx);
317    return NULL;
318 }
319 
320 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
321 {
322    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
323 
324    amdgpu_ctx_reference(&ctx, NULL);
325 }
326 
327 static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *ws, enum amd_ip_type ip_type,
328                                       uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
329 {
330    unsigned pad_dw_mask = ws->info.ip[ip_type].ib_pad_dw_mask;
331    unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
332 
333    if (unaligned_dw) {
334       int remaining = pad_dw_mask + 1 - unaligned_dw;
335 
336       /* Only pad by 1 dword with the type-2 NOP if necessary. */
337       if (remaining == 1 && ws->info.gfx_ib_pad_with_type2) {
338          ib[(*num_dw)++] = PKT2_NOP_PAD;
339       } else {
340          /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
341           * packet. The size of the packet body after the header is always count + 1.
342           * If count == -1, there is no packet body. NOP is the only packet that can have
343           * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
344           */
345          ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
346          *num_dw += remaining - 1;
347       }
348    }
349    assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
350 }
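/* Worked example for amdgpu_pad_gfx_compute_ib(), assuming
 * ib_pad_dw_mask == 7 (8-dword alignment): with *num_dw == 13 and
 * leave_dw_space == 0, unaligned_dw == 5 and remaining == 3, so one
 * PKT3_NOP header with count == 1 plus 2 body dwords are written and
 * *num_dw becomes 16. If only 1 dword were missing on a chip with
 * gfx_ib_pad_with_type2, a single PKT2_NOP_PAD dword would be used instead.
 */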
351 
352 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
353 {
354    struct amdgpu_bo_alloc_request request = {0};
355    struct drm_amdgpu_bo_list_in bo_list_in;
356    struct drm_amdgpu_cs_chunk_ib ib_in = {0};
357    amdgpu_bo_handle buf_handle;
358    amdgpu_va_handle va_handle = NULL;
359    struct drm_amdgpu_cs_chunk chunks[2];
360    struct drm_amdgpu_bo_list_entry list;
361    unsigned noop_dw_size;
362    void *cpu = NULL;
363    uint64_t seq_no;
364    uint64_t va;
365    int r;
366 
367    /* Older amdgpu doesn't report if the reset is complete or not. Detect
368     * it by submitting a no-op job. If it reports an error, then assume
369     * that the reset is not complete.
370     */
371    amdgpu_context_handle temp_ctx;
372    r = amdgpu_cs_ctx_create2(ctx->ws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx);
373    if (r)
374       return r;
375 
376    request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
377    request.alloc_size = 4096;
378    request.phys_alignment = 4096;
379    r = amdgpu_bo_alloc(ctx->ws->dev, &request, &buf_handle);
380    if (r)
381       goto destroy_ctx;
382 
383    r = amdgpu_va_range_alloc(ctx->ws->dev, amdgpu_gpu_va_range_general,
384                  request.alloc_size, request.phys_alignment,
385                  0, &va, &va_handle,
386                  AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
387    if (r)
388       goto destroy_bo;
389    r = amdgpu_bo_va_op_raw(ctx->ws->dev, buf_handle, 0, request.alloc_size, va,
390                            AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
391                            AMDGPU_VA_OP_MAP);
392    if (r)
393       goto destroy_bo;
394 
395    r = amdgpu_bo_cpu_map(buf_handle, &cpu);
396    if (r)
397       goto destroy_bo;
398 
399    noop_dw_size = ctx->ws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
400    ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
401 
402    amdgpu_bo_cpu_unmap(buf_handle);
403 
404    amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &list.bo_handle);
405    list.bo_priority = 0;
406 
407    bo_list_in.list_handle = ~0;
408    bo_list_in.bo_number = 1;
409    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
410    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
411 
412    ib_in.ip_type = AMD_IP_GFX;
413    ib_in.ib_bytes = noop_dw_size * 4;
414    ib_in.va_start = va;
415 
416    chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
417    chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
418    chunks[0].chunk_data = (uintptr_t)&bo_list_in;
419 
420    chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
421    chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
422    chunks[1].chunk_data = (uintptr_t)&ib_in;
423 
424    r = amdgpu_cs_submit_raw2(ctx->ws->dev, temp_ctx, 0, 2, chunks, &seq_no);
425 
426 destroy_bo:
427    if (va_handle)
428       amdgpu_va_range_free(va_handle);
429    amdgpu_bo_free(buf_handle);
430 destroy_ctx:
431    amdgpu_cs_ctx_free(temp_ctx);
432 
433    return r;
434 }
435 
436 static void
437 amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
438                                const char *format, ...)
439 {
440    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
441 
442    /* Don't overwrite the last reset status. */
443    if (ctx->sw_status != PIPE_NO_RESET)
444       return;
445 
446    ctx->sw_status = status;
447 
448    if (!ctx->allow_context_lost) {
449       va_list args;
450 
451       va_start(args, format);
452       vfprintf(stderr, format, args);
453       va_end(args);
454 
455       /* Non-robust contexts are allowed to terminate the process. The only alternative is
456        * to skip command submission, which would look like a freeze because nothing is drawn,
457        * which looks like a hang without any reset.
458        */
459       abort();
460    }
461 }
462 
463 static enum pipe_reset_status
464 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
465                               bool *needs_reset, bool *reset_completed)
466 {
467    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
468 
469    if (needs_reset)
470       *needs_reset = false;
471    if (reset_completed)
472       *reset_completed = false;
473 
474    /* Return a failure due to a GPU hang. */
475    uint64_t flags;
476 
477    if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
478       /* If the caller is only interested in full reset (= wants to ignore soft
479        * recoveries), we can use the SW reset status as a quick first check.
480        */
481       return PIPE_NO_RESET;
482    }
483 
484    /*
485     * ctx->sw_status is updated on alloc/ioctl failures.
486     *
487     * We only rely on amdgpu_cs_query_reset_state2 to tell us
488     * that the context reset is complete.
489     */
490    if (ctx->sw_status != PIPE_NO_RESET) {
491       int r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
492       if (!r) {
493          if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
494             if (reset_completed) {
495                /* The ARB_robustness spec says:
496                *
497                *    If a reset status other than NO_ERROR is returned and subsequent
498                *    calls return NO_ERROR, the context reset was encountered and
499                *    completed. If a reset status is repeatedly returned, the context may
500                *    be in the process of resetting.
501                *
502                * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
503                * so don't do anything special. On older kernels, submit a no-op cs. If it
504                * succeeds then assume the reset is complete.
505                */
506                if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
507                   *reset_completed = true;
508 
509                if (ctx->ws->info.drm_minor < 54 && ctx->ws->info.has_graphics)
510                   *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
511             }
512          }
513       } else {
514          fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
515       }
516 
517       /* Return a failure due to SW issues. */
518       if (needs_reset)
519          *needs_reset = true;
520       return ctx->sw_status;
521    }
522 
523    if (needs_reset)
524       *needs_reset = false;
525    return PIPE_NO_RESET;
526 }
527 
528 /* COMMAND SUBMISSION */
529 
530 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
531 {
532    return acs->ip_type == AMD_IP_GFX ||
533           acs->ip_type == AMD_IP_COMPUTE ||
534           acs->ip_type == AMD_IP_SDMA;
535 }
536 
537 static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
538 {
539    if (cs->has_chaining)
540       return 4; /* for chaining */
541 
542    return 0;
543 }
544 
545 static struct amdgpu_cs_buffer *
546 amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
547                      struct amdgpu_buffer_list *list)
548 {
549    int num_buffers = list->num_buffers;
550    struct amdgpu_cs_buffer *buffers = list->buffers;
551    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
552    int i = cs->buffer_indices_hashlist[hash];
553 
554    /* i is -1 if nothing is cached, otherwise a cached index that may or may not match. */
555    if (i < 0)
556       return NULL;
557 
558    if (i < num_buffers && buffers[i].bo == bo)
559       return &buffers[i];
560 
561    /* Hash collision, look for the BO in the list of buffers linearly. */
562    for (int i = num_buffers - 1; i >= 0; i--) {
563       if (buffers[i].bo == bo) {
564          /* Put this buffer in the hash list.
565           * This will prevent additional hash collisions if there are
566           * several consecutive lookup_buffer calls for the same buffer.
567           *
568           * Example: Assuming buffers A,B,C collide in the hash list,
569           * the following sequence of buffers:
570           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
571           * will collide here: ^ and here:   ^,
572           * meaning that we should get very few collisions in the end. */
573          cs->buffer_indices_hashlist[hash] = i & 0x7fff;
574          return &buffers[i];
575       }
576    }
577    return NULL;
578 }
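/* Note on the "i & 0x7fff" above: the hash list presumably stores small
 * signed indices, so masking keeps the cached index non-negative while -1
 * remains the "empty slot" marker (the array is memset to -1 on CS creation).
 */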
579 
580 struct amdgpu_cs_buffer *
581 amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
582 {
583    return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
584 }
585 
586 static struct amdgpu_cs_buffer *
587 amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
588                      struct amdgpu_buffer_list *list, bool add_ref)
589 {
590    /* New buffer, check if the backing array is large enough. */
591    if (unlikely(list->num_buffers >= list->max_buffers)) {
592       unsigned new_max =
593          MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
594       struct amdgpu_cs_buffer *new_buffers;
595 
596       new_buffers = (struct amdgpu_cs_buffer *)
597                     REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
598                             new_max * sizeof(*new_buffers));
599       if (!new_buffers) {
600          fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
601          return NULL;
602       }
603 
604       list->max_buffers = new_max;
605       list->buffers = new_buffers;
606    }
607 
608    unsigned idx = list->num_buffers++;
609    struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
610    if (add_ref)
611       p_atomic_inc(&bo->base.reference.count);
612    buffer->bo = bo;
613    buffer->usage = 0;
614 
615    unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
616    cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
617    return buffer;
618 }
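/* Growth example for amdgpu_do_add_buffer(): starting from max_buffers == 0
 * the list grows to 16, then by max(+16, *1.3) on each overflow
 * (16 -> 32 -> 48 -> 64 -> 83 -> 107 -> ...), which keeps reallocations
 * rare without grossly over-allocating small lists.
 */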
619 
620 static struct amdgpu_cs_buffer *
621 amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
622                             struct amdgpu_buffer_list *list, bool add_ref)
623 {
624    struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
625 
626    return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
627 }
628 
629 static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
630                                     struct pb_buffer_lean *buf,
631                                     unsigned usage,
632                                     enum radeon_bo_domain domains)
633 {
634    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
635     * the buffer placement during command submission.
636     */
637    struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
638    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
639    struct amdgpu_cs_buffer *buffer;
640 
641    /* Fast exit for no-op calls.
642     * This is very effective with suballocators and linear uploaders that
643     * are outside of the winsys.
644     */
645    if (bo == cs->last_added_bo &&
646        (usage & cs->last_added_bo_usage) == usage)
647       return 0;
648 
649    buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
650    if (!buffer)
651       return 0;
652 
653    buffer->usage |= usage;
654 
655    cs->last_added_bo_usage = buffer->usage;
656    cs->last_added_bo = bo;
657    return 0;
658 }
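/* The fast exit above matters because drivers call add_buffer for every BO
 * referenced by every draw; e.g. a suballocated vertex buffer that is bound
 * repeatedly hits the (bo == last_added_bo) check and returns without
 * touching the hash list or the buffer array at all.
 */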
659 
660 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws,
661                                  struct amdgpu_ib *main_ib,
662                                  struct amdgpu_cs *cs)
663 {
664    struct pb_buffer_lean *pb;
665    uint8_t *mapped;
666    unsigned buffer_size;
667 
668    /* Always create a buffer that is at least as large as the maximum seen IB size,
669     * aligned to a power of two.
670     */
671    buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
672 
673    /* Multiply by 4 to reduce internal fragmentation if chaining is not available.*/
674    if (!cs->has_chaining)
675       buffer_size *= 4;
676 
677    const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
678    /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
679    const unsigned max_size = 2 * 1024 * 1024;
680 
681    buffer_size = MIN2(buffer_size, max_size);
682    buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
683 
684    /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
685     * The speed of writing to GTT WC is somewhere between no difference and very slow, while
686     * writing to VRAM is very slow a lot more often.
687     *
688     * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
689     * and doesn't have to wait for cached GL2 requests to be processed.
690     */
691    enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
692    unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
693                     RADEON_FLAG_GL2_BYPASS;
694 
695    if (cs->ip_type == AMD_IP_GFX ||
696        cs->ip_type == AMD_IP_COMPUTE ||
697        cs->ip_type == AMD_IP_SDMA) {
698       /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
699        * on Navi 14
700        */
701       flags |= RADEON_FLAG_32BIT;
702    }
703 
704    pb = amdgpu_bo_create(ws, buffer_size,
705                          ws->info.gart_page_size,
706                          domain, (radeon_bo_flag)flags);
707    if (!pb)
708       return false;
709 
710    mapped = (uint8_t*)amdgpu_bo_map(&ws->dummy_ws.base, pb, NULL, PIPE_MAP_WRITE);
711    if (!mapped) {
712       radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
713       return false;
714    }
715 
716    radeon_bo_reference(&ws->dummy_ws.base, &main_ib->big_buffer, pb);
717    radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
718 
719    main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
720    main_ib->big_buffer_cpu_ptr = mapped;
721    main_ib->used_ib_space = 0;
722 
723    return true;
724 }
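/* Sizing example for amdgpu_ib_new_buffer(): if the largest IB seen so far
 * was 100 KB, the new buffer is 128 KB (next power of two), or 512 KB
 * without chaining (x4 against fragmentation), then clamped to
 * [max(max_check_space_size, 32 KB), 2 MB], with the minimum applied last
 * so that it takes precedence.
 */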
725 
726 static bool amdgpu_get_new_ib(struct amdgpu_winsys *ws,
727                               struct radeon_cmdbuf *rcs,
728                               struct amdgpu_ib *main_ib,
729                               struct amdgpu_cs *cs)
730 {
731    struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
732    /* This is the minimum size of a contiguous IB. */
733    unsigned ib_size = 16 * 1024;
734 
735    /* Always allocate at least the size of the biggest cs_check_space call,
736     * because precisely the last call might have requested this size.
737     */
738    ib_size = MAX2(ib_size, main_ib->max_check_space_size);
739 
740    if (!cs->has_chaining) {
741       ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
742                                    IB_MAX_SUBMIT_BYTES));
743    }
744 
745    /* Decay the IB buffer size over time, so that memory usage decreases after
746     * a temporary peak.
747     */
748    main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
749 
750    rcs->prev_dw = 0;
751    rcs->num_prev = 0;
752    rcs->current.cdw = 0;
753    rcs->current.buf = NULL;
754 
755    /* Allocate a new buffer for IBs if the current buffer is all used. */
756    if (!main_ib->big_buffer ||
757        main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
758       if (!amdgpu_ib_new_buffer(ws, main_ib, cs))
759          return false;
760    }
761 
762    chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
763    chunk_ib->ib_bytes = 0;
764    /* ib_bytes is in dwords and the conversion to bytes will be done before
765     * the CS ioctl. */
766    main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
767    main_ib->is_chained_ib = false;
768 
769    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
770                         (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
771                         (radeon_bo_domain)0);
772 
773    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
774 
775    cs->csc->ib_main_addr = rcs->current.buf;
776 
777    ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
778    rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
779    return true;
780 }
781 
782 static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
783 {
784    if (ib->is_chained_ib) {
785       *ib->ptr_ib_size = rcs->current.cdw |
786                          S_3F2_CHAIN(1) | S_3F2_VALID(1) |
787                          S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
788    } else {
789       *ib->ptr_ib_size = rcs->current.cdw;
790    }
791 }
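/* ptr_ib_size points either at chunk_ib[IB_MAIN].ib_bytes (first chunk of a
 * submission, see amdgpu_get_new_ib) or at the size dword of the previous
 * INDIRECT_BUFFER packet (chained chunks, see amdgpu_cs_check_space). In the
 * chained case the written value also carries the CHAIN/VALID/PRE_ENA bits.
 */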
792 
793 static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct radeon_cmdbuf *rcs,
794                                struct amdgpu_ib *ib, enum amd_ip_type ip_type)
795 {
796    amdgpu_set_ib_size(rcs, ib);
797    ib->used_ib_space += rcs->current.cdw * 4;
798    ib->used_ib_space = align(ib->used_ib_space, ws->info.ip[ip_type].ib_alignment);
799    ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
800 }
801 
802 static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
803                                    struct amdgpu_cs_context *cs,
804                                    enum amd_ip_type ip_type)
805 {
806    for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
807       cs->chunk_ib[i].ip_type = ip_type;
808       cs->chunk_ib[i].flags = 0;
809 
810       if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
811          /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
812           * is the beginning of IBs because completion of an IB doesn't care about the state of
813           * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
814           * executed in parallel, so draw calls from the current IB can finish after the next IB
815           * starts drawing, and so the cache flush at the end of IBs is usually late and thus
816           * useless.
817           */
818          cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
819       }
820    }
821 
822    cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
823    cs->last_added_bo = NULL;
824    return true;
825 }
826 
827 static void cleanup_fence_list(struct amdgpu_fence_list *fences)
828 {
829    for (unsigned i = 0; i < fences->num; i++)
830       amdgpu_fence_drop_reference(fences->list[i]);
831    fences->num = 0;
832 }
833 
834 static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
835 {
836    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
837       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
838       unsigned num_buffers = cs->buffer_lists[i].num_buffers;
839 
840       for (unsigned j = 0; j < num_buffers; j++)
841          amdgpu_winsys_bo_drop_reference(ws, buffers[j].bo);
842 
843       cs->buffer_lists[i].num_buffers = 0;
844    }
845 }
846 
847 static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
848 {
849    cs->seq_no_dependencies.valid_fence_mask = 0;
850    cleanup_fence_list(&cs->syncobj_dependencies);
851    cleanup_fence_list(&cs->syncobj_to_signal);
852    amdgpu_fence_reference(&cs->fence, NULL);
853    cs->last_added_bo = NULL;
854 }
855 
856 static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
857 {
858    amdgpu_cs_context_cleanup_buffers(ws, cs);
859    amdgpu_cs_context_cleanup(ws, cs);
860    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
861       FREE(cs->buffer_lists[i].buffers);
862    FREE(cs->syncobj_dependencies.list);
863    FREE(cs->syncobj_to_signal.list);
864 }
865 
866 
867 static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
868 {
869    struct amdgpu_cs *cs = amdgpu_cs(rcs);
870    return cs->ip_type;
871 }
872 
873 static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
874 {
875    /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
876    return ip_type == AMD_IP_VCN_DEC ||
877           ip_type == AMD_IP_VCN_ENC ||
878           ip_type == AMD_IP_VCN_JPEG;
879 }
880 
881 static bool
882 amdgpu_cs_create(struct radeon_cmdbuf *rcs,
883                  struct radeon_winsys_ctx *rwctx,
884                  enum amd_ip_type ip_type,
885                  void (*flush)(void *ctx, unsigned flags,
886                                struct pipe_fence_handle **fence),
887                  void *flush_ctx)
888 {
889    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
890    struct amdgpu_cs *cs;
891 
892    cs = CALLOC_STRUCT(amdgpu_cs);
893    if (!cs) {
894       return false;
895    }
896 
897    util_queue_fence_init(&cs->flush_completed);
898 
899    cs->ws = ctx->ws;
900    cs->ctx = ctx;
901    cs->flush_cs = flush;
902    cs->flush_data = flush_ctx;
903    cs->ip_type = ip_type;
904    cs->noop = ctx->ws->noop_cs;
905    cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
906                       (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
907 
908    /* Compute the queue index by counting the IPs that have queues. */
909    assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
910    assert(ctx->ws->info.ip[ip_type].num_queues);
911 
912    if (ip_uses_alt_fence(ip_type)) {
913       cs->queue_index = INT_MAX;
914       cs->uses_alt_fence = true;
915    } else {
916       cs->queue_index = 0;
917 
918       for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
919          if (!ctx->ws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
920             continue;
921 
922          if (i == ip_type)
923             break;
924 
925          cs->queue_index++;
926       }
927       assert(cs->queue_index < AMDGPU_MAX_QUEUES);
928    }
929 
930    struct amdgpu_cs_fence_info fence_info;
931    fence_info.handle = cs->ctx->user_fence_bo;
932    fence_info.offset = cs->ip_type * 4;
933    amdgpu_cs_chunk_fence_info_to_data(&fence_info,
934                                       (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
935 
936    if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ip_type)) {
937       FREE(cs);
938       return false;
939    }
940 
941    if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ip_type)) {
942       amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
943       FREE(cs);
944       return false;
945    }
946 
947    memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
948 
949    /* Set the first submission context as current. */
950    rcs->csc = cs->csc = &cs->csc1;
951    cs->cst = &cs->csc2;
952 
953    /* Assign to both amdgpu_cs_context; only csc will use it. */
954    cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
955    cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
956 
957    cs->csc1.ws = ctx->ws;
958    cs->csc2.ws = ctx->ws;
959 
960    rcs->priv = cs;
961 
962    if (!amdgpu_get_new_ib(ctx->ws, rcs, &cs->main_ib, cs)) {
963       amdgpu_destroy_cs_context(ctx->ws, &cs->csc2);
964       amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
965       FREE(cs);
966       rcs->priv = NULL;
967       return false;
968    }
969 
970    p_atomic_inc(&ctx->ws->num_cs);
971    return true;
972 }
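/* queue_index example, assuming the usual IP order (GFX, COMPUTE, SDMA, ...):
 * if GFX, COMPUTE, SDMA and VCN_DEC all expose queues, GFX gets index 0,
 * COMPUTE 1 and SDMA 2, while VCN_DEC takes the alt_fence path and never
 * consumes a per-queue fence ring slot.
 */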
973 
974 static bool
975 amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
976                            unsigned preamble_num_dw)
977 {
978    struct amdgpu_cs *cs = amdgpu_cs(rcs);
979    struct amdgpu_winsys *ws = cs->ws;
980    struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
981    unsigned size = align(preamble_num_dw * 4, ws->info.ip[AMD_IP_GFX].ib_alignment);
982    struct pb_buffer_lean *preamble_bo;
983    uint32_t *map;
984 
985    /* Create the preamble IB buffer. */
986    preamble_bo = amdgpu_bo_create(ws, size, ws->info.ip[AMD_IP_GFX].ib_alignment,
987                                   RADEON_DOMAIN_VRAM,
988                                   (radeon_bo_flag)
989                                   (RADEON_FLAG_NO_INTERPROCESS_SHARING |
990                                    RADEON_FLAG_GTT_WC |
991                                    RADEON_FLAG_READ_ONLY));
992    if (!preamble_bo)
993       return false;
994 
995    map = (uint32_t*)amdgpu_bo_map(&ws->dummy_ws.base, preamble_bo, NULL,
996                                   (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
997    if (!map) {
998       radeon_bo_reference(&ws->dummy_ws.base, &preamble_bo, NULL);
999       return false;
1000    }
1001 
1002    /* Upload the preamble IB. */
1003    memcpy(map, preamble_ib, preamble_num_dw * 4);
1004 
1005    /* Pad the IB. */
1006    amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, map, &preamble_num_dw, 0);
1007    amdgpu_bo_unmap(&ws->dummy_ws.base, preamble_bo);
1008 
1009    for (unsigned i = 0; i < 2; i++) {
1010       csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1011       csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1012 
1013       csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1014    }
1015 
1016    assert(!cs->preamble_ib_bo);
1017    cs->preamble_ib_bo = preamble_bo;
1018 
1019    amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1020                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1021    return true;
1022 }
1023 
1024 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1025 {
1026    return true;
1027 }
1028 
1029 static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1030 {
1031    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1032    struct amdgpu_ib *main_ib = &cs->main_ib;
1033 
1034    assert(rcs->current.cdw <= rcs->current.max_dw);
1035 
1036    unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1037 
1038    if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1039       return false;
1040 
1041    if (rcs->current.max_dw - rcs->current.cdw >= dw)
1042       return true;
1043 
1044    unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1045    unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1046    /* 125% of the size for IB epilog. */
1047    unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1048    main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1049    main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1050 
1051    if (!cs->has_chaining)
1052       return false;
1053 
1054    /* Allocate a new chunk */
1055    if (rcs->num_prev >= rcs->max_prev) {
1056       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1057       struct radeon_cmdbuf_chunk *new_prev;
1058 
1059       new_prev = (struct radeon_cmdbuf_chunk*)
1060                  REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1061                          sizeof(*new_prev) * new_max_prev);
1062       if (!new_prev)
1063          return false;
1064 
1065       rcs->prev = new_prev;
1066       rcs->max_prev = new_max_prev;
1067    }
1068 
1069    if (!amdgpu_ib_new_buffer(cs->ws, main_ib, cs))
1070       return false;
1071 
1072    assert(main_ib->used_ib_space == 0);
1073    uint64_t va = main_ib->gpu_address;
1074 
1075    /* This space was originally reserved. */
1076    rcs->current.max_dw += cs_epilog_dw;
1077 
1078    /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1079    amdgpu_pad_gfx_compute_ib(cs->ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1080 
1081    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1082    radeon_emit(rcs, va);
1083    radeon_emit(rcs, va >> 32);
1084    uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1085 
1086    assert((rcs->current.cdw & cs->ws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1087    assert(rcs->current.cdw <= rcs->current.max_dw);
1088 
1089    amdgpu_set_ib_size(rcs, main_ib);
1090    main_ib->ptr_ib_size = new_ptr_ib_size;
1091    main_ib->is_chained_ib = true;
1092 
1093    /* Hook up the new chunk */
1094    rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1095    rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1096    rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1097    rcs->num_prev++;
1098 
1099    rcs->prev_dw += rcs->current.cdw;
1100    rcs->current.cdw = 0;
1101 
1102    rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1103    rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1104 
1105    amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1106                         RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1107 
1108    return true;
1109 }
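/* After a successful chain in amdgpu_cs_check_space(), the old chunk ends
 * roughly like this:
 *
 *    ... NOP padding up to alignment minus 4 dwords ...
 *    PKT3(PKT3_INDIRECT_BUFFER, 2, 0)
 *    va_lo
 *    va_hi
 *    <size dword, patched later by amdgpu_set_ib_size()>
 *
 * and rcs->current now points into the freshly allocated IB buffer.
 */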
1110 
1111 static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1112 {
1113    unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1114    struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1115 
1116    for (unsigned i = 0; i < num_buffers; i++) {
1117       struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1118       struct amdgpu_cs_buffer *real_buffer =
1119          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1120                                      &cs->buffer_lists[AMDGPU_BO_REAL], true);
1121 
1122       /* We need to set the usage because it determines the BO priority.
1123        *
1124        * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1125        * BO fences to fence dependencies. Only the slab entries should do that.
1126        */
1127       real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1128    }
1129 }
1130 
1131 static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1132                                           struct radeon_bo_list_item *list)
1133 {
1134     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1135 
1136     /* We do this in the CS thread, but since we need to return the final usage of all buffers
1137      * here, do it here too. There is no harm in doing it again in the CS thread.
1138      */
1139     amdgpu_add_slab_backing_buffers(cs);
1140 
1141     struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1142     unsigned num_real_buffers = real_buffers->num_buffers;
1143 
1144     if (list) {
1145         for (unsigned i = 0; i < num_real_buffers; i++) {
1146             list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1147             list[i].vm_address =
1148                amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1149             list[i].priority_usage = real_buffers->buffers[i].usage;
1150         }
1151     }
1152     return num_real_buffers;
1153 }
1154 
1155 static void add_fence_to_list(struct amdgpu_fence_list *fences,
1156                               struct amdgpu_fence *fence)
1157 {
1158    unsigned idx = fences->num++;
1159 
1160    if (idx >= fences->max) {
1161       unsigned size;
1162       const unsigned increment = 8;
1163 
1164       fences->max = idx + increment;
1165       size = fences->max * sizeof(fences->list[0]);
1166       fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1167    }
1168    amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1169 }
1170 
1171 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
1172                                            struct pipe_fence_handle *pfence)
1173 {
1174    struct amdgpu_cs *acs = amdgpu_cs(rcs);
1175    struct amdgpu_cs_context *cs = acs->csc;
1176    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1177 
1178    util_queue_fence_wait(&fence->submitted);
1179 
1180    if (!fence->imported) {
1181       /* Ignore idle fences. This will only check the user fence in memory. */
1182       if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1183          add_seq_no_to_list(acs->ws, &cs->seq_no_dependencies, fence->queue_index,
1184                             fence->queue_seq_no);
1185       }
1186    }
1187    else
1188       add_fence_to_list(&cs->syncobj_dependencies, fence);
1189 }
1190 
1191 static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
1192                                               struct amdgpu_cs_context *cs,
1193                                               unsigned queue_index_bit,
1194                                               struct amdgpu_seq_no_fences *dependencies,
1195                                               struct amdgpu_winsys_bo *bo, unsigned usage)
1196 {
1197    if (usage & RADEON_USAGE_SYNCHRONIZED) {
1198       /* Add BO fences from queues other than 'queue_index' to dependencies. */
1199       u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1200          add_seq_no_to_list(ws, dependencies, other_queue_idx,
1201                             bo->fences.seq_no[other_queue_idx]);
1202       }
1203 
1204       if (bo->alt_fence)
1205          add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1206    }
1207 }
1208 
1209 static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
1210                                  uint_seq_no new_queue_seq_no)
1211 {
1212    bo->fences.seq_no[queue_index] = new_queue_seq_no;
1213    bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1214 }
1215 
1216 static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
1217                                          struct amdgpu_winsys_bo *bo, unsigned usage)
1218 {
1219    bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1220    bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1221 }
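/* Priority mapping above: util_last_bit() returns the 1-based position of
 * the highest set priority bit, so pairs of adjacent winsys priority bits
 * collapse into one kernel bo_priority level; e.g. bits 3 and 4 both map
 * to bo_priority 1.
 */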
1222 
1223 static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1224                                          struct pipe_fence_handle *fence)
1225 {
1226    struct amdgpu_cs *acs = amdgpu_cs(rws);
1227    struct amdgpu_cs_context *cs = acs->csc;
1228 
1229    add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1230 }
1231 
1232 /* The template parameter determines whether the queue should skip code used by the default queue
1233  * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1234  * for all BOs.
1235  */
1236 template<bool QUEUE_USES_ALT_FENCE>
1237 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1238 {
1239    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1240    struct amdgpu_winsys *ws = acs->ws;
1241    struct amdgpu_cs_context *cs = acs->cst;
1242    int r;
1243    uint64_t seq_no = 0;
1244    bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1245 
1246    assert(QUEUE_USES_ALT_FENCE == acs->uses_alt_fence);
1247 
1248    simple_mtx_lock(&ws->bo_fence_lock);
1249    unsigned queue_index;
1250    struct amdgpu_queue *queue;
1251    uint_seq_no prev_seq_no, next_seq_no;
1252 
1253    if (!QUEUE_USES_ALT_FENCE) {
1254       queue_index = acs->queue_index;
1255       queue = &ws->queues[queue_index];
1256       prev_seq_no = queue->latest_seq_no;
1257 
1258       /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1259        * but the values aren't related.
1260        */
1261       next_seq_no = prev_seq_no + 1;
1262 
1263       /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1264        * via the ioctl. We have to do this because we are going to release the oldest fence and
1265        * replace it with the latest fence in the ring.
1266        */
1267       struct pipe_fence_handle **oldest_fence =
1268          &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1269 
1270       if (*oldest_fence) {
1271          if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1272             /* Take the reference because the fence can be released by other threads after we
1273              * unlock the mutex.
1274              */
1275             struct pipe_fence_handle *tmp_fence = NULL;
1276             amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1277 
1278             /* Unlock the mutex before waiting. */
1279             simple_mtx_unlock(&ws->bo_fence_lock);
1280             amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1281             amdgpu_fence_reference(&tmp_fence, NULL);
1282             simple_mtx_lock(&ws->bo_fence_lock);
1283          }
1284 
1285          /* Remove the idle fence from the ring. */
1286          amdgpu_fence_reference(oldest_fence, NULL);
1287       }
1288    }
1289 
1290    /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1291     * sequence number per queue and removes all older ones.
1292     */
1293    struct amdgpu_seq_no_fences seq_no_dependencies;
1294    memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1295 
1296    if (!QUEUE_USES_ALT_FENCE) {
1297       /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1298        * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1299        * context. The reasons are:
1300        * - Our BO fence tracking only supports 1 queue per IP.
1301        * - IBs from different contexts must wait for each other and can't execute in a random order.
1302        */
1303       struct amdgpu_fence *prev_fence =
1304          (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1305 
1306       if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
1307          add_seq_no_to_list(ws, &seq_no_dependencies, queue_index, prev_seq_no);
1308    }
1309 
1310    /* Since the kernel driver doesn't synchronize execution between different
1311     * rings automatically, we have to add fence dependencies manually. This gathers sequence
1312     * numbers from BOs and sets the next sequence number in the BOs.
1313     */
1314 
1315    /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1316    struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1317    unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1318    unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1319    unsigned queue_index_bit = QUEUE_USES_ALT_FENCE ? 0 : BITFIELD_BIT(queue_index);
1320 
1321    for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1322       struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1323       struct amdgpu_winsys_bo *bo = buffer->bo;
1324 
1325       amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
1326                                         buffer->usage);
1327       if (QUEUE_USES_ALT_FENCE)
1328          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1329       else
1330          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1331 
1332       /* We didn't add any slab entries into the real buffer list that will be submitted
1333        * to the kernel. Do it now.
1334        */
1335       struct amdgpu_cs_buffer *real_buffer =
1336          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1337                                      &cs->buffer_lists[AMDGPU_BO_REAL], false);
1338 
1339       /* We need to set the usage because it determines the BO priority. */
1340       real_buffer->usage |= buffer->usage;
1341    }
1342 
1343    /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1344    unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1345    struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1346    unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1347    bool out_of_memory = false;
1348 
1349    for (unsigned i = 0; i < num_sparse_buffers; i++) {
1350       struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1351       struct amdgpu_winsys_bo *bo = buffer->bo;
1352 
1353       amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
1354                                         buffer->usage);
1355       if (QUEUE_USES_ALT_FENCE)
1356          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1357       else
1358          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1359 
1360       /* Add backing buffers of sparse buffers to the buffer list.
1361        *
1362        * This is done late, during submission, to keep the buffer list short before
1363        * submit, and to avoid managing fences for the backing buffers.
1364        */
1365       struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1366 
1367       simple_mtx_lock(&sparse_bo->commit_lock);
      list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
         /* We can directly add the buffer here, because we know that each
          * backing buffer occurs only once.
          */
         struct amdgpu_cs_buffer *real_buffer =
            amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
         if (!real_buffer) {
            fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
            r = -ENOMEM;
            out_of_memory = true;
            /* Skip this backing buffer instead of dereferencing the NULL pointer, and
             * don't unlock here so that the unlock after the loop stays the only one.
             */
            continue;
         }

         real_buffer->usage = buffer->usage;
      }
      simple_mtx_unlock(&sparse_bo->commit_lock);
   }

   /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
   unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
   struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
   struct drm_amdgpu_bo_list_entry *bo_list =
      (struct drm_amdgpu_bo_list_entry *)
      alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
   unsigned i;

   for (i = 0; i < initial_num_real_buffers; i++) {
      struct amdgpu_cs_buffer *buffer = &real_buffers[i];
      struct amdgpu_winsys_bo *bo = buffer->bo;

      amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
                                        buffer->usage);
      if (QUEUE_USES_ALT_FENCE)
         amdgpu_fence_reference(&bo->alt_fence, cs->fence);
      else
         amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);

      amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
   }

   /* These are backing buffers of slab entries. Don't add their fence dependencies. */
   for (; i < num_real_buffers_except_sparse; i++) {
      struct amdgpu_cs_buffer *buffer = &real_buffers[i];
      struct amdgpu_winsys_bo *bo = buffer->bo;

      if (QUEUE_USES_ALT_FENCE)
         get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
      else
         amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);

      amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
   }

   /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
   for (; i < num_real_buffers; ++i) {
      struct amdgpu_cs_buffer *buffer = &real_buffers[i];

      amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
   }
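
   /* At this point the real buffer list (and bo_list) is laid out as:
    *   [0, initial_num_real_buffers)                                - regular real BOs
    *   [initial_num_real_buffers, num_real_buffers_except_sparse)   - slab backing BOs
    *   [num_real_buffers_except_sparse, num_real_buffers)           - sparse backing BOs
    * which is why the three loops above handle fence tracking differently per range.
    */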

#if 0 /* Debug code. */
   printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);

   /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
   for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
      if (i == acs->queue_index)
         continue;

      struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
      if (!fence) {
         if (i <= 1)
            printf("      queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
         continue;
      }

      bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
      uint_seq_no old = seq_no_dependencies.seq_no[i];
      add_seq_no_to_list(ws, &seq_no_dependencies, i, ws->queues[i].latest_seq_no);
      uint_seq_no new = seq_no_dependencies.seq_no[i];

      if (!valid)
         printf("   missing dependency on queue=%u, seq_no=%u\n", i, new);
      else if (old != new)
         printf("   too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
      else
         printf("   has dependency on queue=%u, seq_no=%u\n", i, old);
   }
#endif

   /* Convert the sequence numbers we gathered to fence dependencies. */
   u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
      struct pipe_fence_handle **fence = get_fence_from_ring(ws, &seq_no_dependencies, i);

      if (fence) {
         /* If it's idle, don't add it to the list of dependencies. */
         if (amdgpu_fence_wait(*fence, 0, false))
            amdgpu_fence_reference(fence, NULL);
         else
            add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
      }
   }

   if (!QUEUE_USES_ALT_FENCE) {
      /* Finally, add the IB fence into the fence ring of the queue. */
      amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
      queue->latest_seq_no = next_seq_no;
      ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;

      /* Update the last used context in the queue. */
      amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
   }
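
   /* Note on the fence ring updated above: queue->fences has AMDGPU_FENCE_RING_SIZE
    * entries and is indexed with next_seq_no % AMDGPU_FENCE_RING_SIZE, so storing the
    * new fence releases (via amdgpu_fence_reference) whatever fence occupied that slot
    * AMDGPU_FENCE_RING_SIZE submissions ago.
    */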
   simple_mtx_unlock(&ws->bo_fence_lock);

#if DEBUG
   /* Prepare the buffer list. */
   if (ws->debug_all_bos) {
      /* The buffer list contains all buffers. This is a slow path that
       * ensures that no buffer is missing in the BO list.
       */
      simple_mtx_lock(&ws->global_bo_list_lock);
      bo_list = (struct drm_amdgpu_bo_list_entry *)
                alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
      num_real_buffers = 0;

      list_for_each_entry(struct amdgpu_bo_real, bo, &ws->global_bo_list, global_list_item) {
         bo_list[num_real_buffers].bo_handle = bo->kms_handle;
         bo_list[num_real_buffers].bo_priority = 0;
         ++num_real_buffers;
      }
      simple_mtx_unlock(&ws->global_bo_list_lock);
   }
#endif

   if (acs->ip_type == AMD_IP_GFX)
      ws->gfx_bo_list_counter += num_real_buffers;

   struct drm_amdgpu_cs_chunk chunks[8];
   unsigned num_chunks = 0;

   /* BO list */
   struct drm_amdgpu_bo_list_in bo_list_in;
   bo_list_in.operation = ~0;
   bo_list_in.list_handle = ~0;
   bo_list_in.bo_number = num_real_buffers;
   bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
   bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list;

   chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
   chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
   chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
   num_chunks++;
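
   /* Each chunk's length_dw is its payload size in dwords; the BO-handles chunk above is
    * sizeof(struct drm_amdgpu_bo_list_in) / 4 dwords and points at bo_list_in, which in
    * turn points at the bo_list array built earlier (operation/list_handle are set to ~0,
    * i.e. unused for this inline list).
    */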

   /* Syncobj dependencies. */
   unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
   if (num_syncobj_dependencies) {
      struct drm_amdgpu_cs_chunk_sem *sem_chunk =
         (struct drm_amdgpu_cs_chunk_sem *)
         alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));

      for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
         struct amdgpu_fence *fence =
            (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];

         assert(util_queue_fence_is_signalled(&fence->submitted));
         sem_chunk[i].handle = fence->syncobj;
      }

      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
      chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
      chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
      num_chunks++;
   }

   /* Syncobj signals. */
   unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
   struct drm_amdgpu_cs_chunk_sem *sem_chunk =
      (struct drm_amdgpu_cs_chunk_sem *)
      alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));

   for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
      struct amdgpu_fence *fence =
         (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];

      sem_chunk[i].handle = fence->syncobj;
   }
   sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;

   chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
   chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
   chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
   num_chunks++;
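
   /* The CS's own syncobj is always part of the signal set: sem_chunk has
    * cs->syncobj_to_signal.num + 1 entries and the last slot (index
    * cs->syncobj_to_signal.num) holds the syncobj of cs->fence.
    */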

   if (ws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
      chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
      chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
      num_chunks++;
   }

   /* Fence */
   if (has_user_fence) {
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
      chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
      chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
      num_chunks++;
   }

   /* Preamble IB */
   if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
      chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
      chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
      num_chunks++;
   }

   /* Main IB */
   cs->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
   chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
   chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
   chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
   num_chunks++;

   if (cs->secure) {
      cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
      cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
   } else {
      cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
      cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
   }
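
   /* Secure submissions: AMDGPU_IB_FLAGS_SECURE marks the IBs as protected (TMZ) work.
    * The flag is set or cleared on both the preamble and main IBs above, so a single CS
    * is either entirely secure or entirely non-secure.
    */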

   bool noop = acs->noop;

   if (noop && acs->ip_type == AMD_IP_GFX) {
      /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
      unsigned noop_dw_size = ws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
      assert(cs->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);

      cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
      cs->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
      noop = false;
   }
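
   /* The noop GFX IB above is a single PKT3_NOP: the packet count field encodes the number
    * of payload dwords minus one, so count = noop_dw_size - 2 makes the whole packet
    * (1 header dword + noop_dw_size - 1 payload dwords) exactly noop_dw_size dwords long.
    */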

   assert(num_chunks <= ARRAY_SIZE(chunks));

   if (out_of_memory) {
      r = -ENOMEM;
   } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
      r = -ECANCELED;
   } else if (unlikely(noop)) {
      r = 0;
   } else {
      /* Submit the command buffer.
       *
       * The kernel often returns -ENOMEM when many processes submit in parallel and use GDS
       * (e.g. test suites), but the submission eventually succeeds after enough attempts.
       * This happens frequently with dEQP using NGG streamout.
       */
      r = 0;

      do {
         /* Wait 1 ms and try again. */
         if (r == -ENOMEM)
            os_time_sleep(1000);

         r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, 0, num_chunks, chunks, &seq_no);
      } while (r == -ENOMEM);

      if (!r) {
         /* Success. */
         uint64_t *user_fence = NULL;

         /* 4 QWORDs are reserved for the user fence:
          *   QWORD[0]: completed fence
          *   QWORD[1]: preempted fence
          *   QWORD[2]: reset fence
          *   QWORD[3]: preempted then reset
          */
         if (has_user_fence)
            user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
         amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
      }
   }

   if (unlikely(r)) {
      if (r == -ECANCELED) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
      } else if (r == -ENODATA) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a soft recovery.\n");
      } else if (r == -ETIME) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery.\n");
      } else {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
                                        PIPE_UNKNOWN_CONTEXT_RESET,
                                        "amdgpu: The CS has been rejected, "
                                        "see dmesg for more information (%i).\n",
                                        r);
      }
   }

   /* If there was an error or this was a noop submit, signal the fence, because it
    * won't be signalled by the hardware. */
   if (r || noop)
      amdgpu_fence_signalled(cs->fence);

   if (unlikely(ws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
      acs->mcbp_fw_shadow_chunk.flags = 0;

   cs->error_code = r;

   /* Clear the buffer lists. */
   for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
      struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
      unsigned num_buffers = cs->buffer_lists[list].num_buffers;

      if (list == AMDGPU_BO_REAL) {
         /* Only decrement num_active_ioctls and unref where we incremented them.
          * We did both for regular real BOs. We only incremented the refcount for sparse
          * backing BOs.
          */
         /* Regular real BOs. */
         for (unsigned i = 0; i < initial_num_real_buffers; i++) {
            p_atomic_dec(&buffers[i].bo->num_active_ioctls);
            amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
         }

         /* Do nothing for slab BOs. */

         /* Sparse backing BOs. */
         for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
            amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
      } else {
         for (unsigned i = 0; i < num_buffers; i++) {
            p_atomic_dec(&buffers[i].bo->num_active_ioctls);
            amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
         }
      }

      cs->buffer_lists[list].num_buffers = 0;
   }

   amdgpu_cs_context_cleanup(ws, cs);
}

/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   util_queue_fence_wait(&cs->flush_completed);
}

static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
                           unsigned flags,
                           struct pipe_fence_handle **fence)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *ws = cs->ws;
   int error_code = 0;
   uint32_t ib_pad_dw_mask = ws->info.ip[cs->ip_type].ib_pad_dw_mask;

   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);

   /* Pad the IB according to the mask. */
   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      if (ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, SDMA_NOP_PAD);
      }
      break;
   case AMD_IP_GFX:
   case AMD_IP_COMPUTE:
      amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
      if (cs->ip_type == AMD_IP_GFX)
         ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
      break;
   case AMD_IP_UVD:
   case AMD_IP_UVD_ENC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   case AMD_IP_VCN_JPEG:
      if (rcs->current.cdw % 2)
         assert(0);
      while (rcs->current.cdw & ib_pad_dw_mask) {
         radeon_emit(rcs, 0x60000000); /* nop packet */
         radeon_emit(rcs, 0x00000000);
      }
      break;
   case AMD_IP_VCN_DEC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
   default:
      break;
   }
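
   /* Example of the padding above: with ib_pad_dw_mask == 7 the IB must be a multiple of
    * 8 dwords, so a 13-dword IB gets 3 NOP dwords appended. For GFX/compute the padding is
    * emitted inside amdgpu_pad_gfx_compute_ib(); the other IPs emit their own NOP encodings
    * until (cdw & ib_pad_dw_mask) == 0.
    */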

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "amdgpu: command stream overflowed\n");
   }

   /* Submit only if the CS is not empty, hasn't overflowed, and isn't a noop flush. */
   if (likely(radeon_emitted(rcs, 0) &&
       rcs->current.cdw <= rcs->current.max_dw &&
       !(flags & RADEON_FLUSH_NOOP))) {
      struct amdgpu_cs_context *cur = cs->csc;

      /* Set IB sizes. */
      amdgpu_ib_finalize(ws, rcs, &cs->main_ib, cs->ip_type);

      /* Create a fence. */
      amdgpu_fence_reference(&cur->fence, NULL);
      if (cs->next_fence) {
         /* just move the reference */
         cur->fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         cur->fence = amdgpu_fence_create(cs);
      }
      if (fence)
         amdgpu_fence_reference(fence, cur->fence);

      for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
         unsigned num_buffers = cur->buffer_lists[i].num_buffers;
         struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;

         for (unsigned j = 0; j < num_buffers; j++)
            p_atomic_inc(&buffers[j].bo->num_active_ioctls);
      }

      amdgpu_cs_sync_flush(rcs);

      /* Swap command streams. "cst" is going to be submitted. */
      rcs->csc = cs->csc = cs->cst;
      cs->cst = cur;

      /* Submit. */
      util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
                         cs->uses_alt_fence ? amdgpu_cs_submit_ib<true>
                                            : amdgpu_cs_submit_ib<false>, NULL, 0);
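
      /* The submission itself runs asynchronously on ws->cs_queue; cs->flush_completed is
       * signaled when amdgpu_cs_submit_ib finishes, which is what amdgpu_cs_sync_flush()
       * waits on. The template argument picks the alt-fence vs. sequence-number fence
       * tracking variant of the submit function.
       */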

      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->cst->secure;
      else
         cs->csc->secure = cs->cst->secure;

      if (!(flags & PIPE_FLUSH_ASYNC)) {
         amdgpu_cs_sync_flush(rcs);
         error_code = cur->error_code;
      }
   } else {
      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->csc->secure;

      amdgpu_cs_context_cleanup_buffers(ws, cs->csc);
      amdgpu_cs_context_cleanup(ws, cs->csc);
   }

   memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   amdgpu_get_new_ib(ws, rcs, &cs->main_ib, cs);

   if (cs->preamble_ib_bo) {
      amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
                           RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
   }
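
   /* The preamble IB BO (if any) is re-added here, presumably so that it is present in the
    * buffer list of every submission, since the new CS starts with empty buffer lists.
    */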

   if (cs->ip_type == AMD_IP_GFX)
      ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      ws->num_sdma_IBs++;

   return error_code;
}

static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   if (!cs)
      return;

   amdgpu_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main_ib.big_buffer, NULL);
   FREE(rcs->prev);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
   amdgpu_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer_lean *_buf,
                                    unsigned usage)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;

   return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
}

static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
                                                uint64_t csa_va)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
   cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
   cs->mcbp_fw_shadow_chunk.gds_va = 0;
   cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
}

static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
                                          struct pipe_fence_handle **dst,
                                          struct pipe_fence_handle *src)
{
   amdgpu_fence_reference(dst, src);
}
void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
{
   ws->base.ctx_create = amdgpu_ctx_create;
   ws->base.ctx_destroy = amdgpu_ctx_destroy;
   ws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
   ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
   ws->base.cs_create = amdgpu_cs_create;
   ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
   ws->base.cs_destroy = amdgpu_cs_destroy;
   ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
   ws->base.cs_validate = amdgpu_cs_validate;
   ws->base.cs_check_space = amdgpu_cs_check_space;
   ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
   ws->base.cs_flush = amdgpu_cs_flush;
   ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
   ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
   ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
   ws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
   ws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
   ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
   ws->base.fence_reference = amdgpu_winsys_fence_reference;
   ws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
   ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
   ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
   ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;

   if (ws->aws->info.has_fw_based_shadowing)
      ws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
}