1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * Copyright © 2015 Advanced Micro Devices, Inc.
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "util/os_time.h"
12 #include <inttypes.h>
13 #include <stdio.h>
14
15 #include "amd/common/sid.h"
16
17 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
18 * codes in the kernel).
19 */
20 #if DETECT_OS_OPENBSD
21 #define ENODATA ENOTSUP
22 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
23 #define ENODATA ECONNREFUSED
24 #endif
25
26 /* FENCES */
27
void amdgpu_fence_destroy(struct amdgpu_fence *fence)
29 {
30 amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
31
32 if (fence->ctx)
33 amdgpu_ctx_reference(&fence->ctx, NULL);
34
35 util_queue_fence_destroy(&fence->submitted);
36 FREE(fence);
37 }
38
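/* Create a syncobj-backed fence for a CS. The "submitted" util_queue_fence starts
 * out reset and is signalled once the IB has actually been handed to the kernel.
 */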
static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_cs *cs)
41 {
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   struct amdgpu_ctx *ctx = cs->ctx;

   if (!fence)
      return NULL;

   fence->reference.count = 1;
   fence->ws = ctx->ws;
   amdgpu_ctx_reference(&fence->ctx, ctx);
   fence->ip_type = cs->ip_type;
   if (amdgpu_cs_create_syncobj2(ctx->ws->dev, 0, &fence->syncobj)) {
      amdgpu_ctx_reference(&fence->ctx, NULL);
      FREE(fence);
      return NULL;
   }
54
55 util_queue_fence_init(&fence->submitted);
56 util_queue_fence_reset(&fence->submitted);
57 fence->queue_index = cs->queue_index;
58 return (struct pipe_fence_handle *)fence;
59 }
60
static struct pipe_fence_handle *
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
63 {
64 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
65 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
66 int r;
67
68 if (!fence)
69 return NULL;
70
71 pipe_reference_init(&fence->reference, 1);
72 fence->ws = ws;
73 fence->ip_type = 0xffffffff;
74
75 r = amdgpu_cs_import_syncobj(ws->dev, fd, &fence->syncobj);
76 if (r) {
77 FREE(fence);
78 return NULL;
79 }
80
81 util_queue_fence_init(&fence->submitted);
82 fence->imported = true;
83
84 return (struct pipe_fence_handle*)fence;
85 }
86
static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
89 {
90 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
91 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
92
93 if (!fence)
94 return NULL;
95
96 pipe_reference_init(&fence->reference, 1);
97 fence->ws = ws;
98 /* fence->ctx == NULL means that the fence is syncobj-based. */
99
100 /* Convert sync_file into syncobj. */
101 int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj);
102 if (r) {
103 FREE(fence);
104 return NULL;
105 }
106
107 r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
108 if (r) {
109 amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
110 FREE(fence);
111 return NULL;
112 }
113
114 util_queue_fence_init(&fence->submitted);
115 fence->imported = true;
116
117 return (struct pipe_fence_handle*)fence;
118 }
119
static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                         struct pipe_fence_handle *pfence)
122 {
123 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
124 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
125 int fd, r;
126
127 util_queue_fence_wait(&fence->submitted);
128
129 /* Convert syncobj into sync_file. */
130 r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
131 return r ? -1 : fd;
132 }
133
static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
135 {
136 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
137 uint32_t syncobj;
138 int fd = -1;
139
140 int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
141 &syncobj);
142 if (r) {
143 return -1;
144 }
145
146 r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
147 if (r) {
148 fd = -1;
149 }
150
151 amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
152 return fd;
153 }
154
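/* Called once the IB has been submitted: record the kernel sequence number and the
 * user fence location, then mark the fence as submitted.
 */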
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
                                   uint64_t seq_no,
                                   uint64_t *user_fence_cpu_address)
158 {
159 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
160
161 afence->seq_no = seq_no;
162 afence->user_fence_cpu_address = user_fence_cpu_address;
163 util_queue_fence_signal(&afence->submitted);
164 }
165
static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
167 {
168 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
169
170 afence->signalled = true;
171 util_queue_fence_signal(&afence->submitted);
172 }
173
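/* Wait for a fence to signal. The user fence in memory is checked first; the syncobj
 * wait ioctl is only used when that check is inconclusive.
 */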
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                       bool absolute)
176 {
177 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
178 int64_t abs_timeout;
179 uint64_t *user_fence_cpu;
180
181 if (afence->signalled)
182 return true;
183
184 if (absolute)
185 abs_timeout = timeout;
186 else
187 abs_timeout = os_time_get_absolute_timeout(timeout);
188
189 /* The fence might not have a number assigned if its IB is being
190 * submitted in the other thread right now. Wait until the submission
191 * is done. */
192 if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
193 return false;
194
195 user_fence_cpu = afence->user_fence_cpu_address;
196 if (user_fence_cpu) {
197 if (*user_fence_cpu >= afence->seq_no) {
198 afence->signalled = true;
199 return true;
200 }
201
202 /* No timeout, just query: no need for the ioctl. */
203 if (!absolute && !timeout)
204 return false;
205 }
206
207 if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
208 abs_timeout = INT64_MAX;
209
   if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
                              abs_timeout, 0, NULL))
      return false;
214
215 afence->signalled = true;
216 return true;
217 }
218
static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *fence,
                                          uint64_t timeout)
222 {
223 return amdgpu_fence_wait(fence, timeout, false);
224 }
225
static struct pipe_fence_handle *
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
228 {
229 struct amdgpu_cs *cs = amdgpu_cs(rcs);
230 struct pipe_fence_handle *fence = NULL;
231
232 if (cs->noop)
233 return NULL;
234
235 if (cs->next_fence) {
236 amdgpu_fence_reference(&fence, cs->next_fence);
237 return fence;
238 }
239
240 fence = amdgpu_fence_create(cs);
241 if (!fence)
242 return NULL;
243
244 amdgpu_fence_reference(&cs->next_fence, fence);
245 return fence;
246 }
247
248 /* CONTEXTS */
249
static uint32_t
radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
252 {
253 switch (radeon_priority) {
254 case RADEON_CTX_PRIORITY_REALTIME:
255 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
256 case RADEON_CTX_PRIORITY_HIGH:
257 return AMDGPU_CTX_PRIORITY_HIGH;
258 case RADEON_CTX_PRIORITY_MEDIUM:
259 return AMDGPU_CTX_PRIORITY_NORMAL;
260 case RADEON_CTX_PRIORITY_LOW:
261 return AMDGPU_CTX_PRIORITY_LOW;
262 default:
263 unreachable("Invalid context priority");
264 }
265 }
266
static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws,
                                                   enum radeon_ctx_priority priority,
                                                   bool allow_context_lost)
270 {
271 struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
272 int r;
273 struct amdgpu_bo_alloc_request alloc_buffer = {};
274 uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
275 amdgpu_bo_handle buf_handle;
276
277 if (!ctx)
278 return NULL;
279
280 ctx->ws = amdgpu_winsys(ws);
281 ctx->reference.count = 1;
282 ctx->allow_context_lost = allow_context_lost;
283
284 r = amdgpu_cs_ctx_create2(ctx->ws->dev, amdgpu_priority, &ctx->ctx);
285 if (r) {
286 fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
287 goto error_create;
288 }
289
290 alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
291 alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
292 alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
293
294 r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
295 if (r) {
296 fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
297 goto error_user_fence_alloc;
298 }
299
300 r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
301 if (r) {
302 fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
303 goto error_user_fence_map;
304 }
305
306 memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
307 ctx->user_fence_bo = buf_handle;
308
309 return (struct radeon_winsys_ctx*)ctx;
310
311 error_user_fence_map:
312 amdgpu_bo_free(buf_handle);
313 error_user_fence_alloc:
314 amdgpu_cs_ctx_free(ctx->ctx);
315 error_create:
316 FREE(ctx);
317 return NULL;
318 }
319
static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
321 {
322 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
323
324 amdgpu_ctx_reference(&ctx, NULL);
325 }
326
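/* Pad a GFX/compute IB with NOPs so that its size (plus "leave_dw_space" dwords that
 * the caller will append) is a multiple of the required IB alignment.
 */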
static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *ws, enum amd_ip_type ip_type,
                                      uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
329 {
330 unsigned pad_dw_mask = ws->info.ip[ip_type].ib_pad_dw_mask;
331 unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
332
333 if (unaligned_dw) {
334 int remaining = pad_dw_mask + 1 - unaligned_dw;
335
336 /* Only pad by 1 dword with the type-2 NOP if necessary. */
337 if (remaining == 1 && ws->info.gfx_ib_pad_with_type2) {
338 ib[(*num_dw)++] = PKT2_NOP_PAD;
339 } else {
340 /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
341 * packet. The size of the packet body after the header is always count + 1.
342 * If count == -1, there is no packet body. NOP is the only packet that can have
343 * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
344 */
345 ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
346 *num_dw += remaining - 1;
347 }
348 }
349 assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
350 }
351
static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
353 {
354 struct amdgpu_bo_alloc_request request = {0};
355 struct drm_amdgpu_bo_list_in bo_list_in;
356 struct drm_amdgpu_cs_chunk_ib ib_in = {0};
357 amdgpu_bo_handle buf_handle;
358 amdgpu_va_handle va_handle = NULL;
359 struct drm_amdgpu_cs_chunk chunks[2];
360 struct drm_amdgpu_bo_list_entry list;
361 unsigned noop_dw_size;
362 void *cpu = NULL;
363 uint64_t seq_no;
364 uint64_t va;
365 int r;
366
367 /* Older amdgpu doesn't report if the reset is complete or not. Detect
368 * it by submitting a no-op job. If it reports an error, then assume
369 * that the reset is not complete.
370 */
371 amdgpu_context_handle temp_ctx;
372 r = amdgpu_cs_ctx_create2(ctx->ws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx);
373 if (r)
374 return r;
375
376 request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
377 request.alloc_size = 4096;
378 request.phys_alignment = 4096;
379 r = amdgpu_bo_alloc(ctx->ws->dev, &request, &buf_handle);
380 if (r)
381 goto destroy_ctx;
382
383 r = amdgpu_va_range_alloc(ctx->ws->dev, amdgpu_gpu_va_range_general,
384 request.alloc_size, request.phys_alignment,
385 0, &va, &va_handle,
386 AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
387 if (r)
388 goto destroy_bo;
389 r = amdgpu_bo_va_op_raw(ctx->ws->dev, buf_handle, 0, request.alloc_size, va,
390 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
391 AMDGPU_VA_OP_MAP);
392 if (r)
393 goto destroy_bo;
394
395 r = amdgpu_bo_cpu_map(buf_handle, &cpu);
396 if (r)
397 goto destroy_bo;
398
399 noop_dw_size = ctx->ws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
400 ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
401
402 amdgpu_bo_cpu_unmap(buf_handle);
403
404 amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &list.bo_handle);
405 list.bo_priority = 0;
406
407 bo_list_in.list_handle = ~0;
408 bo_list_in.bo_number = 1;
409 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
410 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
411
412 ib_in.ip_type = AMD_IP_GFX;
413 ib_in.ib_bytes = noop_dw_size * 4;
414 ib_in.va_start = va;
415
416 chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
417 chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
418 chunks[0].chunk_data = (uintptr_t)&bo_list_in;
419
420 chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
421 chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
422 chunks[1].chunk_data = (uintptr_t)&ib_in;
423
424 r = amdgpu_cs_submit_raw2(ctx->ws->dev, temp_ctx, 0, 2, chunks, &seq_no);
425
426 destroy_bo:
427 if (va_handle)
428 amdgpu_va_range_free(va_handle);
429 amdgpu_bo_free(buf_handle);
430 destroy_ctx:
431 amdgpu_cs_ctx_free(temp_ctx);
432
433 return r;
434 }
435
static void
amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
                               const char *format, ...)
439 {
440 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
441
442 /* Don't overwrite the last reset status. */
443 if (ctx->sw_status != PIPE_NO_RESET)
444 return;
445
446 ctx->sw_status = status;
447
448 if (!ctx->allow_context_lost) {
449 va_list args;
450
451 va_start(args, format);
452 vfprintf(stderr, format, args);
453 va_end(args);
454
455 /* Non-robust contexts are allowed to terminate the process. The only alternative is
456 * to skip command submission, which would look like a freeze because nothing is drawn,
457 * which looks like a hang without any reset.
458 */
459 abort();
460 }
461 }
462
static enum pipe_reset_status
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
                              bool *needs_reset, bool *reset_completed)
466 {
467 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
468
469 if (needs_reset)
470 *needs_reset = false;
471 if (reset_completed)
472 *reset_completed = false;
473
474 /* Return a failure due to a GPU hang. */
475 uint64_t flags;
476
477 if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
478 /* If the caller is only interested in full reset (= wants to ignore soft
479 * recoveries), we can use the rejected cs count as a quick first check.
480 */
481 return PIPE_NO_RESET;
482 }
483
484 /*
485 * ctx->sw_status is updated on alloc/ioctl failures.
486 *
487 * We only rely on amdgpu_cs_query_reset_state2 to tell us
488 * that the context reset is complete.
489 */
490 if (ctx->sw_status != PIPE_NO_RESET) {
491 int r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
492 if (!r) {
493 if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
494 if (reset_completed) {
495 /* The ARB_robustness spec says:
496 *
497 * If a reset status other than NO_ERROR is returned and subsequent
498 * calls return NO_ERROR, the context reset was encountered and
499 * completed. If a reset status is repeatedly returned, the context may
500 * be in the process of resetting.
501 *
502 * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
503 * so don't do anything special. On older kernels, submit a no-op cs. If it
504 * succeeds then assume the reset is complete.
505 */
506 if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
507 *reset_completed = true;
508
509 if (ctx->ws->info.drm_minor < 54 && ctx->ws->info.has_graphics)
510 *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
511 }
512 }
513 } else {
514 fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
515 }
516
517 /* Return a failure due to SW issues. */
518 if (needs_reset)
519 *needs_reset = true;
520 return ctx->sw_status;
521 }
522
523 if (needs_reset)
524 *needs_reset = false;
525 return PIPE_NO_RESET;
526 }
527
528 /* COMMAND SUBMISSION */
529
static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
531 {
532 return acs->ip_type == AMD_IP_GFX ||
533 acs->ip_type == AMD_IP_COMPUTE ||
534 acs->ip_type == AMD_IP_SDMA;
535 }
536
static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
538 {
539 if (cs->has_chaining)
540 return 4; /* for chaining */
541
542 return 0;
543 }
544
static struct amdgpu_cs_buffer *
amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                     struct amdgpu_buffer_list *list)
548 {
549 int num_buffers = list->num_buffers;
550 struct amdgpu_cs_buffer *buffers = list->buffers;
551 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
552 int i = cs->buffer_indices_hashlist[hash];
553
   /* An empty hash slot means the buffer is not in the list. */
555 if (i < 0)
556 return NULL;
557
558 if (i < num_buffers && buffers[i].bo == bo)
559 return &buffers[i];
560
561 /* Hash collision, look for the BO in the list of buffers linearly. */
562 for (int i = num_buffers - 1; i >= 0; i--) {
563 if (buffers[i].bo == bo) {
564 /* Put this buffer in the hash list.
565 * This will prevent additional hash collisions if there are
566 * several consecutive lookup_buffer calls for the same buffer.
567 *
568 * Example: Assuming buffers A,B,C collide in the hash list,
569 * the following sequence of buffers:
570 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
571 * will collide here: ^ and here: ^,
572 * meaning that we should get very few collisions in the end. */
573 cs->buffer_indices_hashlist[hash] = i & 0x7fff;
574 return &buffers[i];
575 }
576 }
577 return NULL;
578 }
579
struct amdgpu_cs_buffer *
amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
582 {
583 return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
584 }
585
static struct amdgpu_cs_buffer *
amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                     struct amdgpu_buffer_list *list, bool add_ref)
589 {
590 /* New buffer, check if the backing array is large enough. */
591 if (unlikely(list->num_buffers >= list->max_buffers)) {
592 unsigned new_max =
593 MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
594 struct amdgpu_cs_buffer *new_buffers;
595
596 new_buffers = (struct amdgpu_cs_buffer *)
597 REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
598 new_max * sizeof(*new_buffers));
599 if (!new_buffers) {
600 fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
601 return NULL;
602 }
603
604 list->max_buffers = new_max;
605 list->buffers = new_buffers;
606 }
607
608 unsigned idx = list->num_buffers++;
609 struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
610 if (add_ref)
611 p_atomic_inc(&bo->base.reference.count);
612 buffer->bo = bo;
613 buffer->usage = 0;
614
615 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
616 cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
617 return buffer;
618 }
619
static struct amdgpu_cs_buffer *
amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                            struct amdgpu_buffer_list *list, bool add_ref)
623 {
624 struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
625
626 return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
627 }
628
static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer_lean *buf,
                                     unsigned usage,
                                     enum radeon_bo_domain domains)
633 {
634 /* Don't use the "domains" parameter. Amdgpu doesn't support changing
635 * the buffer placement during command submission.
636 */
637 struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
638 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
639 struct amdgpu_cs_buffer *buffer;
640
641 /* Fast exit for no-op calls.
642 * This is very effective with suballocators and linear uploaders that
643 * are outside of the winsys.
644 */
645 if (bo == cs->last_added_bo &&
646 (usage & cs->last_added_bo_usage) == usage)
647 return 0;
648
649 buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
650 if (!buffer)
651 return 0;
652
653 buffer->usage |= usage;
654
655 cs->last_added_bo_usage = buffer->usage;
656 cs->last_added_bo = bo;
657 return 0;
658 }
659
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws,
                                 struct amdgpu_ib *main_ib,
                                 struct amdgpu_cs *cs)
663 {
664 struct pb_buffer_lean *pb;
665 uint8_t *mapped;
666 unsigned buffer_size;
667
668 /* Always create a buffer that is at least as large as the maximum seen IB size,
669 * aligned to a power of two.
670 */
671 buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
672
   /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
674 if (!cs->has_chaining)
675 buffer_size *= 4;
676
677 const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
678 /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
679 const unsigned max_size = 2 * 1024 * 1024;
680
681 buffer_size = MIN2(buffer_size, max_size);
682 buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
683
   /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
    * Writing to GTT WC ranges from no difference to very slow, and VRAM is very slow
    * a lot more often.
687 *
688 * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
689 * and doesn't have to wait for cached GL2 requests to be processed.
690 */
691 enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
692 unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
693 RADEON_FLAG_GL2_BYPASS;
694
695 if (cs->ip_type == AMD_IP_GFX ||
696 cs->ip_type == AMD_IP_COMPUTE ||
697 cs->ip_type == AMD_IP_SDMA) {
698 /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
699 * on Navi 14
700 */
701 flags |= RADEON_FLAG_32BIT;
702 }
703
704 pb = amdgpu_bo_create(ws, buffer_size,
705 ws->info.gart_page_size,
706 domain, (radeon_bo_flag)flags);
707 if (!pb)
708 return false;
709
710 mapped = (uint8_t*)amdgpu_bo_map(&ws->dummy_ws.base, pb, NULL, PIPE_MAP_WRITE);
711 if (!mapped) {
712 radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
713 return false;
714 }
715
716 radeon_bo_reference(&ws->dummy_ws.base, &main_ib->big_buffer, pb);
717 radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
718
719 main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
720 main_ib->big_buffer_cpu_ptr = mapped;
721 main_ib->used_ib_space = 0;
722
723 return true;
724 }
725
static bool amdgpu_get_new_ib(struct amdgpu_winsys *ws,
                              struct radeon_cmdbuf *rcs,
                              struct amdgpu_ib *main_ib,
                              struct amdgpu_cs *cs)
730 {
731 struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
732 /* This is the minimum size of a contiguous IB. */
733 unsigned ib_size = 16 * 1024;
734
735 /* Always allocate at least the size of the biggest cs_check_space call,
736 * because precisely the last call might have requested this size.
737 */
738 ib_size = MAX2(ib_size, main_ib->max_check_space_size);
739
740 if (!cs->has_chaining) {
741 ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
742 IB_MAX_SUBMIT_BYTES));
743 }
744
745 /* Decay the IB buffer size over time, so that memory usage decreases after
746 * a temporary peak.
747 */
748 main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
749
750 rcs->prev_dw = 0;
751 rcs->num_prev = 0;
752 rcs->current.cdw = 0;
753 rcs->current.buf = NULL;
754
755 /* Allocate a new buffer for IBs if the current buffer is all used. */
756 if (!main_ib->big_buffer ||
757 main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
758 if (!amdgpu_ib_new_buffer(ws, main_ib, cs))
759 return false;
760 }
761
762 chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
763 chunk_ib->ib_bytes = 0;
764 /* ib_bytes is in dwords and the conversion to bytes will be done before
765 * the CS ioctl. */
766 main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
767 main_ib->is_chained_ib = false;
768
769 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
770 (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
771 (radeon_bo_domain)0);
772
773 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
774
775 cs->csc->ib_main_addr = rcs->current.buf;
776
777 ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
778 rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
779 return true;
780 }
781
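/* Write the final command count into the location that holds the IB size: either the
 * ib_bytes field of the CS chunk or the size dword of the chaining INDIRECT_BUFFER packet.
 */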
static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
783 {
784 if (ib->is_chained_ib) {
785 *ib->ptr_ib_size = rcs->current.cdw |
786 S_3F2_CHAIN(1) | S_3F2_VALID(1) |
787 S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
788 } else {
789 *ib->ptr_ib_size = rcs->current.cdw;
790 }
791 }
792
static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct radeon_cmdbuf *rcs,
                               struct amdgpu_ib *ib, enum amd_ip_type ip_type)
795 {
796 amdgpu_set_ib_size(rcs, ib);
797 ib->used_ib_space += rcs->current.cdw * 4;
798 ib->used_ib_space = align(ib->used_ib_space, ws->info.ip[ip_type].ib_alignment);
799 ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
800 }
801
static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
                                   struct amdgpu_cs_context *cs,
                                   enum amd_ip_type ip_type)
805 {
806 for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
807 cs->chunk_ib[i].ip_type = ip_type;
808 cs->chunk_ib[i].flags = 0;
809
810 if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
811 /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
812 * is the beginning of IBs because completion of an IB doesn't care about the state of
813 * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
814 * executed in parallel, so draw calls from the current IB can finish after the next IB
815 * starts drawing, and so the cache flush at the end of IBs is usually late and thus
816 * useless.
817 */
818 cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
819 }
820 }
821
822 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
823 cs->last_added_bo = NULL;
824 return true;
825 }
826
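/* Drop all fence references held by the list and empty it. */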
static void cleanup_fence_list(struct amdgpu_fence_list *fences)
828 {
829 for (unsigned i = 0; i < fences->num; i++)
830 amdgpu_fence_drop_reference(fences->list[i]);
831 fences->num = 0;
832 }
833
static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
835 {
836 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
837 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
838 unsigned num_buffers = cs->buffer_lists[i].num_buffers;
839
840 for (unsigned j = 0; j < num_buffers; j++)
841 amdgpu_winsys_bo_drop_reference(ws, buffers[j].bo);
842
843 cs->buffer_lists[i].num_buffers = 0;
844 }
845 }
846
static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
848 {
849 cs->seq_no_dependencies.valid_fence_mask = 0;
850 cleanup_fence_list(&cs->syncobj_dependencies);
851 cleanup_fence_list(&cs->syncobj_to_signal);
852 amdgpu_fence_reference(&cs->fence, NULL);
853 cs->last_added_bo = NULL;
854 }
855
static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
857 {
858 amdgpu_cs_context_cleanup_buffers(ws, cs);
859 amdgpu_cs_context_cleanup(ws, cs);
860 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
861 FREE(cs->buffer_lists[i].buffers);
862 FREE(cs->syncobj_dependencies.list);
863 FREE(cs->syncobj_to_signal.list);
864 }
865
866
static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
868 {
869 struct amdgpu_cs *cs = amdgpu_cs(rcs);
870 return cs->ip_type;
871 }
872
static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
874 {
875 /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
876 return ip_type == AMD_IP_VCN_DEC ||
877 ip_type == AMD_IP_VCN_ENC ||
878 ip_type == AMD_IP_VCN_JPEG;
879 }
880
static bool
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
                 struct radeon_winsys_ctx *rwctx,
                 enum amd_ip_type ip_type,
                 void (*flush)(void *ctx, unsigned flags,
                               struct pipe_fence_handle **fence),
                 void *flush_ctx)
888 {
889 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
890 struct amdgpu_cs *cs;
891
892 cs = CALLOC_STRUCT(amdgpu_cs);
893 if (!cs) {
894 return false;
895 }
896
897 util_queue_fence_init(&cs->flush_completed);
898
899 cs->ws = ctx->ws;
900 cs->ctx = ctx;
901 cs->flush_cs = flush;
902 cs->flush_data = flush_ctx;
903 cs->ip_type = ip_type;
904 cs->noop = ctx->ws->noop_cs;
905 cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
906 (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
907
908 /* Compute the queue index by counting the IPs that have queues. */
909 assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
910 assert(ctx->ws->info.ip[ip_type].num_queues);
911
912 if (ip_uses_alt_fence(ip_type)) {
913 cs->queue_index = INT_MAX;
914 cs->uses_alt_fence = true;
915 } else {
916 cs->queue_index = 0;
917
918 for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
919 if (!ctx->ws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
920 continue;
921
922 if (i == ip_type)
923 break;
924
925 cs->queue_index++;
926 }
927 assert(cs->queue_index < AMDGPU_MAX_QUEUES);
928 }
929
930 struct amdgpu_cs_fence_info fence_info;
931 fence_info.handle = cs->ctx->user_fence_bo;
932 fence_info.offset = cs->ip_type * 4;
933 amdgpu_cs_chunk_fence_info_to_data(&fence_info,
934 (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
935
936 if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ip_type)) {
937 FREE(cs);
938 return false;
939 }
940
941 if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ip_type)) {
942 amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
943 FREE(cs);
944 return false;
945 }
946
947 memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
948
949 /* Set the first submission context as current. */
950 rcs->csc = cs->csc = &cs->csc1;
951 cs->cst = &cs->csc2;
952
953 /* Assign to both amdgpu_cs_context; only csc will use it. */
954 cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
955 cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
956
957 cs->csc1.ws = ctx->ws;
958 cs->csc2.ws = ctx->ws;
959
960 rcs->priv = cs;
961
962 if (!amdgpu_get_new_ib(ctx->ws, rcs, &cs->main_ib, cs)) {
963 amdgpu_destroy_cs_context(ctx->ws, &cs->csc2);
964 amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
965 FREE(cs);
966 rcs->priv = NULL;
967 return false;
968 }
969
970 p_atomic_inc(&ctx->ws->num_cs);
971 return true;
972 }
973
static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
                           unsigned preamble_num_dw)
977 {
978 struct amdgpu_cs *cs = amdgpu_cs(rcs);
979 struct amdgpu_winsys *ws = cs->ws;
980 struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
981 unsigned size = align(preamble_num_dw * 4, ws->info.ip[AMD_IP_GFX].ib_alignment);
982 struct pb_buffer_lean *preamble_bo;
983 uint32_t *map;
984
985 /* Create the preamble IB buffer. */
986 preamble_bo = amdgpu_bo_create(ws, size, ws->info.ip[AMD_IP_GFX].ib_alignment,
987 RADEON_DOMAIN_VRAM,
988 (radeon_bo_flag)
989 (RADEON_FLAG_NO_INTERPROCESS_SHARING |
990 RADEON_FLAG_GTT_WC |
991 RADEON_FLAG_READ_ONLY));
992 if (!preamble_bo)
993 return false;
994
995 map = (uint32_t*)amdgpu_bo_map(&ws->dummy_ws.base, preamble_bo, NULL,
996 (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
997 if (!map) {
998 radeon_bo_reference(&ws->dummy_ws.base, &preamble_bo, NULL);
999 return false;
1000 }
1001
1002 /* Upload the preamble IB. */
1003 memcpy(map, preamble_ib, preamble_num_dw * 4);
1004
1005 /* Pad the IB. */
1006 amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, map, &preamble_num_dw, 0);
1007 amdgpu_bo_unmap(&ws->dummy_ws.base, preamble_bo);
1008
1009 for (unsigned i = 0; i < 2; i++) {
1010 csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1011 csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1012
1013 csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1014 }
1015
1016 assert(!cs->preamble_ib_bo);
1017 cs->preamble_ib_bo = preamble_bo;
1018
1019 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1020 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1021 return true;
1022 }
1023
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1025 {
1026 return true;
1027 }
1028
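/* Ensure that at least "dw" more dwords can be emitted. If the current IB buffer is too
 * small and chaining is supported, allocate a new buffer and chain to it with an
 * INDIRECT_BUFFER packet; returning false tells the caller to flush instead.
 */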
static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1030 {
1031 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1032 struct amdgpu_ib *main_ib = &cs->main_ib;
1033
1034 assert(rcs->current.cdw <= rcs->current.max_dw);
1035
1036 unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1037
1038 if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1039 return false;
1040
1041 if (rcs->current.max_dw - rcs->current.cdw >= dw)
1042 return true;
1043
1044 unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1045 unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1046 /* 125% of the size for IB epilog. */
1047 unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1048 main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1049 main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1050
1051 if (!cs->has_chaining)
1052 return false;
1053
1054 /* Allocate a new chunk */
1055 if (rcs->num_prev >= rcs->max_prev) {
1056 unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1057 struct radeon_cmdbuf_chunk *new_prev;
1058
1059 new_prev = (struct radeon_cmdbuf_chunk*)
1060 REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1061 sizeof(*new_prev) * new_max_prev);
1062 if (!new_prev)
1063 return false;
1064
1065 rcs->prev = new_prev;
1066 rcs->max_prev = new_max_prev;
1067 }
1068
1069 if (!amdgpu_ib_new_buffer(cs->ws, main_ib, cs))
1070 return false;
1071
1072 assert(main_ib->used_ib_space == 0);
1073 uint64_t va = main_ib->gpu_address;
1074
1075 /* This space was originally reserved. */
1076 rcs->current.max_dw += cs_epilog_dw;
1077
1078 /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1079 amdgpu_pad_gfx_compute_ib(cs->ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1080
1081 radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1082 radeon_emit(rcs, va);
1083 radeon_emit(rcs, va >> 32);
1084 uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1085
1086 assert((rcs->current.cdw & cs->ws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1087 assert(rcs->current.cdw <= rcs->current.max_dw);
1088
1089 amdgpu_set_ib_size(rcs, main_ib);
1090 main_ib->ptr_ib_size = new_ptr_ib_size;
1091 main_ib->is_chained_ib = true;
1092
1093 /* Hook up the new chunk */
1094 rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1095 rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1096 rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1097 rcs->num_prev++;
1098
1099 rcs->prev_dw += rcs->current.cdw;
1100 rcs->current.cdw = 0;
1101
1102 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1103 rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1104
1105 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1106 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1107
1108 return true;
1109 }
1110
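/* Add the real backing buffers of all slab entries to the real buffer list, because only
 * real BOs can be passed to the kernel.
 */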
static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1112 {
1113 unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1114 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1115
1116 for (unsigned i = 0; i < num_buffers; i++) {
1117 struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1118 struct amdgpu_cs_buffer *real_buffer =
1119 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1120 &cs->buffer_lists[AMDGPU_BO_REAL], true);
1121
1122 /* We need to set the usage because it determines the BO priority.
1123 *
1124 * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1125 * BO fences to fence dependencies. Only the slab entries should do that.
1126 */
1127 real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1128 }
1129 }
1130
static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                          struct radeon_bo_list_item *list)
1133 {
1134 struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1135
1136 /* We do this in the CS thread, but since we need to return the final usage of all buffers
1137 * here, do it here too. There is no harm in doing it again in the CS thread.
1138 */
1139 amdgpu_add_slab_backing_buffers(cs);
1140
1141 struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1142 unsigned num_real_buffers = real_buffers->num_buffers;
1143
1144 if (list) {
1145 for (unsigned i = 0; i < num_real_buffers; i++) {
1146 list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1147 list[i].vm_address =
1148 amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1149 list[i].priority_usage = real_buffers->buffers[i].usage;
1150 }
1151 }
1152 return num_real_buffers;
1153 }
1154
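/* Append a fence to a growable fence list, taking a reference to it. */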
static void add_fence_to_list(struct amdgpu_fence_list *fences,
                              struct amdgpu_fence *fence)
1157 {
1158 unsigned idx = fences->num++;
1159
1160 if (idx >= fences->max) {
1161 unsigned size;
1162 const unsigned increment = 8;
1163
1164 fences->max = idx + increment;
1165 size = fences->max * sizeof(fences->list[0]);
1166 fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1167 }
1168 amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1169 }
1170
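/* Make the CS wait for a fence before execution. Fences from our own queues are tracked
 * as per-queue sequence numbers; imported fences are added as syncobj dependencies.
 */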
static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
                                           struct pipe_fence_handle *pfence)
1173 {
1174 struct amdgpu_cs *acs = amdgpu_cs(rcs);
1175 struct amdgpu_cs_context *cs = acs->csc;
1176 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1177
1178 util_queue_fence_wait(&fence->submitted);
1179
1180 if (!fence->imported) {
1181 /* Ignore idle fences. This will only check the user fence in memory. */
1182 if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1183 add_seq_no_to_list(acs->ws, &cs->seq_no_dependencies, fence->queue_index,
1184 fence->queue_seq_no);
1185 }
1186 }
1187 else
1188 add_fence_to_list(&cs->syncobj_dependencies, fence);
1189 }
1190
static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
                                              struct amdgpu_cs_context *cs,
                                              unsigned queue_index_bit,
                                              struct amdgpu_seq_no_fences *dependencies,
                                              struct amdgpu_winsys_bo *bo, unsigned usage)
1196 {
1197 if (usage & RADEON_USAGE_SYNCHRONIZED) {
1198 /* Add BO fences from queues other than 'queue_index' to dependencies. */
1199 u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1200 add_seq_no_to_list(ws, dependencies, other_queue_idx,
1201 bo->fences.seq_no[other_queue_idx]);
1202 }
1203
1204 if (bo->alt_fence)
1205 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1206 }
1207 }
1208
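/* Mark the BO as busy on the given queue up to the given sequence number. */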
static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
                                 uint_seq_no new_queue_seq_no)
1211 {
1212 bo->fences.seq_no[queue_index] = new_queue_seq_no;
1213 bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1214 }
1215
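/* Fill a kernel BO list entry from a winsys BO, deriving the priority from the usage flags. */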
static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
                                         struct amdgpu_winsys_bo *bo, unsigned usage)
1218 {
1219 bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1220 bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1221 }
1222
static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
                                         struct pipe_fence_handle *fence)
1225 {
1226 struct amdgpu_cs *acs = amdgpu_cs(rws);
1227 struct amdgpu_cs_context *cs = acs->csc;
1228
1229 add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1230 }
1231
1232 /* The template parameter determines whether the queue should skip code used by the default queue
1233 * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1234 * for all BOs.
1235 */
1236 template<bool QUEUE_USES_ALT_FENCE>
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1238 {
1239 struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1240 struct amdgpu_winsys *ws = acs->ws;
1241 struct amdgpu_cs_context *cs = acs->cst;
1242 int r;
1243 uint64_t seq_no = 0;
1244 bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1245
1246 assert(QUEUE_USES_ALT_FENCE == acs->uses_alt_fence);
1247
1248 simple_mtx_lock(&ws->bo_fence_lock);
1249 unsigned queue_index;
1250 struct amdgpu_queue *queue;
1251 uint_seq_no prev_seq_no, next_seq_no;
1252
1253 if (!QUEUE_USES_ALT_FENCE) {
1254 queue_index = acs->queue_index;
1255 queue = &ws->queues[queue_index];
1256 prev_seq_no = queue->latest_seq_no;
1257
1258 /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1259 * but the values aren't related.
1260 */
1261 next_seq_no = prev_seq_no + 1;
1262
1263 /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1264 * via the ioctl. We have to do this because we are going to release the oldest fence and
1265 * replace it with the latest fence in the ring.
1266 */
1267 struct pipe_fence_handle **oldest_fence =
1268 &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1269
1270 if (*oldest_fence) {
1271 if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1272 /* Take the reference because the fence can be released by other threads after we
1273 * unlock the mutex.
1274 */
1275 struct pipe_fence_handle *tmp_fence = NULL;
1276 amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1277
1278 /* Unlock the mutex before waiting. */
1279 simple_mtx_unlock(&ws->bo_fence_lock);
1280 amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1281 amdgpu_fence_reference(&tmp_fence, NULL);
1282 simple_mtx_lock(&ws->bo_fence_lock);
1283 }
1284
1285 /* Remove the idle fence from the ring. */
1286 amdgpu_fence_reference(oldest_fence, NULL);
1287 }
1288 }
1289
1290 /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1291 * sequence number per queue and removes all older ones.
1292 */
1293 struct amdgpu_seq_no_fences seq_no_dependencies;
1294 memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1295
1296 if (!QUEUE_USES_ALT_FENCE) {
1297 /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1298 * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1299 * context. The reasons are:
1300 * - Our BO fence tracking only supports 1 queue per IP.
1301 * - IBs from different contexts must wait for each other and can't execute in a random order.
1302 */
1303 struct amdgpu_fence *prev_fence =
1304 (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1305
1306 if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
1307 add_seq_no_to_list(ws, &seq_no_dependencies, queue_index, prev_seq_no);
1308 }
1309
1310 /* Since the kernel driver doesn't synchronize execution between different
1311 * rings automatically, we have to add fence dependencies manually. This gathers sequence
1312 * numbers from BOs and sets the next sequence number in the BOs.
1313 */
1314
1315 /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1316 struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1317 unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1318 unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1319 unsigned queue_index_bit = QUEUE_USES_ALT_FENCE ? 0 : BITFIELD_BIT(queue_index);
1320
1321 for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1322 struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1323 struct amdgpu_winsys_bo *bo = buffer->bo;
1324
1325 amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
1326 buffer->usage);
1327 if (QUEUE_USES_ALT_FENCE)
1328 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1329 else
1330 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1331
1332 /* We didn't add any slab entries into the real buffer list that will be submitted
1333 * to the kernel. Do it now.
1334 */
1335 struct amdgpu_cs_buffer *real_buffer =
1336 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1337 &cs->buffer_lists[AMDGPU_BO_REAL], false);
1338
1339 /* We need to set the usage because it determines the BO priority. */
1340 real_buffer->usage |= buffer->usage;
1341 }
1342
1343 /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1344 unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1345 struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1346 unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1347 bool out_of_memory = false;
1348
1349 for (unsigned i = 0; i < num_sparse_buffers; i++) {
1350 struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1351 struct amdgpu_winsys_bo *bo = buffer->bo;
1352
1353 amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
1354 buffer->usage);
1355 if (QUEUE_USES_ALT_FENCE)
1356 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1357 else
1358 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1359
1360 /* Add backing buffers of sparse buffers to the buffer list.
1361 *
1362 * This is done late, during submission, to keep the buffer list short before
1363 * submit, and to avoid managing fences for the backing buffers.
1364 */
1365 struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1366
1367 simple_mtx_lock(&sparse_bo->commit_lock);
1368 list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1369 /* We can directly add the buffer here, because we know that each
1370 * backing buffer occurs only once.
1371 */
1372 struct amdgpu_cs_buffer *real_buffer =
1373 amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
         if (!real_buffer) {
            fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
            r = -ENOMEM;
            out_of_memory = true;
            break;
         }
1380
1381 real_buffer->usage = buffer->usage;
1382 }
1383 simple_mtx_unlock(&sparse_bo->commit_lock);
1384 }
1385
1386 /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1387 unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1388 struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1389 struct drm_amdgpu_bo_list_entry *bo_list =
1390 (struct drm_amdgpu_bo_list_entry *)
1391 alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1392 unsigned i;
1393
1394 for (i = 0; i < initial_num_real_buffers; i++) {
1395 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1396 struct amdgpu_winsys_bo *bo = buffer->bo;
1397
1398 amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
1399 buffer->usage);
1400 if (QUEUE_USES_ALT_FENCE)
1401 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1402 else
1403 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1404
1405 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1406 }
1407
1408 /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1409 for (; i < num_real_buffers_except_sparse; i++) {
1410 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1411 struct amdgpu_winsys_bo *bo = buffer->bo;
1412
1413 if (QUEUE_USES_ALT_FENCE)
1414 get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1415 else
1416 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1417
1418 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1419 }
1420
1421 /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1422 for (; i < num_real_buffers; ++i) {
1423 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1424
1425 amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1426 }
1427
1428 #if 0 /* Debug code. */
1429 printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1430
1431 /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1432 for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1433 if (i == acs->queue_index)
1434 continue;
1435
1436 struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1437 if (!fence) {
1438 if (i <= 1)
1439 printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
1440 continue;
1441 }
1442
1443 bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1444 uint_seq_no old = seq_no_dependencies.seq_no[i];
1445 add_seq_no_to_list(ws, &seq_no_dependencies, i, ws->queues[i].latest_seq_no);
1446 uint_seq_no new = seq_no_dependencies.seq_no[i];
1447
1448 if (!valid)
1449 printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
1450 else if (old != new)
1451 printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1452 else
1453 printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
1454 }
1455 #endif
1456
1457 /* Convert the sequence numbers we gathered to fence dependencies. */
1458 u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1459 struct pipe_fence_handle **fence = get_fence_from_ring(ws, &seq_no_dependencies, i);
1460
1461 if (fence) {
1462 /* If it's idle, don't add it to the list of dependencies. */
1463 if (amdgpu_fence_wait(*fence, 0, false))
1464 amdgpu_fence_reference(fence, NULL);
1465 else
1466 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1467 }
1468 }
1469
1470 if (!QUEUE_USES_ALT_FENCE) {
1471 /* Finally, add the IB fence into the fence ring of the queue. */
1472 amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1473 queue->latest_seq_no = next_seq_no;
1474 ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1475
1476 /* Update the last used context in the queue. */
1477 amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1478 }
1479 simple_mtx_unlock(&ws->bo_fence_lock);
1480
1481 #if DEBUG
1482 /* Prepare the buffer list. */
1483 if (ws->debug_all_bos) {
1484 /* The buffer list contains all buffers. This is a slow path that
1485 * ensures that no buffer is missing in the BO list.
1486 */
1487 simple_mtx_lock(&ws->global_bo_list_lock);
1488 bo_list = (struct drm_amdgpu_bo_list_entry *)
1489 alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1490 num_real_buffers = 0;
1491
1492 list_for_each_entry(struct amdgpu_bo_real, bo, &ws->global_bo_list, global_list_item) {
1493 bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1494 bo_list[num_real_buffers].bo_priority = 0;
1495 ++num_real_buffers;
1496 }
1497 simple_mtx_unlock(&ws->global_bo_list_lock);
1498 }
1499 #endif
1500
1501 if (acs->ip_type == AMD_IP_GFX)
1502 ws->gfx_bo_list_counter += num_real_buffers;
1503
1504 struct drm_amdgpu_cs_chunk chunks[8];
1505 unsigned num_chunks = 0;
1506
1507 /* BO list */
1508 struct drm_amdgpu_bo_list_in bo_list_in;
1509 bo_list_in.operation = ~0;
1510 bo_list_in.list_handle = ~0;
1511 bo_list_in.bo_number = num_real_buffers;
1512 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1513 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list;
1514
1515 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1516 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1517 chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1518 num_chunks++;
1519
1520 /* Syncobj dependencies. */
1521 unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1522 if (num_syncobj_dependencies) {
1523 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1524 (struct drm_amdgpu_cs_chunk_sem *)
1525 alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1526
1527 for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1528 struct amdgpu_fence *fence =
1529 (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1530
1531 assert(util_queue_fence_is_signalled(&fence->submitted));
1532 sem_chunk[i].handle = fence->syncobj;
1533 }
1534
1535 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1536 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1537 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1538 num_chunks++;
1539 }
1540
1541 /* Syncobj signals. */
1542 unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1543 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1544 (struct drm_amdgpu_cs_chunk_sem *)
1545 alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1546
1547 for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1548 struct amdgpu_fence *fence =
1549 (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1550
1551 sem_chunk[i].handle = fence->syncobj;
1552 }
1553 sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1554
1555 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1556 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1557 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1558 num_chunks++;
1559
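/* Firmware-based register shadowing state, set up via amdgpu_cs_set_mcbp_reg_shadowing_va() below. */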
1560 if (ws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1561 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1562 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1563 chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1564 num_chunks++;
1565 }
1566
1567 /* User fence */
1568 if (has_user_fence) {
1569 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1570 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1571 chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1572 num_chunks++;
1573 }
1574
1575 /* Preamble IB (submitted only if it contains packets). */
1576 if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1577 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1578 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1579 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1580 num_chunks++;
1581 }
1582
1583 /* Main IB. */
1584 cs->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
1585 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1586 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1587 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1588 num_chunks++;
1589
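/* Propagate the secure (TMZ) submission state to both IBs. */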
1590 if (cs->secure) {
1591 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1592 cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1593 } else {
1594 cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1595 cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1596 }
1597
1598 bool noop = acs->noop;
1599
1600 if (noop && acs->ip_type == AMD_IP_GFX) {
1601 /* Reduce the IB to the minimum padded size and fill it with a NOP so it behaves like an empty IB. */
1602 unsigned noop_dw_size = ws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
1603 assert(cs->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
1604
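/* Emit a single NOP packet covering the whole IB; the PKT3 count field is the number of payload dwords minus 1. */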
1605 cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
1606 cs->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
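/* The padded IB is submitted normally, so don't treat it as a no-op below. */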
1607 noop = false;
1608 }
1609
1610 assert(num_chunks <= ARRAY_SIZE(chunks));
1611
1612 if (out_of_memory) {
1613 r = -ENOMEM;
1614 } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1615 r = -ECANCELED;
1616 } else if (unlikely(noop)) {
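/* Nothing to submit; report success and let the fence be signalled manually below. */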
1617 r = 0;
1618 } else {
1619 /* Submit the command buffer.
1620 *
1621 * The kernel often returns -ENOMEM when many processes use GDS in parallel (e.g. test
1622 * suites), but the submission eventually succeeds after enough attempts. This happens
1623 * frequently with dEQP using NGG streamout.
1624 */
1625 r = 0;
1626
1627 do {
1628 /* If the previous attempt failed with -ENOMEM, wait 1 ms before retrying. */
1629 if (r == -ENOMEM)
1630 os_time_sleep(1000);
1631
1632 r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, 0, num_chunks, chunks, &seq_no);
1633 } while (r == -ENOMEM);
1634
1635 if (!r) {
1636 /* Success. */
1637 uint64_t *user_fence = NULL;
1638
1639 /* Need to reserve 4 QWORDs for the user fence:
1640 * QWORD[0]: completed fence
1641 * QWORD[1]: preempted fence
1642 * QWORD[2]: reset fence
1643 * QWORD[3]: preempted then reset
1644 */
1645 if (has_user_fence)
1646 user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1647 amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1648 }
1649 }
1650
1651 if (unlikely(r)) {
1652 if (r == -ECANCELED) {
1653 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1654 "amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
1655 } else if (r == -ENODATA) {
1656 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1657 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
1658 } else if (r == -ETIME) {
1659 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1660 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
1661 } else {
1662 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
1663 PIPE_UNKNOWN_CONTEXT_RESET,
1664 "amdgpu: The CS has been rejected, "
1665 "see dmesg for more information (%i).\n",
1666 r);
1667 }
1668 }
1669
1670 /* If there was an error or the submission was skipped (noop), signal the fence,
1671 * because it won't be signalled by the hardware. */
1672 if (r || noop)
1673 amdgpu_fence_signalled(cs->fence);
1674
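/* The shadow-init request only needs to be submitted once, so clear it after a successful submission. */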
1675 if (unlikely(ws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
1676 acs->mcbp_fw_shadow_chunk.flags = 0;
1677
1678 cs->error_code = r;
1679
1680 /* Clear the buffer lists. */
1681 for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
1682 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
1683 unsigned num_buffers = cs->buffer_lists[list].num_buffers;
1684
1685 if (list == AMDGPU_BO_REAL) {
1686 /* Only decrement num_active_ioctls and unref where we incremented them.
1687 * We did both for regular real BOs. We only incremented the refcount for sparse
1688 * backing BOs.
1689 */
1690 /* Regular real BOs. */
1691 for (unsigned i = 0; i < initial_num_real_buffers; i++) {
1692 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1693 amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
1694 }
1695
1696 /* Do nothing for slab BOs. */
1697
1698 /* Sparse backing BOs. */
1699 for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
1700 amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
1701 } else {
1702 for (unsigned i = 0; i < num_buffers; i++) {
1703 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1704 amdgpu_winsys_bo_drop_reference(ws, buffers[i].bo);
1705 }
1706 }
1707
1708 cs->buffer_lists[list].num_buffers = 0;
1709 }
1710
1711 amdgpu_cs_context_cleanup(ws, cs);
1712 }
1713
1714 /* Make sure the previous submission is completed. */
1715 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
1716 {
1717 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1718
1719 /* Wait for any pending ioctl of this CS to complete. */
1720 util_queue_fence_wait(&cs->flush_completed);
1721 }
1722
1723 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
1724 unsigned flags,
1725 struct pipe_fence_handle **fence)
1726 {
1727 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1728 struct amdgpu_winsys *ws = cs->ws;
1729 int error_code = 0;
1730 uint32_t ib_pad_dw_mask = ws->info.ip[cs->ip_type].ib_pad_dw_mask;
1731
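/* Make room for the padding (epilog) dwords emitted below. */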
1732 rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
1733
1734 /* Pad the IB according to the mask. */
1735 switch (cs->ip_type) {
1736 case AMD_IP_SDMA:
1737 if (ws->info.gfx_level <= GFX6) {
1738 while (rcs->current.cdw & ib_pad_dw_mask)
1739 radeon_emit(rcs, 0xf0000000); /* NOP packet */
1740 } else {
1741 while (rcs->current.cdw & ib_pad_dw_mask)
1742 radeon_emit(rcs, SDMA_NOP_PAD);
1743 }
1744 break;
1745 case AMD_IP_GFX:
1746 case AMD_IP_COMPUTE:
1747 amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
1748 if (cs->ip_type == AMD_IP_GFX)
1749 ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
1750 break;
1751 case AMD_IP_UVD:
1752 case AMD_IP_UVD_ENC:
1753 while (rcs->current.cdw & ib_pad_dw_mask)
1754 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
1755 break;
1756 case AMD_IP_VCN_JPEG:
1757 /* The IB must already contain an even number of dwords because padding is emitted as 2-dword NOPs. */
1758 assert(rcs->current.cdw % 2 == 0);
1759 while (rcs->current.cdw & ib_pad_dw_mask) {
1760 radeon_emit(rcs, 0x60000000); /* nop packet */
1761 radeon_emit(rcs, 0x00000000);
1762 }
1763 break;
1764 case AMD_IP_VCN_DEC:
1765 while (rcs->current.cdw & ib_pad_dw_mask)
1766 radeon_emit(rcs, 0x81ff); /* nop packet */
1767 break;
1768 default:
1769 break;
1770 }
1771
1772 if (rcs->current.cdw > rcs->current.max_dw) {
1773 fprintf(stderr, "amdgpu: command stream overflowed\n");
1774 }
1775
1776 /* Submit only if the CS is not empty, has not overflowed, and is not a no-op flush. */
1777 if (likely(radeon_emitted(rcs, 0) &&
1778 rcs->current.cdw <= rcs->current.max_dw &&
1779 !(flags & RADEON_FLUSH_NOOP))) {
1780 struct amdgpu_cs_context *cur = cs->csc;
1781
1782 /* Set IB sizes. */
1783 amdgpu_ib_finalize(ws, rcs, &cs->main_ib, cs->ip_type);
1784
1785 /* Create a fence. */
1786 amdgpu_fence_reference(&cur->fence, NULL);
1787 if (cs->next_fence) {
1788 /* just move the reference */
1789 cur->fence = cs->next_fence;
1790 cs->next_fence = NULL;
1791 } else {
1792 cur->fence = amdgpu_fence_create(cs);
1793 }
1794 if (fence)
1795 amdgpu_fence_reference(fence, cur->fence);
1796
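/* Mark every buffer as having a submission ioctl in flight; the counters are decremented in amdgpu_cs_submit_ib when the buffer lists are cleared. */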
1797 for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
1798 unsigned num_buffers = cur->buffer_lists[i].num_buffers;
1799 struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
1800
1801 for (unsigned j = 0; j < num_buffers; j++)
1802 p_atomic_inc(&buffers[j].bo->num_active_ioctls);
1803 }
1804
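/* Wait until the previous submission using "cst" has completed before reusing it. */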
1805 amdgpu_cs_sync_flush(rcs);
1806
1807 /* Swap command streams. "cst" is going to be submitted. */
1808 rcs->csc = cs->csc = cs->cst;
1809 cs->cst = cur;
1810
1811 /* Submit. */
1812 util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
1813 cs->uses_alt_fence ? amdgpu_cs_submit_ib<true>
1814 : amdgpu_cs_submit_ib<false>, NULL, 0);
1815
1816 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1817 cs->csc->secure = !cs->cst->secure;
1818 else
1819 cs->csc->secure = cs->cst->secure;
1820
1821 if (!(flags & PIPE_FLUSH_ASYNC)) {
1822 amdgpu_cs_sync_flush(rcs);
1823 error_code = cur->error_code;
1824 }
1825 } else {
1826 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1827 cs->csc->secure = !cs->csc->secure;
1828
1829 amdgpu_cs_context_cleanup_buffers(ws, cs->csc);
1830 amdgpu_cs_context_cleanup(ws, cs->csc);
1831 }
1832
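/* Reset the buffer lookup hashlist to -1 (no entries). */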
1833 memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
1834
1835 amdgpu_get_new_ib(ws, rcs, &cs->main_ib, cs);
1836
1837 if (cs->preamble_ib_bo) {
1838 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1839 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1840 }
1841
1842 if (cs->ip_type == AMD_IP_GFX)
1843 ws->num_gfx_IBs++;
1844 else if (cs->ip_type == AMD_IP_SDMA)
1845 ws->num_sdma_IBs++;
1846
1847 return error_code;
1848 }
1849
1850 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
1851 {
1852 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1853
1854 if (!cs)
1855 return;
1856
1857 amdgpu_cs_sync_flush(rcs);
1858 util_queue_fence_destroy(&cs->flush_completed);
1859 p_atomic_dec(&cs->ws->num_cs);
1860 radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
1861 radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main_ib.big_buffer, NULL);
1862 FREE(rcs->prev);
1863 amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
1864 amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
1865 amdgpu_fence_reference(&cs->next_fence, NULL);
1866 FREE(cs);
1867 }
1868
1869 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
1870 struct pb_buffer_lean *_buf,
1871 unsigned usage)
1872 {
1873 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1874 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
1875
1876 return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
1877 }
1878
1879 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
1880 uint64_t csa_va)
1881 {
1882 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1883 cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
1884 cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
1885 cs->mcbp_fw_shadow_chunk.gds_va = 0;
1886 cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
1887 }
1888
1889 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
1890 struct pipe_fence_handle **dst,
1891 struct pipe_fence_handle *src)
1892 {
1893 amdgpu_fence_reference(dst, src);
1894 }
1895
1896 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
1897 {
1898 ws->base.ctx_create = amdgpu_ctx_create;
1899 ws->base.ctx_destroy = amdgpu_ctx_destroy;
1900 ws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
1901 ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
1902 ws->base.cs_create = amdgpu_cs_create;
1903 ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
1904 ws->base.cs_destroy = amdgpu_cs_destroy;
1905 ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
1906 ws->base.cs_validate = amdgpu_cs_validate;
1907 ws->base.cs_check_space = amdgpu_cs_check_space;
1908 ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
1909 ws->base.cs_flush = amdgpu_cs_flush;
1910 ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
1911 ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
1912 ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
1913 ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
1914 ws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
1915 ws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
1916 ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
1917 ws->base.fence_reference = amdgpu_winsys_fence_reference;
1918 ws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
1919 ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
1920 ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
1921 ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
1922
1923 if (ws->aws->info.has_fw_based_shadowing)
1924 ws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
1925 }
1926