1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * Copyright © 2015 Advanced Micro Devices, Inc.
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "amdgpu_winsys.h"
12 #include "util/os_time.h"
13 #include <inttypes.h>
14 #include <stdio.h>
15
16 #include "amd/common/sid.h"
17
18 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
19 * codes in the kernel).
20 */
21 #if DETECT_OS_OPENBSD
22 #define ENODATA ENOTSUP
23 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
24 #define ENODATA ECONNREFUSED
25 #endif
26
27 /* FENCES */
28
29 void amdgpu_fence_destroy(struct amdgpu_fence *fence)
30 {
31 ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);
32
33 if (fence->ctx)
34 amdgpu_ctx_reference(&fence->ctx, NULL);
35
36 util_queue_fence_destroy(&fence->submitted);
37 FREE(fence);
38 }
39
40 static struct pipe_fence_handle *
41 amdgpu_fence_create(struct amdgpu_cs *cs)
42 {
43 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
44 struct amdgpu_ctx *ctx = cs->ctx;
45
46 fence->reference.count = 1;
47 fence->aws = ctx->aws;
48 amdgpu_ctx_reference(&fence->ctx, ctx);
49 fence->ctx = ctx;
50 fence->ip_type = cs->ip_type;
51 if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
52 free(fence);
53 return NULL;
54 }
55
56 util_queue_fence_init(&fence->submitted);
57 util_queue_fence_reset(&fence->submitted);
58 fence->queue_index = cs->queue_index;
59 return (struct pipe_fence_handle *)fence;
60 }
61
62 static struct pipe_fence_handle *
63 amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
64 {
65 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
66 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
67 int r;
68
69 if (!fence)
70 return NULL;
71
72 pipe_reference_init(&fence->reference, 1);
73 fence->aws = aws;
74 fence->ip_type = 0xffffffff;
75
76 r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
77 if (r) {
78 FREE(fence);
79 return NULL;
80 }
81
82 util_queue_fence_init(&fence->submitted);
83 fence->imported = true;
84
85 return (struct pipe_fence_handle*)fence;
86 }
87
88 static struct pipe_fence_handle *
89 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
90 {
91 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
92 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
93
94 if (!fence)
95 return NULL;
96
97 pipe_reference_init(&fence->reference, 1);
98 fence->aws = aws;
99 /* fence->ctx == NULL means that the fence is syncobj-based. */
100
101 /* Convert sync_file into syncobj. */
102 int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
103 if (r) {
104 FREE(fence);
105 return NULL;
106 }
107
108 r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
109 if (r) {
110 ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
111 FREE(fence);
112 return NULL;
113 }
114
115 util_queue_fence_init(&fence->submitted);
116 fence->imported = true;
117
118 return (struct pipe_fence_handle*)fence;
119 }
120
121 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
122 struct pipe_fence_handle *pfence)
123 {
124 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
125 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
126 int fd, r;
127
128 util_queue_fence_wait(&fence->submitted);
129
130 /* Convert syncobj into sync_file. */
131 r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
132 return r ? -1 : fd;
133 }
134
135 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
136 {
137 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
138 uint32_t syncobj;
139 int fd = -1;
140
141 int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
142 &syncobj);
143 if (r) {
144 return -1;
145 }
146
147 r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd);
148 if (r) {
149 fd = -1;
150 }
151
152 ac_drm_cs_destroy_syncobj(aws->fd, syncobj);
153 return fd;
154 }
155
156 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
157 uint64_t seq_no,
158 uint64_t *user_fence_cpu_address)
159 {
160 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
161
162 afence->seq_no = seq_no;
163 afence->user_fence_cpu_address = user_fence_cpu_address;
164 util_queue_fence_signal(&afence->submitted);
165 }
166
167 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
168 {
169 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
170
171 afence->signalled = true;
172 util_queue_fence_signal(&afence->submitted);
173 }
174
175 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
176 bool absolute)
177 {
178 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
179 int64_t abs_timeout;
180 uint64_t *user_fence_cpu;
181
182 if (afence->signalled)
183 return true;
184
185 if (absolute)
186 abs_timeout = timeout;
187 else
188 abs_timeout = os_time_get_absolute_timeout(timeout);
189
190 /* The fence might not have a number assigned if its IB is being
191 * submitted in the other thread right now. Wait until the submission
192 * is done. */
193 if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
194 return false;
195
196 user_fence_cpu = afence->user_fence_cpu_address;
197 if (user_fence_cpu) {
198 if (*user_fence_cpu >= afence->seq_no) {
199 afence->signalled = true;
200 return true;
201 }
202
203 /* No timeout, just query: no need for the ioctl. */
204 if (!absolute && !timeout)
205 return false;
206 }
207
208 if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
209 abs_timeout = INT64_MAX;
210
211 if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1,
212 abs_timeout, 0, NULL))
213 return false;
214
215 /* Check that guest-side syncobj agrees with the user fence. */
216 if (user_fence_cpu && afence->aws->info.is_virtio)
217 assert(afence->seq_no <= *user_fence_cpu);
218
219 afence->signalled = true;
220 return true;
221 }
222
223 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
224 struct pipe_fence_handle *fence,
225 uint64_t timeout)
226 {
227 return amdgpu_fence_wait(fence, timeout, false);
228 }
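
/* Illustrative usage sketch (not part of the winsys API contract): a caller
 * can poll a fence without blocking by passing a zero relative timeout, and
 * only fall back to a blocking wait when needed, e.g.:
 *
 *    if (!amdgpu_fence_wait(fence, 0, false))                  // query only
 *       amdgpu_fence_wait(fence, OS_TIMEOUT_INFINITE, false);  // then block
 *
 * This mirrors how amdgpu_cs_submit_ib() below drains the oldest fence in the
 * per-queue fence ring.
 */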
229
230 static struct pipe_fence_handle *
231 amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
232 {
233 struct amdgpu_cs *cs = amdgpu_cs(rcs);
234 struct pipe_fence_handle *fence = NULL;
235
236 if (cs->noop)
237 return NULL;
238
239 if (cs->next_fence) {
240 amdgpu_fence_reference(&fence, cs->next_fence);
241 return fence;
242 }
243
244 fence = amdgpu_fence_create(cs);
245 if (!fence)
246 return NULL;
247
248 amdgpu_fence_reference(&cs->next_fence, fence);
249 return fence;
250 }
251
252 /* CONTEXTS */
253
254 static uint32_t
255 radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
256 {
257 switch (radeon_priority) {
258 case RADEON_CTX_PRIORITY_REALTIME:
259 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
260 case RADEON_CTX_PRIORITY_HIGH:
261 return AMDGPU_CTX_PRIORITY_HIGH;
262 case RADEON_CTX_PRIORITY_MEDIUM:
263 return AMDGPU_CTX_PRIORITY_NORMAL;
264 case RADEON_CTX_PRIORITY_LOW:
265 return AMDGPU_CTX_PRIORITY_LOW;
266 default:
267 unreachable("Invalid context priority");
268 }
269 }
270
271 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
272 enum radeon_ctx_priority priority,
273 bool allow_context_lost)
274 {
275 struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
276 int r;
277 struct amdgpu_bo_alloc_request alloc_buffer = {};
278 uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
279 ac_drm_device *dev;
280 ac_drm_bo buf_handle;
281
282 if (!ctx)
283 return NULL;
284
285 ctx->aws = amdgpu_winsys(rws);
286 ctx->reference.count = 1;
287 ctx->allow_context_lost = allow_context_lost;
288
289 dev = ctx->aws->dev;
290
291 r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle);
292 if (r) {
293 fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
294 goto error_create;
295 }
296
297 alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
298 alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
299 alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
300
301 r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle);
302 if (r) {
303 fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
304 goto error_user_fence_alloc;
305 }
306
307 ctx->user_fence_cpu_address_base = NULL;
308 r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base);
309 if (r) {
310 fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
311 goto error_user_fence_map;
312 }
313
314 memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
315 ctx->user_fence_bo = buf_handle;
316 ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle);
317
318 return (struct radeon_winsys_ctx*)ctx;
319
320 error_user_fence_map:
321 ac_drm_bo_free(dev, buf_handle);
322
323 error_user_fence_alloc:
324 ac_drm_cs_ctx_free(dev, ctx->ctx_handle);
325 error_create:
326 FREE(ctx);
327 return NULL;
328 }
329
330 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
331 {
332 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
333
334 amdgpu_ctx_reference(&ctx, NULL);
335 }
336
337 static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
338 uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
339 {
340 unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
341 unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
342
343 if (unaligned_dw) {
344 int remaining = pad_dw_mask + 1 - unaligned_dw;
345
346 /* Only pad by 1 dword with the type-2 NOP if necessary. */
347 if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
348 ib[(*num_dw)++] = PKT2_NOP_PAD;
349 } else {
350 /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
351 * packet. The size of the packet body after the header is always count + 1.
352 * If count == -1, there is no packet body. NOP is the only packet that can have
353 * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
354 */
355 ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
356 *num_dw += remaining - 1;
357 }
358 }
359 assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
360 }
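
/* Worked example (illustrative only): assume ib_pad_dw_mask == 7 and
 * gfx_ib_pad_with_type2 == false. With *num_dw == 13 and leave_dw_space == 0,
 * unaligned_dw == 13 & 7 == 5, so remaining == 8 - 5 == 3 dwords are needed.
 * The function emits PKT3(PKT3_NOP, 1, 0) (1 header dword) and then advances
 * *num_dw by the 2 body dwords, bringing the total to 16, which satisfies the
 * alignment assertion. If remaining were 1 on a chip with
 * gfx_ib_pad_with_type2, a single PKT2_NOP_PAD dword would be emitted instead.
 */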
361
362 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
363 {
364 struct amdgpu_bo_alloc_request request = {0};
365 struct drm_amdgpu_bo_list_in bo_list_in;
366 struct drm_amdgpu_cs_chunk_ib ib_in = {0};
367 ac_drm_bo bo;
368 amdgpu_va_handle va_handle = NULL;
369 struct drm_amdgpu_cs_chunk chunks[2];
370 struct drm_amdgpu_bo_list_entry list;
371 unsigned noop_dw_size;
372 void *cpu = NULL;
373 uint64_t seq_no;
374 uint64_t va;
375 int r;
376
377 /* Older amdgpu doesn't report if the reset is complete or not. Detect
378 * it by submitting a no-op job. If it reports an error, then assume
379 * that the reset is not complete.
380 */
381 uint32_t temp_ctx_handle;
382 r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle);
383 if (r)
384 return r;
385
386 request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
387 request.alloc_size = 4096;
388 request.phys_alignment = 4096;
389 r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo);
390 if (r)
391 goto destroy_ctx;
392
393 r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
394 request.alloc_size, request.phys_alignment,
395 0, &va, &va_handle,
396 AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
397 if (r)
398 goto destroy_bo;
399
400 uint32_t kms_handle;
401 ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle);
402
403 r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va,
404 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
405 AMDGPU_VA_OP_MAP);
406 if (r)
407 goto destroy_bo;
408
409 r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu);
410 if (r)
411 goto destroy_bo;
412
413 noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
414 ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
415
416 ac_drm_bo_cpu_unmap(ctx->aws->dev, bo);
417
418 list.bo_handle = kms_handle;
419 ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &list.bo_handle);
420 list.bo_priority = 0;
421
422 bo_list_in.list_handle = ~0;
423 bo_list_in.bo_number = 1;
424 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
425 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
426
427 ib_in.ip_type = AMD_IP_GFX;
428 ib_in.ib_bytes = noop_dw_size * 4;
429 ib_in.va_start = va;
430
431 chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
432 chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
433 chunks[0].chunk_data = (uintptr_t)&bo_list_in;
434
435 chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
436 chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
437 chunks[1].chunk_data = (uintptr_t)&ib_in;
438
439 r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no);
440
441 destroy_bo:
442 if (va_handle)
443 ac_drm_va_range_free(va_handle);
444 ac_drm_bo_free(ctx->aws->dev, bo);
445 destroy_ctx:
446 ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle);
447
448 return r;
449 }
450
451 static void
452 amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
453 const char *format, ...)
454 {
455 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
456
457 /* Don't overwrite the last reset status. */
458 if (ctx->sw_status != PIPE_NO_RESET)
459 return;
460
461 ctx->sw_status = status;
462
463 if (!ctx->allow_context_lost) {
464 va_list args;
465
466 va_start(args, format);
467 vfprintf(stderr, format, args);
468 va_end(args);
469
470 /* Non-robust contexts are allowed to terminate the process. The only alternative is
471 * to skip command submission, which would look like a freeze because nothing is drawn,
472 * which looks like a hang without any reset.
473 */
474 abort();
475 }
476 }
477
478 static enum pipe_reset_status
479 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
480 bool *needs_reset, bool *reset_completed)
481 {
482 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
483
484 if (needs_reset)
485 *needs_reset = false;
486 if (reset_completed)
487 *reset_completed = false;
488
489 /* Return a failure due to a GPU hang. */
490 uint64_t flags;
491
492 if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
493 /* If the caller is only interested in full reset (= wants to ignore soft
494        * recoveries), we can use the SW reset status as a quick first check.
495 */
496 return PIPE_NO_RESET;
497 }
498
499 /*
500 * ctx->sw_status is updated on alloc/ioctl failures.
501 *
502 * We only rely on amdgpu_cs_query_reset_state2 to tell us
503 * that the context reset is complete.
504 */
505 if (ctx->sw_status != PIPE_NO_RESET) {
506 int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags);
507 if (!r) {
508 if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
509 if (reset_completed) {
510 /* The ARB_robustness spec says:
511 *
512 * If a reset status other than NO_ERROR is returned and subsequent
513 * calls return NO_ERROR, the context reset was encountered and
514 * completed. If a reset status is repeatedly returned, the context may
515 * be in the process of resetting.
516 *
517 * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
518 * so don't do anything special. On older kernels, submit a no-op cs. If it
519 * succeeds then assume the reset is complete.
520 */
521 if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
522 *reset_completed = true;
523
524 if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
525 *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
526 }
527 }
528 } else {
529 fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
530 }
531
532 /* Return a failure due to SW issues. */
533 if (needs_reset)
534 *needs_reset = true;
535 return ctx->sw_status;
536 }
537
538 if (needs_reset)
539 *needs_reset = false;
540 return PIPE_NO_RESET;
541 }
542
543 /* COMMAND SUBMISSION */
544
545 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
546 {
547 return acs->ip_type == AMD_IP_GFX ||
548 acs->ip_type == AMD_IP_COMPUTE ||
549 acs->ip_type == AMD_IP_SDMA;
550 }
551
552 static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
553 {
554 if (cs->has_chaining)
555 return 4; /* for chaining */
556
557 return 0;
558 }
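
/* The 4 epilog dwords reserved above correspond to the chaining packet that
 * amdgpu_cs_check_space() appends when it jumps to a new IB buffer (sketch of
 * what is emitted there):
 *
 *    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
 *    radeon_emit(rcs, va);        // low 32 bits of the next IB address
 *    radeon_emit(rcs, va >> 32);  // high 32 bits
 *    // + 1 size dword that is patched later through main_ib->ptr_ib_size
 */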
559
560 static struct amdgpu_cs_buffer *
561 amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
562 struct amdgpu_buffer_list *list)
563 {
564 int num_buffers = list->num_buffers;
565 struct amdgpu_cs_buffer *buffers = list->buffers;
566 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
567 int i = cs->buffer_indices_hashlist[hash];
568
569    /* The hash entry either says "not present" (i < 0) or points at a candidate. */
570 if (i < 0)
571 return NULL;
572
573 if (i < num_buffers && buffers[i].bo == bo)
574 return &buffers[i];
575
576 /* Hash collision, look for the BO in the list of buffers linearly. */
577 for (int i = num_buffers - 1; i >= 0; i--) {
578 if (buffers[i].bo == bo) {
579 /* Put this buffer in the hash list.
580 * This will prevent additional hash collisions if there are
581 * several consecutive lookup_buffer calls for the same buffer.
582 *
583 * Example: Assuming buffers A,B,C collide in the hash list,
584 * the following sequence of buffers:
585 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
586 * will collide here: ^ and here: ^,
587 * meaning that we should get very few collisions in the end. */
588 cs->buffer_indices_hashlist[hash] = i & 0x7fff;
589 return &buffers[i];
590 }
591 }
592 return NULL;
593 }
594
595 struct amdgpu_cs_buffer *
596 amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
597 {
598 return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
599 }
600
601 static struct amdgpu_cs_buffer *
602 amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
603 struct amdgpu_buffer_list *list, bool add_ref)
604 {
605 /* New buffer, check if the backing array is large enough. */
606 if (unlikely(list->num_buffers >= list->max_buffers)) {
607 unsigned new_max =
608 MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
609 struct amdgpu_cs_buffer *new_buffers;
610
611 new_buffers = (struct amdgpu_cs_buffer *)
612 REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
613 new_max * sizeof(*new_buffers));
614 if (!new_buffers) {
615 fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
616 return NULL;
617 }
618
619 list->max_buffers = new_max;
620 list->buffers = new_buffers;
621 }
622
623 unsigned idx = list->num_buffers++;
624 struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
625 if (add_ref)
626 p_atomic_inc(&bo->base.reference.count);
627 buffer->bo = bo;
628 buffer->usage = 0;
629
630 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
631 cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
632 return buffer;
633 }
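
/* Growth example (illustrative): with max_buffers == 100, the new capacity is
 * MAX2(100 + 16, (unsigned)(100 * 1.3)) == 130, i.e. large lists grow by ~30%
 * per reallocation while small lists grow by at least 16 entries.
 */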
634
635 static struct amdgpu_cs_buffer *
636 amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
637 struct amdgpu_buffer_list *list, bool add_ref)
638 {
639 struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
640
641 return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
642 }
643
644 static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
645 struct pb_buffer_lean *buf,
646 unsigned usage,
647 enum radeon_bo_domain domains)
648 {
649 /* Don't use the "domains" parameter. Amdgpu doesn't support changing
650 * the buffer placement during command submission.
651 */
652 struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
653 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
654 struct amdgpu_cs_buffer *buffer;
655
656 /* Fast exit for no-op calls.
657 * This is very effective with suballocators and linear uploaders that
658 * are outside of the winsys.
659 */
660 if (bo == cs->last_added_bo &&
661 (usage & cs->last_added_bo_usage) == usage)
662 return 0;
663
664 buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
665 if (!buffer)
666 return 0;
667
668 buffer->usage |= usage;
669
670 cs->last_added_bo_usage = buffer->usage;
671 cs->last_added_bo = bo;
672 return 0;
673 }
674
675 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
676 struct amdgpu_ib *main_ib,
677 struct amdgpu_cs *cs)
678 {
679 struct pb_buffer_lean *pb;
680 uint8_t *mapped;
681 unsigned buffer_size;
682
683 /* Always create a buffer that is at least as large as the maximum seen IB size,
684 * aligned to a power of two.
685 */
686 buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
687
688    /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
689 if (!cs->has_chaining)
690 buffer_size *= 4;
691
692 const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
693 /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
694 const unsigned max_size = 2 * 1024 * 1024;
695
696 buffer_size = MIN2(buffer_size, max_size);
697 buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
698
699 /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
700 * The speed of writing to GTT WC is somewhere between no difference and very slow, while
701  * writing to VRAM is very slow a lot more often.
702 *
703 * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
704 * and doesn't have to wait for cached GL2 requests to be processed.
705 */
706 enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
707 unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
708 RADEON_FLAG_GL2_BYPASS;
709
710 if (cs->ip_type == AMD_IP_GFX ||
711 cs->ip_type == AMD_IP_COMPUTE ||
712 cs->ip_type == AMD_IP_SDMA) {
713 /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
714 * on Navi 14
715 */
716 flags |= RADEON_FLAG_32BIT;
717 }
718
719 pb = amdgpu_bo_create(aws, buffer_size,
720 aws->info.gart_page_size,
721 domain, (radeon_bo_flag)flags);
722 if (!pb)
723 return false;
724
725 mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
726 if (!mapped) {
727 radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
728 return false;
729 }
730
731 radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
732 radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
733
734 main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
735 main_ib->big_buffer_cpu_ptr = mapped;
736 main_ib->used_ib_space = 0;
737
738 return true;
739 }
740
741 static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
742 struct radeon_cmdbuf *rcs,
743 struct amdgpu_ib *main_ib,
744 struct amdgpu_cs *cs)
745 {
746 struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
747 /* This is the minimum size of a contiguous IB. */
748 unsigned ib_size = 16 * 1024;
749
750 /* Always allocate at least the size of the biggest cs_check_space call,
751 * because precisely the last call might have requested this size.
752 */
753 ib_size = MAX2(ib_size, main_ib->max_check_space_size);
754
755 if (!cs->has_chaining) {
756 ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
757 IB_MAX_SUBMIT_BYTES));
758 }
759
760 /* Decay the IB buffer size over time, so that memory usage decreases after
761 * a temporary peak.
762 */
763 main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
764
765 rcs->prev_dw = 0;
766 rcs->num_prev = 0;
767 rcs->current.cdw = 0;
768 rcs->current.buf = NULL;
769
770 /* Allocate a new buffer for IBs if the current buffer is all used. */
771 if (!main_ib->big_buffer ||
772 main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
773 if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
774 return false;
775 }
776
777 chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
778 chunk_ib->ib_bytes = 0;
779 /* ib_bytes is in dwords and the conversion to bytes will be done before
780 * the CS ioctl. */
781 main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
782 main_ib->is_chained_ib = false;
783
784 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
785 (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
786 (radeon_bo_domain)0);
787
788 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
789
790 cs->csc->ib_main_addr = rcs->current.buf;
791
792 ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
793 rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
794 return true;
795 }
796
797 static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
798 {
799 if (ib->is_chained_ib) {
800 *ib->ptr_ib_size = rcs->current.cdw |
801 S_3F2_CHAIN(1) | S_3F2_VALID(1) |
802 S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
803 } else {
804 *ib->ptr_ib_size = rcs->current.cdw;
805 }
806 }
807
808 static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
809 struct amdgpu_ib *ib, enum amd_ip_type ip_type)
810 {
811 amdgpu_set_ib_size(rcs, ib);
812 ib->used_ib_space += rcs->current.cdw * 4;
813 ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
814 ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
815 }
816
817 static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
818 struct amdgpu_cs_context *cs,
819 enum amd_ip_type ip_type)
820 {
821 for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
822 cs->chunk_ib[i].ip_type = ip_type;
823 cs->chunk_ib[i].flags = 0;
824
825 if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
826 /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
827 * is the beginning of IBs because completion of an IB doesn't care about the state of
828 * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
829 * executed in parallel, so draw calls from the current IB can finish after the next IB
830 * starts drawing, and so the cache flush at the end of IBs is usually late and thus
831 * useless.
832 */
833 cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
834 }
835 }
836
837 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
838 cs->last_added_bo = NULL;
839 return true;
840 }
841
842 static void cleanup_fence_list(struct amdgpu_fence_list *fences)
843 {
844 for (unsigned i = 0; i < fences->num; i++)
845 amdgpu_fence_drop_reference(fences->list[i]);
846 fences->num = 0;
847 }
848
849 static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
850 {
851 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
852 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
853 unsigned num_buffers = cs->buffer_lists[i].num_buffers;
854
855 for (unsigned j = 0; j < num_buffers; j++)
856 amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
857
858 cs->buffer_lists[i].num_buffers = 0;
859 }
860 }
861
862 static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
863 {
864 cs->seq_no_dependencies.valid_fence_mask = 0;
865 cleanup_fence_list(&cs->syncobj_dependencies);
866 cleanup_fence_list(&cs->syncobj_to_signal);
867 amdgpu_fence_reference(&cs->fence, NULL);
868 cs->last_added_bo = NULL;
869 }
870
871 static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
872 {
873 amdgpu_cs_context_cleanup_buffers(aws, cs);
874 amdgpu_cs_context_cleanup(aws, cs);
875 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
876 FREE(cs->buffer_lists[i].buffers);
877 FREE(cs->syncobj_dependencies.list);
878 FREE(cs->syncobj_to_signal.list);
879 }
880
881
882 static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
883 {
884 struct amdgpu_cs *cs = amdgpu_cs(rcs);
885 return cs->ip_type;
886 }
887
888 static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
889 {
890 /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
891 return ip_type == AMD_IP_VCN_DEC ||
892 ip_type == AMD_IP_VCN_ENC ||
893 ip_type == AMD_IP_VCN_JPEG;
894 }
895
896 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
897 {
898 struct amdgpu_cs *cs = amdgpu_cs(rcs);
899
900 if (!cs)
901 return;
902
903 amdgpu_cs_sync_flush(rcs);
904 util_queue_fence_destroy(&cs->flush_completed);
905 p_atomic_dec(&cs->aws->num_cs);
906 radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
907 radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
908 FREE(rcs->prev);
909 amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
910 amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
911 amdgpu_fence_reference(&cs->next_fence, NULL);
912 FREE(cs);
913 }
914
915 static bool
916 amdgpu_cs_create(struct radeon_cmdbuf *rcs,
917 struct radeon_winsys_ctx *rwctx,
918 enum amd_ip_type ip_type,
919 void (*flush)(void *ctx, unsigned flags,
920 struct pipe_fence_handle **fence),
921 void *flush_ctx)
922 {
923 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
924 struct amdgpu_cs *cs;
925
926 cs = CALLOC_STRUCT(amdgpu_cs);
927 if (!cs) {
928 return false;
929 }
930
931 util_queue_fence_init(&cs->flush_completed);
932
933 cs->aws = ctx->aws;
934 cs->ctx = ctx;
935 cs->flush_cs = flush;
936 cs->flush_data = flush_ctx;
937 cs->ip_type = ip_type;
938 cs->noop = ctx->aws->noop_cs;
939 cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
940 (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
941
942 /* Compute the queue index by counting the IPs that have queues. */
943 assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
944 assert(ctx->aws->info.ip[ip_type].num_queues);
945
946 if (ip_uses_alt_fence(ip_type)) {
947 cs->queue_index = INT_MAX;
948 cs->uses_alt_fence = true;
949 } else {
950 cs->queue_index = 0;
951
952 for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
953 if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
954 continue;
955
956 if (i == ip_type)
957 break;
958
959 cs->queue_index++;
960 }
961 assert(cs->queue_index < AMDGPU_MAX_QUEUES);
962 }
963
964 ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4,
965 (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
966
967 if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
968 FREE(cs);
969 return false;
970 }
971
972 if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
973 amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
974 FREE(cs);
975 return false;
976 }
977
978 memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
979
980 /* Set the first submission context as current. */
981 rcs->csc = cs->csc = &cs->csc1;
982 cs->cst = &cs->csc2;
983
984 /* Assign to both amdgpu_cs_context; only csc will use it. */
985 cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
986 cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
987
988 cs->csc1.aws = ctx->aws;
989 cs->csc2.aws = ctx->aws;
990
991 p_atomic_inc(&ctx->aws->num_cs);
992
993 if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs))
994 goto fail;
995
996    /* Currently only the gfx, compute and sdma queues support user queues. */
997 if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) {
998 if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type))
999 goto fail;
1000 }
1001
1002 rcs->priv = cs;
1003 return true;
1004 fail:
1005 amdgpu_cs_destroy(rcs);
1006 return false;
1007 }
1008
1009 static bool
1010 amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
1011 unsigned preamble_num_dw)
1012 {
1013 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1014 struct amdgpu_winsys *aws = cs->aws;
1015 struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
1016 unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
1017 struct pb_buffer_lean *preamble_bo;
1018 uint32_t *map;
1019
1020 /* Create the preamble IB buffer. */
1021 preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
1022 RADEON_DOMAIN_VRAM,
1023 (radeon_bo_flag)
1024 (RADEON_FLAG_NO_INTERPROCESS_SHARING |
1025 RADEON_FLAG_GTT_WC));
1026 if (!preamble_bo)
1027 return false;
1028
1029 map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
1030 (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
1031 if (!map) {
1032 radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
1033 return false;
1034 }
1035
1036 /* Upload the preamble IB. */
1037 memcpy(map, preamble_ib, preamble_num_dw * 4);
1038
1039 /* Pad the IB. */
1040 amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
1041 amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
1042
1043 for (unsigned i = 0; i < 2; i++) {
1044 csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1045 csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1046
1047 csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1048 }
1049
1050 assert(!cs->preamble_ib_bo);
1051 cs->preamble_ib_bo = preamble_bo;
1052
1053 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1054 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1055 return true;
1056 }
1057
1058 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1059 {
1060 return true;
1061 }
1062
1063 static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1064 {
1065 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1066 struct amdgpu_ib *main_ib = &cs->main_ib;
1067
1068 assert(rcs->current.cdw <= rcs->current.max_dw);
1069
1070 unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1071
1072 if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1073 return false;
1074
1075 if (rcs->current.max_dw - rcs->current.cdw >= dw)
1076 return true;
1077
1078 unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1079 unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1080 /* 125% of the size for IB epilog. */
1081 unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1082 main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1083 main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1084
1085 if (!cs->has_chaining)
1086 return false;
1087
1088 /* Allocate a new chunk */
1089 if (rcs->num_prev >= rcs->max_prev) {
1090 unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1091 struct radeon_cmdbuf_chunk *new_prev;
1092
1093 new_prev = (struct radeon_cmdbuf_chunk*)
1094 REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1095 sizeof(*new_prev) * new_max_prev);
1096 if (!new_prev)
1097 return false;
1098
1099 rcs->prev = new_prev;
1100 rcs->max_prev = new_max_prev;
1101 }
1102
1103 if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
1104 return false;
1105
1106 assert(main_ib->used_ib_space == 0);
1107 uint64_t va = main_ib->gpu_address;
1108
1109 /* This space was originally reserved. */
1110 rcs->current.max_dw += cs_epilog_dw;
1111
1112 /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1113 amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1114
1115 radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1116 radeon_emit(rcs, va);
1117 radeon_emit(rcs, va >> 32);
1118 uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1119
1120 assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1121 assert(rcs->current.cdw <= rcs->current.max_dw);
1122
1123 amdgpu_set_ib_size(rcs, main_ib);
1124 main_ib->ptr_ib_size = new_ptr_ib_size;
1125 main_ib->is_chained_ib = true;
1126
1127 /* Hook up the new chunk */
1128 rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1129 rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1130 rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1131 rcs->num_prev++;
1132
1133 rcs->prev_dw += rcs->current.cdw;
1134 rcs->current.cdw = 0;
1135
1136 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1137 rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1138
1139 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1140 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1141
1142 return true;
1143 }
1144
1145 static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1146 {
1147 unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1148 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1149
1150 for (unsigned i = 0; i < num_buffers; i++) {
1151 struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1152 struct amdgpu_cs_buffer *real_buffer =
1153 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1154 &cs->buffer_lists[AMDGPU_BO_REAL], true);
1155
1156 /* We need to set the usage because it determines the BO priority.
1157 *
1158 * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1159 * BO fences to fence dependencies. Only the slab entries should do that.
1160 */
1161 real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1162 }
1163 }
1164
1165 static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1166 struct radeon_bo_list_item *list)
1167 {
1168 struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1169
1170 /* We do this in the CS thread, but since we need to return the final usage of all buffers
1171 * here, do it here too. There is no harm in doing it again in the CS thread.
1172 */
1173 amdgpu_add_slab_backing_buffers(cs);
1174
1175 struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1176 unsigned num_real_buffers = real_buffers->num_buffers;
1177
1178 #if HAVE_AMDGPU_VIRTIO
1179    assert(!cs->aws->info.is_virtio);
1180 #endif
1181
1182 if (list) {
1183 for (unsigned i = 0; i < num_real_buffers; i++) {
1184 list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1185 list[i].vm_address =
1186 amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1187 list[i].priority_usage = real_buffers->buffers[i].usage;
1188 }
1189 }
1190 return num_real_buffers;
1191 }
1192
1193 static void add_fence_to_list(struct amdgpu_fence_list *fences,
1194 struct amdgpu_fence *fence)
1195 {
1196 unsigned idx = fences->num++;
1197
1198 if (idx >= fences->max) {
1199 unsigned size;
1200 const unsigned increment = 8;
1201
1202 fences->max = idx + increment;
1203 size = fences->max * sizeof(fences->list[0]);
1204 fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1205 }
1206 amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1207 }
1208
1209 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
1210 struct pipe_fence_handle *pfence)
1211 {
1212 struct amdgpu_cs *acs = amdgpu_cs(rcs);
1213 struct amdgpu_cs_context *cs = acs->csc;
1214 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1215
1216 util_queue_fence_wait(&fence->submitted);
1217
1218 if (!fence->imported) {
1219 /* Ignore idle fences. This will only check the user fence in memory. */
1220 if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1221 add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
1222 fence->queue_seq_no);
1223 }
1224 }
1225 else
1226 add_fence_to_list(&cs->syncobj_dependencies, fence);
1227 }
1228
1229 static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
1230 struct amdgpu_cs_context *cs,
1231 unsigned queue_index_bit,
1232 struct amdgpu_seq_no_fences *dependencies,
1233 struct amdgpu_winsys_bo *bo, unsigned usage)
1234 {
1235 if (usage & RADEON_USAGE_SYNCHRONIZED) {
1236 /* Add BO fences from queues other than 'queue_index' to dependencies. */
1237 u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1238 add_seq_no_to_list(ws, dependencies, other_queue_idx,
1239 bo->fences.seq_no[other_queue_idx]);
1240 }
1241
1242 if (bo->alt_fence)
1243 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1244 }
1245 }
1246
1247 static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
1248 uint_seq_no new_queue_seq_no)
1249 {
1250 bo->fences.seq_no[queue_index] = new_queue_seq_no;
1251 bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1252 }
1253
1254 static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
1255 struct amdgpu_winsys_bo *bo, unsigned usage)
1256 {
1257 bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1258 bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1259 }
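
/* Priority mapping example (illustrative): util_last_bit() returns the 1-based
 * position of the highest set bit, so if the highest priority bit set in
 * "usage" is bit 8 (1-based position 9), bo_priority == (9 - 1) / 2 == 4.
 * Every two adjacent winsys priority bits therefore collapse into one kernel
 * BO-list priority level.
 */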
1260
1261 static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1262 struct pipe_fence_handle *fence)
1263 {
1264 struct amdgpu_cs *acs = amdgpu_cs(rws);
1265 struct amdgpu_cs_context *cs = acs->csc;
1266
1267 add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1268 }
1269
1270 static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
1271 unsigned num_real_buffers,
1272 struct drm_amdgpu_bo_list_entry *bo_list_real,
1273 uint64_t *seq_no)
1274 {
1275 struct amdgpu_winsys *aws = acs->aws;
1276 struct amdgpu_cs_context *cs = acs->cst;
1277 struct drm_amdgpu_bo_list_in bo_list_in;
1278 struct drm_amdgpu_cs_chunk chunks[8];
1279 unsigned num_chunks = 0;
1280
1281 /* BO list */
1282 bo_list_in.operation = ~0;
1283 bo_list_in.list_handle = ~0;
1284 bo_list_in.bo_number = num_real_buffers;
1285 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1286 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real;
1287
1288 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1289 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1290 chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1291 num_chunks++;
1292
1293 /* Syncobj dependencies. */
1294 unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1295 if (num_syncobj_dependencies) {
1296 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1297 (struct drm_amdgpu_cs_chunk_sem *)
1298 alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1299
1300 for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1301 struct amdgpu_fence *fence =
1302 (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1303
1304 assert(util_queue_fence_is_signalled(&fence->submitted));
1305 sem_chunk[i].handle = fence->syncobj;
1306 }
1307
1308 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1309 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1310 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1311 num_chunks++;
1312 }
1313
1314 /* Syncobj signals. */
1315 unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1316 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1317 (struct drm_amdgpu_cs_chunk_sem *)
1318 alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1319
1320 for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1321 struct amdgpu_fence *fence =
1322 (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1323
1324 sem_chunk[i].handle = fence->syncobj;
1325 }
1326 sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1327
1328 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1329 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1330 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1331 num_chunks++;
1332
1333 if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1334 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1335 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1336 chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1337 num_chunks++;
1338 }
1339
1340 /* Fence */
1341 if (amdgpu_cs_has_user_fence(acs)) {
1342 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1343 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1344 chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1345 num_chunks++;
1346 }
1347
1348 /* IB */
1349 if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1350 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1351 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1352 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1353 num_chunks++;
1354 }
1355
1356 /* IB */
1357 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1358 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1359 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1360 num_chunks++;
1361
1362 if (cs->secure) {
1363 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1364 cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1365 } else {
1366 cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1367 cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1368 }
1369
1370 assert(num_chunks <= 8);
1371
1372 /* Submit the command buffer.
1373 *
1374 * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
1375 * quite often, but it eventually succeeds after enough attempts. This happens frequently
1376 * with dEQP using NGG streamout.
1377 */
1378 int r = 0;
1379
1380 do {
1381 /* Wait 1 ms and try again. */
1382 if (r == -ENOMEM)
1383 os_time_sleep(1000);
1384
1385 r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no);
1386 } while (r == -ENOMEM);
1387
1388 return r;
1389 }
1390
1391 static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq,
1392 struct amdgpu_cs_context *cs,
1393 uint64_t num_fences,
1394 struct drm_amdgpu_userq_fence_info *fence_info)
1395 {
1396 amdgpu_pkt_begin();
1397
1398 if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
1399 if (num_fences) {
1400 unsigned num_fences_in_iter;
1401          /* The FENCE_WAIT_MULTI packet supports at most 32 fences. */
1402 for (unsigned i = 0; i < num_fences; i = i + 32) {
1403 num_fences_in_iter = (i + 32 > num_fences) ? num_fences - i : 32;
1404 amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0));
1405 amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1));
1406 for (unsigned j = 0; j < num_fences_in_iter; j++) {
1407 amdgpu_pkt_add_dw(fence_info[i + j].va);
1408 amdgpu_pkt_add_dw(fence_info[i + j].va >> 32);
1409 amdgpu_pkt_add_dw(fence_info[i + j].value);
1410 amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
1411 }
1412 }
1413 }
1414
1415 amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
1416 amdgpu_pkt_add_dw(0x0);
1417
1418 amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1419 amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start);
1420 amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32);
1421 if (userq->ip_type == AMD_IP_GFX)
1422 amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1));
1423 else
1424 amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) |
1425 S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
1426
1427 /* Add 8 for release mem packet and 2 for protected fence signal packet.
1428        * userq_fence_seq_num is calculated this way to match the kernel fence that is
1429        * returned by the userq_wait ioctl.
1430 */
1431 userq->user_fence_seq_num = *userq->wptr_bo_map + __num_dw_written + 8 + 2;
1432
1433 /* add release mem for user fence */
1434 amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0));
1435 amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) |
1436 S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) |
1437 S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3));
1438 amdgpu_pkt_add_dw(S_030358_DATA_SEL(2));
1439 amdgpu_pkt_add_dw(userq->user_fence_va);
1440 amdgpu_pkt_add_dw(userq->user_fence_va >> 32);
1441 amdgpu_pkt_add_dw(userq->user_fence_seq_num);
1442 amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
1443 amdgpu_pkt_add_dw(0);
1444
1445       /* Protected signal packet. This is a trusted RELEASE_MEM packet, i.e. the fence
1446        * buffer is only accessible from the kernel through VMID 0.
1447 */
1448 amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
1449 amdgpu_pkt_add_dw(0);
1450 } else {
1451 fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
1452 }
1453
1454 amdgpu_pkt_end();
1455 }
1456
1457 static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
1458 struct amdgpu_cs *acs,
1459 uint32_t *shared_buf_kms_handles_write,
1460 unsigned num_shared_buf_write,
1461 uint32_t *shared_buf_kms_handles_read,
1462 unsigned num_shared_buf_read,
1463 uint64_t *seq_no,
1464 uint64_t vm_timeline_point)
1465 {
1466 int r = 0;
1467 struct amdgpu_winsys *aws = acs->aws;
1468 struct amdgpu_cs_context *cs = acs->cst;
1469
1470 /* Syncobj dependencies. */
1471 unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1472 uint32_t *syncobj_dependencies_list =
1473 (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
1474
1475 /* Currently only 1 vm timeline syncobj can be a dependency. */
1476 uint16_t num_syncobj_timeline_dependencies = 1;
1477 uint32_t syncobj_timeline_dependency;
1478 uint64_t syncobj_timeline_dependency_point;
1479
1480 if (num_syncobj_dependencies) {
1481 for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1482 struct amdgpu_fence *fence =
1483 (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1484
1485 assert(util_queue_fence_is_signalled(&fence->submitted));
1486 syncobj_dependencies_list[i] = fence->syncobj;
1487 }
1488 }
1489 syncobj_timeline_dependency = aws->vm_timeline_syncobj;
1490 syncobj_timeline_dependency_point = vm_timeline_point;
1491
1492 /* Syncobj signals. Adding 1 for cs submission fence. */
1493 unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1;
1494 uint32_t *syncobj_signal_list =
1495 (uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));
1496
1497 for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) {
1498 struct amdgpu_fence *fence =
1499 (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1500
1501 syncobj_signal_list[i] = fence->syncobj;
1502 }
1503 syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj;
1504
1505 struct drm_amdgpu_userq_fence_info *fence_info;
1506 struct drm_amdgpu_userq_wait userq_wait_data = {
1507 .syncobj_handles = (uintptr_t)syncobj_dependencies_list,
1508 .syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
1509 .syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
1510 .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1511 .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1512 .num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
1513 .num_fences = 0,
1514 .num_syncobj_handles = num_syncobj_dependencies,
1515 .num_bo_read_handles = num_shared_buf_read,
1516 .num_bo_write_handles = num_shared_buf_write,
1517 .out_fences = (uintptr_t)NULL,
1518 };
1519
1520 /*
1521     * Synchronization of shared buffers follows these rules:
1522     * - read-only buffers wait for all previous writes to complete
1523     * - write-only (and read-write) buffers wait for all previous reads to complete
1524 * To implement this strategy, we use amdgpu_userq_wait() before submitting
1525 * a job, and amdgpu_userq_signal() after to indicate completion.
1526 */
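   /* ac_drm_userq_wait() is called twice below: the first call, with
    * out_fences == NULL, only fills in userq_wait_data.num_fences; the second
    * call fills the allocated fence_info array that the wait packets reference.
    */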
1527 r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1528 if (r)
1529 fprintf(stderr, "amdgpu: getting wait num_fences failed\n");
1530
1531 fence_info = (struct drm_amdgpu_userq_fence_info*)
1532 alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
1533 userq_wait_data.out_fences = (uintptr_t)fence_info;
1534
1535 r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
1536 if (r)
1537 fprintf(stderr, "amdgpu: getting wait fences failed\n");
1538
1539 simple_mtx_lock(&userq->lock);
1540 amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info);
1541 struct drm_amdgpu_userq_signal userq_signal_data = {
1542 .queue_id = userq->userq_handle,
1543 .syncobj_handles = (uintptr_t)syncobj_signal_list,
1544 .num_syncobj_handles = num_syncobj_to_signal,
1545 .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
1546 .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
1547 .num_bo_read_handles = num_shared_buf_read,
1548 .num_bo_write_handles = num_shared_buf_write,
1549 };
1550
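/* Signal the requested syncobjs and the shared-BO fences for this submit. On success, write
* the new wptr to the doorbell so the firmware picks up the packets added to the user queue.
*/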
1551 r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
1552 if (!r)
1553 userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = *userq->wptr_bo_map;
1554
1555 *seq_no = userq->user_fence_seq_num;
1556 simple_mtx_unlock(&userq->lock);
1557
1558 return r;
1559 }
1560
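/* Queue types: KERNELQ submits through the kernel CS ioctl and tracks BO fences with per-queue
* sequence numbers, KERNELQ_ALT_FENCE submits the same way but tracks amdgpu_winsys_bo::alt_fence
* instead, and USERQ submits through a user-mode queue and only calls into the kernel for
* synchronization (amdgpu_userq_wait/amdgpu_userq_signal).
*/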
1561 enum queue_type {
1562 KERNELQ,
1563 KERNELQ_ALT_FENCE,
1564 USERQ,
1565 };
1566
1567 /* The template parameter determines whether the queue should skip code used by the default queue
1568 * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1569 * for all BOs.
1570 */
1571 template<enum queue_type queue_type>
1572 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1573 {
1574 struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1575 struct amdgpu_winsys *aws = acs->aws;
1576 struct amdgpu_cs_context *cs = acs->cst;
1577 int r;
1578 uint64_t seq_no = 0;
1579 bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1580 /* The maximum timeline point of VM updates for all BOs used in this submit. */
1581 uint64_t vm_timeline_point = 0;
1582
1583 simple_mtx_lock(&aws->bo_fence_lock);
1584 unsigned queue_index;
1585 struct amdgpu_queue *queue;
1586 uint_seq_no prev_seq_no, next_seq_no;
1587
1588 if (queue_type != KERNELQ_ALT_FENCE) {
1589 queue_index = acs->queue_index;
1590 queue = &aws->queues[queue_index];
1591 prev_seq_no = queue->latest_seq_no;
1592
1593 /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1594 * but the values aren't related.
1595 */
1596 next_seq_no = prev_seq_no + 1;
1597
1598 /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1599 * via the ioctl. We have to do this because we are going to release the oldest fence and
1600 * replace it with the latest fence in the ring.
1601 */
1602 struct pipe_fence_handle **oldest_fence =
1603 &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1604
1605 if (*oldest_fence) {
1606 if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1607 /* Take the reference because the fence can be released by other threads after we
1608 * unlock the mutex.
1609 */
1610 struct pipe_fence_handle *tmp_fence = NULL;
1611 amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1612
1613 /* Unlock the mutex before waiting. */
1614 simple_mtx_unlock(&aws->bo_fence_lock);
1615 amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1616 amdgpu_fence_reference(&tmp_fence, NULL);
1617 simple_mtx_lock(&aws->bo_fence_lock);
1618 }
1619
1620 /* Remove the idle fence from the ring. */
1621 amdgpu_fence_reference(oldest_fence, NULL);
1622 }
1623 }
1624
1625 /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1626 * sequence number per queue and removes all older ones.
1627 */
1628 struct amdgpu_seq_no_fences seq_no_dependencies;
1629 memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1630
1631 if (queue_type != KERNELQ_ALT_FENCE) {
1632 /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1633 * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1634 * context. The reasons are:
1635 * - Our BO fence tracking only supports 1 queue per IP.
1636 * - IBs from different contexts must wait for each other and can't execute in a random order.
1637 */
1638 struct amdgpu_fence *prev_fence =
1639 (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1640
1641 /* Add a dependency on a previous fence, unless we can determine that
1642 * it's useless because the execution order is guaranteed.
1643 */
1644 if (prev_fence) {
1645 bool same_ctx = queue->last_ctx == acs->ctx;
1646 /* userqueue submission mode uses a single queue per process. */
1647 bool same_queue = aws->info.ip[acs->ip_type].num_queues > 1 &&
1648 queue_type != USERQ;
1649 if (!same_ctx || !same_queue)
1650 add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
1651 }
1652 }
1653
1654 /* Since the kernel driver doesn't synchronize execution between different
1655 * rings automatically, we have to add fence dependencies manually. This gathers sequence
1656 * numbers from BOs and sets the next sequence number in the BOs.
1657 */
1658
1659 /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1660 struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1661 unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1662 unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1663 unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ?
1664 0 : BITFIELD_BIT(queue_index);
1665
1666 for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1667 struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1668 struct amdgpu_winsys_bo *bo = buffer->bo;
1669
1670 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1671 buffer->usage);
1672 if (queue_type == KERNELQ_ALT_FENCE)
1673 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1674 else
1675 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1676
1677 /* We didn't add any slab entries into the real buffer list that will be submitted
1678 * to the kernel. Do it now.
1679 */
1680 struct amdgpu_cs_buffer *real_buffer =
1681 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1682 &cs->buffer_lists[AMDGPU_BO_REAL], false);
1683
1684 /* We need to set the usage because it determines the BO priority. */
1685 real_buffer->usage |= buffer->usage;
1686 }
1687
1688 /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1689 unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1690 struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1691 unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1692 bool out_of_memory = false;
1693
1694 for (unsigned i = 0; i < num_sparse_buffers; i++) {
1695 struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1696 struct amdgpu_winsys_bo *bo = buffer->bo;
1697
1698 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1699 buffer->usage);
1700 if (queue_type == KERNELQ_ALT_FENCE)
1701 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1702 else
1703 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1704
1705 /* Add backing buffers of sparse buffers to the buffer list.
1706 *
1707 * This is done late, during submission, to keep the buffer list short before
1708 * submit, and to avoid managing fences for the backing buffers.
1709 */
1710 struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1711
1712 if (queue_type == USERQ) {
1713 uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point);
1714 vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point);
1715 }
1716
1717 simple_mtx_lock(&sparse_bo->commit_lock);
1718 list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1719 /* We can directly add the buffer here, because we know that each
1720 * backing buffer occurs only once.
1721 */
1722 struct amdgpu_cs_buffer *real_buffer =
1723 amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
1724 if (!real_buffer) {
1725 fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
1726 r = -ENOMEM;
1727 out_of_memory = true;
1728 break; /* commit_lock is unlocked right after this loop */
1729 }
1730
1731 real_buffer->usage = buffer->usage;
1732 }
1733 simple_mtx_unlock(&sparse_bo->commit_lock);
1734 }
1735
1736 /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1737 unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1738 struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1739 struct drm_amdgpu_bo_list_entry *bo_list;
1740 /* BO dependency management depends on the queue mode:
1741 * - kernel queue: BOs used by the submit are passed to the kernel in a
1742 * drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled
1743 * automatically by the kernel; intra-process sync is handled by Mesa.
1744 * - user queue: intra-process sync is similar. Inter-process sync is handled
1745 * using timeline points, amdgpu_userq_wait (before a submit) and
1746 * amdgpu_userq_signal (after a submit).
1747 */
1748 unsigned num_shared_buf_write;
1749 unsigned num_shared_buf_read;
1750 /* Store write handles at the beginning and read handles at the end of shared_buf_kms_handles.
1751 * If the usage is both read and write, the handle goes into the write list.
1752 */
1753 uint32_t *shared_buf_kms_handles;
1754 if (queue_type != USERQ) {
1755 bo_list = (struct drm_amdgpu_bo_list_entry *)
1756 alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1757 } else {
1758 num_shared_buf_write = 0;
1759 num_shared_buf_read = 0;
1760 shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t));
1761 }
1762 unsigned i;
1763
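/* The real buffer list is processed in three segments: regular real BOs come first, then
* backing BOs of slab entries, and finally sparse backing BOs, which were appended last above.
*/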
1764 for (i = 0; i < initial_num_real_buffers; i++) {
1765 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1766 struct amdgpu_winsys_bo *bo = buffer->bo;
1767
1768 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1769 buffer->usage);
1770 if (queue_type == KERNELQ_ALT_FENCE)
1771 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1772 else
1773 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1774
1775 if (queue_type != USERQ) {
1776 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1777 } else {
1778 vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1779
1780 if (!get_real_bo(bo)->is_shared)
1781 continue;
1782
1783 if (buffer->usage & RADEON_USAGE_WRITE) {
1784 shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1785 num_shared_buf_write++;
1786 } else {
1787 num_shared_buf_read++;
1788 shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1789 get_real_bo(bo)->kms_handle;
1790 }
1791 }
1792 }
1793
1794 /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1795 for (; i < num_real_buffers_except_sparse; i++) {
1796 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1797 struct amdgpu_winsys_bo *bo = buffer->bo;
1798
1799 if (queue_type == KERNELQ_ALT_FENCE)
1800 get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1801 else
1802 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1803
1804 if (queue_type != USERQ) {
1805 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1806 } else {
1807 vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
1808
1809 if (!get_real_bo(bo)->is_shared)
1810 continue;
1811
1812 if (buffer->usage & RADEON_USAGE_WRITE) {
1813 shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
1814 num_shared_buf_write++;
1815 } else {
1816 num_shared_buf_read++;
1817 shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1818 get_real_bo(bo)->kms_handle;
1819 }
1820 }
1821 }
1822
1823 /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1824 for (; i < num_real_buffers; ++i) {
1825 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1826
1827 if (queue_type != USERQ) {
1828 amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1829 } else {
1830 if (!get_real_bo(buffer->bo)->is_shared)
1831 continue;
1832 if (buffer->usage & RADEON_USAGE_WRITE) {
1833 shared_buf_kms_handles[num_shared_buf_write] =
1834 get_real_bo(buffer->bo)->kms_handle;
1835 num_shared_buf_write++;
1836 } else {
1837 num_shared_buf_read++;
1838 shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
1839 get_real_bo(buffer->bo)->kms_handle;
1840 }
1841 }
1842 }
1843
1844 #if 0 /* Debug code. */
1845 printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1846
1847 /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1848 for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1849 if (i == acs->queue_index)
1850 continue;
1851
1852 struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1853 if (!fence) {
1854 if (i <= 1)
1855 printf(" queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no);
1856 continue;
1857 }
1858
1859 bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1860 uint_seq_no old = seq_no_dependencies.seq_no[i];
1861 add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
1862 uint_seq_no new = seq_no_dependencies.seq_no[i];
1863
1864 if (!valid)
1865 printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
1866 else if (old != new)
1867 printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1868 else
1869 printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
1870 }
1871 #endif
1872
1873 /* Convert the sequence numbers we gathered to fence dependencies. */
1874 u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1875 struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
1876
1877 if (fence) {
1878 /* If it's idle, don't add it to the list of dependencies. */
1879 if (amdgpu_fence_wait(*fence, 0, false))
1880 amdgpu_fence_reference(fence, NULL);
1881 else
1882 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1883 }
1884 }
1885
1886 if (queue_type != KERNELQ_ALT_FENCE) {
1887 /* Finally, add the IB fence into the fence ring of the queue. */
1888 amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1889 queue->latest_seq_no = next_seq_no;
1890 ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1891
1892 /* Update the last used context in the queue. */
1893 amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1894 }
1895 simple_mtx_unlock(&aws->bo_fence_lock);
1896
1897 #if MESA_DEBUG
1898 /* Prepare the buffer list. */
1899 if (aws->debug_all_bos) {
1900 /* The buffer list contains all buffers. This is a slow path that
1901 * ensures that no buffer is missing in the BO list.
1902 */
1903 simple_mtx_lock(&aws->global_bo_list_lock);
1904 if (queue_type != USERQ) {
1905 bo_list = (struct drm_amdgpu_bo_list_entry *)
1906 alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1907 num_real_buffers = 0;
1908 list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1909 bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1910 bo_list[num_real_buffers].bo_priority = 0;
1911 ++num_real_buffers;
1912 }
1913 } else {
1914 shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t));
1915 num_shared_buf_write = 0;
1916 num_shared_buf_read = 0;
1917 list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1918 shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle;
1919 num_shared_buf_write++;
1920 }
1921 }
1922 simple_mtx_unlock(&aws->global_bo_list_lock);
1923 }
1924 #endif
1925
1926 if (acs->ip_type == AMD_IP_GFX)
1927 aws->gfx_bo_list_counter += num_real_buffers;
1928
1929 if (out_of_memory) {
1930 r = -ENOMEM;
1931 } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1932 r = -ECANCELED;
1933 } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
1934 r = 0;
1935 } else {
1936 if (queue_type != USERQ) {
1937 /* Submit the command buffer.
1938 *
1939 * With many parallel processes using GDS (such as test suites), the kernel quite often
1940 * returns -ENOMEM, but it eventually succeeds after enough attempts. This happens
1941 * frequently with dEQP using NGG streamout.
1942 */
1943 r = 0;
1944
1945 do {
1946 /* Wait 1 ms and try again. */
1947 if (r == -ENOMEM)
1948 os_time_sleep(1000);
1949
1950 r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no);
1951 } while (r == -ENOMEM);
1952
1953 if (!r) {
1954 /* Success. */
1955 uint64_t *user_fence = NULL;
1956
1957 /* Need to reserve 4 QWORDs for the user fence:
1958 * QWORD[0]: completed fence
1959 * QWORD[1]: preempted fence
1960 * QWORD[2]: reset fence
1961 * QWORD[3]: preempted then reset
1962 */
1963 if (has_user_fence)
1964 user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1965 amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1966 }
1967 } else {
1968 struct amdgpu_userq *userq = &queue->userq;
1969 r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
1970 &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
1971 num_shared_buf_read, &seq_no, vm_timeline_point);
1972 if (!r) {
1973 /* Success. */
1974 amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr);
1975 }
1976 }
1977 }
1978
1979 if (unlikely(r)) {
1980 if (r == -ECANCELED) {
1981 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1982 "amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
1983 } else if (r == -ENODATA) {
1984 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1985 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
1986 } else if (r == -ETIME) {
1987 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1988 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
1989 } else {
1990 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
1991 PIPE_UNKNOWN_CONTEXT_RESET,
1992 "amdgpu: The CS has been rejected, "
1993 "see dmesg for more information (%i).\n",
1994 r);
1995 }
1996 }
1997
1998 /* If there was an error, signal the fence, because it won't be signalled
1999 * by the hardware. */
2000 if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
2001 amdgpu_fence_signalled(cs->fence);
2002
2003 if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
2004 acs->mcbp_fw_shadow_chunk.flags = 0;
2005
2006 cs->error_code = r;
2007
2008 /* Clear the buffer lists. */
2009 for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
2010 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
2011 unsigned num_buffers = cs->buffer_lists[list].num_buffers;
2012
2013 if (list == AMDGPU_BO_REAL) {
2014 /* Only decrement num_active_ioctls and unref where we incremented them.
2015 * We did both for regular real BOs. We only incremented the refcount for sparse
2016 * backing BOs.
2017 */
2018 /* Regular real BOs. */
2019 for (unsigned i = 0; i < initial_num_real_buffers; i++) {
2020 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2021 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2022 }
2023
2024 /* Do nothing for slab BOs. */
2025
2026 /* Sparse backing BOs. */
2027 for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
2028 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2029 } else {
2030 for (unsigned i = 0; i < num_buffers; i++) {
2031 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
2032 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
2033 }
2034 }
2035
2036 cs->buffer_lists[list].num_buffers = 0;
2037 }
2038
2039 amdgpu_cs_context_cleanup(aws, cs);
2040 }
2041
2042 /* Make sure the previous submission is completed. */
2043 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
2044 {
2045 struct amdgpu_cs *cs = amdgpu_cs(rcs);
2046
2047 /* Wait for any pending ioctl of this CS to complete. */
2048 util_queue_fence_wait(&cs->flush_completed);
2049 }
2050
2051 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
2052 unsigned flags,
2053 struct pipe_fence_handle **fence)
2054 {
2055 struct amdgpu_cs *cs = amdgpu_cs(rcs);
2056 struct amdgpu_winsys *aws = cs->aws;
2057 int error_code = 0;
2058 uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
2059
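/* Account for the epilog dwords (such as the padding emitted below) in the size check. */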
2060 rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
2061
2062 /* Pad the IB according to the mask. */
2063 switch (cs->ip_type) {
2064 case AMD_IP_SDMA:
2065 if (aws->info.gfx_level <= GFX6) {
2066 while (rcs->current.cdw & ib_pad_dw_mask)
2067 radeon_emit(rcs, 0xf0000000); /* NOP packet */
2068 } else {
2069 while (rcs->current.cdw & ib_pad_dw_mask)
2070 radeon_emit(rcs, SDMA_NOP_PAD);
2071 }
2072 break;
2073 case AMD_IP_GFX:
2074 case AMD_IP_COMPUTE:
2075 amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
2076 if (cs->ip_type == AMD_IP_GFX)
2077 aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
2078 break;
2079 case AMD_IP_UVD:
2080 case AMD_IP_UVD_ENC:
2081 while (rcs->current.cdw & ib_pad_dw_mask)
2082 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
2083 break;
2084 case AMD_IP_VCN_JPEG:
2085 if (rcs->current.cdw % 2)
2086 assert(0);
2087 while (rcs->current.cdw & ib_pad_dw_mask) {
2088 radeon_emit(rcs, 0x60000000); /* nop packet */
2089 radeon_emit(rcs, 0x00000000);
2090 }
2091 break;
2092 case AMD_IP_VCN_DEC:
2093 while (rcs->current.cdw & ib_pad_dw_mask)
2094 radeon_emit(rcs, 0x81ff); /* nop packet */
2095 break;
2096 default:
2097 break;
2098 }
2099
2100 if (rcs->current.cdw > rcs->current.max_dw) {
2101 fprintf(stderr, "amdgpu: command stream overflowed\n");
2102 }
2103
2104 /* Submit only if the CS is not empty, has not overflowed, and is not a no-op flush. */
2105 if (likely(radeon_emitted(rcs, 0) &&
2106 rcs->current.cdw <= rcs->current.max_dw &&
2107 !(flags & RADEON_FLUSH_NOOP))) {
2108 struct amdgpu_cs_context *cur = cs->csc;
2109
2110 /* Set IB sizes. */
2111 amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
2112
2113 /* Create a fence. */
2114 amdgpu_fence_reference(&cur->fence, NULL);
2115 if (cs->next_fence) {
2116 /* just move the reference */
2117 cur->fence = cs->next_fence;
2118 cs->next_fence = NULL;
2119 } else {
2120 cur->fence = amdgpu_fence_create(cs);
2121 }
2122 if (fence)
2123 amdgpu_fence_reference(fence, cur->fence);
2124
2125 for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
2126 unsigned num_buffers = cur->buffer_lists[i].num_buffers;
2127 struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
2128
2129 for (unsigned j = 0; j < num_buffers; j++)
2130 p_atomic_inc(&buffers[j].bo->num_active_ioctls);
2131 }
2132
2133 amdgpu_cs_sync_flush(rcs);
2134
2135 cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
2136 if (cs->noop && cs->ip_type == AMD_IP_GFX) {
2137 /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
2138 unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
2139 assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
2140
2141 cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
2142 cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
2143 }
2144
2145 /* Swap command streams. "cst" is going to be submitted. */
2146 rcs->csc = cs->csc = cs->cst;
2147 cs->cst = cur;
2148
2149 /* Only the GFX, compute, and SDMA IPs are supported with user queues. */
2150 if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) {
2151 util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2152 amdgpu_cs_submit_ib<USERQ>, NULL, 0);
2153 } else {
2154 util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
2155 cs->uses_alt_fence ?
2156 amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
2157 : amdgpu_cs_submit_ib<KERNELQ>,
2158 NULL, 0);
2159 }
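/* The submission runs asynchronously on aws->cs_queue; amdgpu_cs_sync_flush() waits for it
* via cs->flush_completed.
*/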
2160
2161 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2162 cs->csc->secure = !cs->cst->secure;
2163 else
2164 cs->csc->secure = cs->cst->secure;
2165
2166 if (!(flags & PIPE_FLUSH_ASYNC)) {
2167 amdgpu_cs_sync_flush(rcs);
2168 error_code = cur->error_code;
2169 }
2170 } else {
2171 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
2172 cs->csc->secure = !cs->csc->secure;
2173
2174 amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
2175 amdgpu_cs_context_cleanup(aws, cs->csc);
2176 }
2177
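/* Reset the buffer lookup hash of the now-current context; -1 marks an unused entry. */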
2178 memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
2179
2180 amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
2181
2182 if (cs->preamble_ib_bo) {
2183 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
2184 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
2185 }
2186
2187 if (cs->ip_type == AMD_IP_GFX)
2188 aws->num_gfx_IBs++;
2189 else if (cs->ip_type == AMD_IP_SDMA)
2190 aws->num_sdma_IBs++;
2191
2192 return error_code;
2193 }
2194
2195 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
2196 struct pb_buffer_lean *_buf,
2197 unsigned usage)
2198 {
2199 struct amdgpu_cs *cs = amdgpu_cs(rcs);
2200 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
2201
2202 return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
2203 }
2204
2205 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
2206 uint64_t csa_va)
2207 {
2208 struct amdgpu_cs *cs = amdgpu_cs(rcs);
2209 cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
2210 cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
2211 cs->mcbp_fw_shadow_chunk.gds_va = 0;
2212 cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
2213 }
2214
2215 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
2216 struct pipe_fence_handle **dst,
2217 struct pipe_fence_handle *src)
2218 {
2219 amdgpu_fence_reference(dst, src);
2220 }
2221
2222 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
2223 {
2224 sws->base.ctx_create = amdgpu_ctx_create;
2225 sws->base.ctx_destroy = amdgpu_ctx_destroy;
2226 sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
2227 sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
2228 sws->base.cs_create = amdgpu_cs_create;
2229 sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
2230 sws->base.cs_destroy = amdgpu_cs_destroy;
2231 sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
2232 sws->base.cs_validate = amdgpu_cs_validate;
2233 sws->base.cs_check_space = amdgpu_cs_check_space;
2234 sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
2235 sws->base.cs_flush = amdgpu_cs_flush;
2236 sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
2237 sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
2238 sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
2239 sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
2240 sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
2241 sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
2242 sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
2243 sws->base.fence_reference = amdgpu_winsys_fence_reference;
2244 sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
2245 sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
2246 sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
2247 sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
2248
2249 if (sws->aws->info.has_fw_based_shadowing)
2250 sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
2251 }
2252