1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 /*
9 This file replaces libdrm's radeon_cs_gem with our own implementation.
10 It's optimized specifically for Radeon DRM.
11 Adding buffers and space checking are faster and simpler than their
12 counterparts in libdrm (the time complexity of all the functions
13 is O(1) in nearly all scenarios, thanks to hashing).
14
15 It works like this:
16
17 cs_add_buffer(cs, buf, usage, domains) adds a new relocation and
18 also adds the size of 'buf' to the used_gart and used_vram winsys variables
19 based on the domains, which are simply OR'd for accounting purposes.
20 The adding is skipped if the reloc is already present in the list, but it
21 still accounts for any newly referenced domains.
22
23 cs_validate is then called, which just checks:
24 used_vram/gart < vram/gart_size * 0.8
25 The 0.8 number allows for some memory fragmentation. If the validation
26 fails, the pipe driver flushes the CS and retries the validation,
27 i.e. it validates only that one operation. If it fails again, it drops
28 the operation on the floor and prints some nasty message to stderr.
29 (done in the pipe driver)
30
31 cs_write_reloc(cs, buf) just writes a reloc that has been added using
32 cs_add_buffer. The read_domain and write_domain parameters have been removed,
33 because we already specify them in cs_add_buffer.
34 */
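/* Illustrative usage sketch (not part of this file; the local variables and
   the emit_draw() helper are hypothetical). A pipe driver would typically
   drive this interface roughly like this:

      ws->cs_add_buffer(cs, vb, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
      if (!ws->cs_validate(cs)) {
         // The CS was flushed/cleaned up; re-add the buffer and retry once.
         ws->cs_add_buffer(cs, vb, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
         if (!ws->cs_validate(cs))
            return;  // drop the operation, as described above
      }
      emit_draw(cs);                // write packets referencing 'vb'
      ws->cs_flush(cs, 0, &fence);  // submit; 'fence' is optional
*/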
35
36 #include "radeon_drm_cs.h"
37
38 #include "util/u_memory.h"
39 #include "util/os_time.h"
40
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <stdint.h>
44 #include <xf86drm.h>
45
46
47 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
48
49 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
50 static void radeon_fence_reference(struct radeon_winsys *ws,
51 struct pipe_fence_handle **dst,
52 struct pipe_fence_handle *src);
53
54 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
55 enum radeon_ctx_priority priority,
56 bool allow_context_lost)
57 {
58 struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
59 if (!ctx)
60 return NULL;
61
62 ctx->ws = (struct radeon_drm_winsys*)ws;
63 ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
64 return (struct radeon_winsys_ctx*)ctx;
65 }
66
67 static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
68 {
69 FREE(ctx);
70 }
71
72 static void
73 radeon_drm_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
74 const char *format, ...)
75 {
76 /* TODO: we should do something better here */
77 va_list args;
78
79 va_start(args, format);
80 vfprintf(stderr, format, args);
81 va_end(args);
82 }
83
84 static enum pipe_reset_status
85 radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
86 bool *needs_reset, bool *reset_completed)
87 {
88 struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;
89
90 unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);
91
92 if (ctx->gpu_reset_counter == latest) {
93 if (needs_reset)
94 *needs_reset = false;
95 if (reset_completed)
96 *reset_completed = false;
97 return PIPE_NO_RESET;
98 }
99
100 if (needs_reset)
101 *needs_reset = true;
102 if (reset_completed)
103 *reset_completed = true;
104
105 ctx->gpu_reset_counter = latest;
106 return PIPE_UNKNOWN_CONTEXT_RESET;
107 }
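/* Illustrative sketch (hypothetical caller): a driver can poll the reset
 * status, e.g. after a failed wait or before reusing a context:
 *
 *    bool needs_reset;
 *    if (ws->ctx_query_reset_status(ctx, false, &needs_reset, NULL)
 *        != PIPE_NO_RESET && needs_reset) {
 *       // destroy and recreate the context
 *    }
 */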
108
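/* Descriptive note: the kernel CS ioctl consumes an array of chunk pointers.
 * Chunk 0 carries the IB dwords, chunk 1 the relocation entries and chunk 2
 * two flag dwords (ring selection etc.); radeon_drm_cs_flush() fills in the
 * IB length and the flag values right before submission. */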
109 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
110 struct radeon_drm_winsys *ws)
111 {
112 int i;
113
114 csc->fd = ws->fd;
115
116 csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
117 csc->chunks[0].length_dw = 0;
118 csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
119 csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
120 csc->chunks[1].length_dw = 0;
121 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
122 csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
123 csc->chunks[2].length_dw = 2;
124 csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
125
126 csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
127 csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
128 csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
129
130 csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
131
132 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
133 csc->reloc_indices_hashlist[i] = -1;
134 }
135 return true;
136 }
137
138 static void radeon_cs_context_cleanup(struct radeon_winsys *rws,
139 struct radeon_cs_context *csc)
140 {
141 unsigned i;
142
143 for (i = 0; i < csc->num_relocs; i++) {
144 p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
145 radeon_ws_bo_reference(rws, &csc->relocs_bo[i].bo, NULL);
146 }
147 for (i = 0; i < csc->num_slab_buffers; ++i) {
148 p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
149 radeon_ws_bo_reference(rws, &csc->slab_buffers[i].bo, NULL);
150 }
151
152 csc->num_relocs = 0;
153 csc->num_validated_relocs = 0;
154 csc->num_slab_buffers = 0;
155 csc->chunks[0].length_dw = 0;
156 csc->chunks[1].length_dw = 0;
157
158 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
159 csc->reloc_indices_hashlist[i] = -1;
160 }
161 }
162
163 static void radeon_destroy_cs_context(struct radeon_winsys *rws, struct radeon_cs_context *csc)
164 {
165 radeon_cs_context_cleanup(rws, csc);
166 FREE(csc->slab_buffers);
167 FREE(csc->relocs_bo);
168 FREE(csc->relocs);
169 }
170
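/* Descriptive note: each CS keeps two radeon_cs_context structures. 'csc' is
 * the one commands are currently recorded into and 'cst' is the one being
 * submitted, so recording and the submission ioctl can overlap. */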
171 static bool
172 radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
173 struct radeon_winsys_ctx *ctx,
174 enum amd_ip_type ip_type,
175 void (*flush)(void *ctx, unsigned flags,
176 struct pipe_fence_handle **fence),
177 void *flush_ctx)
178 {
179 struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
180 struct radeon_drm_cs *cs;
181
182 cs = CALLOC_STRUCT(radeon_drm_cs);
183 if (!cs) {
184 return false;
185 }
186 util_queue_fence_init(&cs->flush_completed);
187
188 cs->ws = ws;
189 cs->flush_cs = flush;
190 cs->flush_data = flush_ctx;
191
192 if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
193 FREE(cs);
194 return false;
195 }
196 if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
197 radeon_destroy_cs_context(&ws->base, &cs->csc1);
198 FREE(cs);
199 return false;
200 }
201
202 /* Set the first command buffer as current. */
203 cs->csc = &cs->csc1;
204 cs->cst = &cs->csc2;
205 cs->ip_type = ip_type;
206
207 memset(rcs, 0, sizeof(*rcs));
208 rcs->current.buf = cs->csc->buf;
209 rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
210 rcs->priv = cs;
211
212 p_atomic_inc(&ws->num_cs);
213 return true;
214 }
215
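/* Descriptive note: returns the index of 'bo' in csc->relocs_bo (buffers with
 * a kernel handle) or csc->slab_buffers (suballocated buffers), or -1 if the
 * buffer hasn't been added to this CS. A per-slot cached index keeps repeated
 * lookups of the same buffer O(1). */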
216 int radeon_lookup_buffer(struct radeon_winsys *rws, struct radeon_cs_context *csc,
217 struct radeon_bo *bo)
218 {
219 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
220 struct radeon_bo_item *buffers;
221 unsigned num_buffers;
222 int i = csc->reloc_indices_hashlist[hash];
223
224 if (bo->handle) {
225 buffers = csc->relocs_bo;
226 num_buffers = csc->num_relocs;
227 } else {
228 buffers = csc->slab_buffers;
229 num_buffers = csc->num_slab_buffers;
230 }
231
232 /* Either not found (-1), or the cached index already points at this BO. */
233 if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
234 return i;
235
236 /* Hash collision, look for the BO in the list of relocs linearly. */
237 for (i = num_buffers - 1; i >= 0; i--) {
238 if (buffers[i].bo == bo) {
239 /* Put this reloc in the hash list.
240 * This will prevent additional hash collisions if there are
241 * several consecutive lookup_buffer calls for the same buffer.
242 *
243 * Example: Assuming buffers A,B,C collide in the hash list,
244 * the following sequence of relocs:
245 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
246 * will collide here: ^ and here: ^,
247 * meaning that we should get very few collisions in the end. */
248 csc->reloc_indices_hashlist[hash] = i;
249 return i;
250 }
251 }
252 return -1;
253 }
254
255 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
256 struct radeon_bo *bo)
257 {
258 struct radeon_cs_context *csc = cs->csc;
259 struct drm_radeon_cs_reloc *reloc;
260 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
261 int i = -1;
262
263 i = radeon_lookup_buffer(&cs->ws->base, csc, bo);
264
265 if (i >= 0) {
266 /* For async DMA, every add_buffer call must add a buffer to the list
267 * no matter how many duplicates there are. This is due to the fact
268 * the DMA CS checker doesn't use NOP packets for offset patching,
269 * but always uses the i-th buffer from the list to patch the i-th
270 * offset. If there are N offsets in a DMA CS, there must also be N
271 * buffers in the relocation list.
272 *
273 * This doesn't have to be done if virtual memory is enabled,
274 * because there is no offset patching with virtual memory.
275 */
276 if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
277 return i;
278 }
279 }
280
281 /* New relocation, check if the backing array is large enough. */
282 if (csc->num_relocs >= csc->max_relocs) {
283 uint32_t size;
284 csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
285
286 size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
287 csc->relocs_bo = realloc(csc->relocs_bo, size);
288
289 size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
290 csc->relocs = realloc(csc->relocs, size);
291
292 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
293 }
294
295 /* Initialize the new relocation. */
296 csc->relocs_bo[csc->num_relocs].bo = NULL;
297 csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
298 radeon_ws_bo_reference(&cs->ws->base, &csc->relocs_bo[csc->num_relocs].bo, bo);
299 p_atomic_inc(&bo->num_cs_references);
300 reloc = &csc->relocs[csc->num_relocs];
301 reloc->handle = bo->handle;
302 reloc->read_domains = 0;
303 reloc->write_domain = 0;
304 reloc->flags = 0;
305
306 csc->reloc_indices_hashlist[hash] = csc->num_relocs;
307
308 csc->chunks[1].length_dw += RELOC_DWORDS;
309
310 return csc->num_relocs++;
311 }
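/* Growth-policy note for the reallocation above (illustrative arithmetic):
 * starting from zero, max_relocs grows 16, 32, 48, 64, ... while the array is
 * small (the "+16" term dominates) and switches to roughly 1.3x geometric
 * growth once max_relocs exceeds about 53, bounding both the number of
 * reallocations and the amount of over-allocation. */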
312
313 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
314 struct radeon_bo *bo)
315 {
316 struct radeon_cs_context *csc = cs->csc;
317 unsigned hash;
318 struct radeon_bo_item *item;
319 int idx;
320 int real_idx;
321
322 idx = radeon_lookup_buffer(&cs->ws->base, csc, bo);
323 if (idx >= 0)
324 return idx;
325
326 real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
327
328 /* Check if the backing array is large enough. */
329 if (csc->num_slab_buffers >= csc->max_slab_buffers) {
330 unsigned new_max = MAX2(csc->max_slab_buffers + 16,
331 (unsigned)(csc->max_slab_buffers * 1.3));
332 struct radeon_bo_item *new_buffers =
333 REALLOC(csc->slab_buffers,
334 csc->max_slab_buffers * sizeof(*new_buffers),
335 new_max * sizeof(*new_buffers));
336 if (!new_buffers) {
337 fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
338 return -1;
339 }
340
341 csc->max_slab_buffers = new_max;
342 csc->slab_buffers = new_buffers;
343 }
344
345 /* Initialize the new relocation. */
346 idx = csc->num_slab_buffers++;
347 item = &csc->slab_buffers[idx];
348
349 item->bo = NULL;
350 item->u.slab.real_idx = real_idx;
351 radeon_ws_bo_reference(&cs->ws->base, &item->bo, bo);
352 p_atomic_inc(&bo->num_cs_references);
353
354 hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
355 csc->reloc_indices_hashlist[hash] = idx;
356
357 return idx;
358 }
359
360 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
361 struct pb_buffer_lean *buf,
362 unsigned usage,
363 enum radeon_bo_domain domains)
364 {
365 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
366 struct radeon_bo *bo = (struct radeon_bo*)buf;
367 enum radeon_bo_domain added_domains;
368
369 /* If VRAM is just stolen system memory, allow both VRAM and
370 * GTT, whichever has free space. If a buffer is evicted from
371 * VRAM to GTT, it will stay there.
372 */
373 if (!cs->ws->info.has_dedicated_vram)
374 domains |= RADEON_DOMAIN_GTT;
375
376 enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
377 enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
378 struct drm_radeon_cs_reloc *reloc;
379 int index;
380
381 if (!bo->handle) {
382 index = radeon_lookup_or_add_slab_buffer(cs, bo);
383 if (index < 0)
384 return 0;
385
386 index = cs->csc->slab_buffers[index].u.slab.real_idx;
387 } else {
388 index = radeon_lookup_or_add_real_buffer(cs, bo);
389 }
390
391 reloc = &cs->csc->relocs[index];
392 added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
393 reloc->read_domains |= rd;
394 reloc->write_domain |= wd;
395
396 /* The priority must be in [0, 15]. It's used by the kernel memory management. */
397 unsigned priority = usage & RADEON_ALL_PRIORITIES;
398 unsigned bo_priority = util_last_bit(priority) / 2;
399 reloc->flags = MAX2(reloc->flags, bo_priority);
400 cs->csc->relocs_bo[index].u.real.priority_usage |= priority;
401
402 if (added_domains & RADEON_DOMAIN_VRAM)
403 rcs->used_vram_kb += bo->base.size / 1024;
404 else if (added_domains & RADEON_DOMAIN_GTT)
405 rcs->used_gart_kb += bo->base.size / 1024;
406
407 return index;
408 }
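/* Priority-mapping note for radeon_drm_cs_add_buffer above (illustrative):
 * RADEON_ALL_PRIORITIES masks the per-buffer priority bits out of 'usage'.
 * util_last_bit() returns the 1-based position of the highest set bit, so a
 * priority flag stored in, say, bit 9 yields util_last_bit() == 10 and a
 * kernel priority of 10 / 2 = 5, within the [0, 15] range mentioned above. */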
409
410 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
411 struct pb_buffer_lean *buf)
412 {
413 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
414
415 return radeon_lookup_buffer(&cs->ws->base, cs->csc, (struct radeon_bo*)buf);
416 }
417
418 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
419 {
420 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
421 bool status =
422 rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
423 rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;
424
425 if (status) {
426 cs->csc->num_validated_relocs = cs->csc->num_relocs;
427 } else {
428 /* Remove recently added buffers. The validation failed with them
429 * and the CS is about to be flushed because of that. Keep only
430 * the already-validated buffers. */
431 unsigned i;
432
433 for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
434 p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
435 radeon_ws_bo_reference(&cs->ws->base, &cs->csc->relocs_bo[i].bo, NULL);
436 }
437 cs->csc->num_relocs = cs->csc->num_validated_relocs;
438
439 /* Flush if there are any relocs. Clean up otherwise. */
440 if (cs->csc->num_relocs) {
441 cs->flush_cs(cs->flush_data,
442 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
443 } else {
444 radeon_cs_context_cleanup(&cs->ws->base, cs->csc);
445 rcs->used_vram_kb = 0;
446 rcs->used_gart_kb = 0;
447
448 assert(rcs->current.cdw == 0);
449 if (rcs->current.cdw != 0) {
450 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
451 }
452 }
453 }
454 return status;
455 }
456
457 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
458 {
459 assert(rcs->current.cdw <= rcs->current.max_dw);
460 return rcs->current.max_dw - rcs->current.cdw >= dw;
461 }
462
463 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
464 struct radeon_bo_list_item *list)
465 {
466 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
467 int i;
468
469 if (list) {
470 for (i = 0; i < cs->csc->num_relocs; i++) {
471 list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
472 list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
473 list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
474 }
475 }
476 return cs->csc->num_relocs;
477 }
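/* Illustrative usage sketch (hypothetical caller): the list can be queried in
 * two passes, first with list == NULL to obtain the count:
 *
 *    unsigned n = ws->cs_get_buffer_list(cs, NULL);
 *    struct radeon_bo_list_item *items = CALLOC(n, sizeof(*items));
 *    if (items)
 *       ws->cs_get_buffer_list(cs, items);
 */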
478
479 void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
480 {
481 struct radeon_drm_cs *cs = (struct radeon_drm_cs*)job;
482 struct radeon_cs_context *csc = cs->cst;
483 unsigned i;
484 int r;
485
486 r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
487 &csc->cs, sizeof(struct drm_radeon_cs));
488 if (r) {
489 if (r == -ENOMEM)
490 fprintf(stderr, "radeon: Not enough memory for command submission.\n");
491 else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
492 unsigned i;
493
494 fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
495 for (i = 0; i < csc->chunks[0].length_dw; i++) {
496 fprintf(stderr, "0x%08X\n", csc->buf[i]);
497 }
498 } else {
499 fprintf(stderr, "radeon: The kernel rejected CS, "
500 "see dmesg for more information (%i).\n", r);
501 }
502 }
503
504 for (i = 0; i < csc->num_relocs; i++)
505 p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
506 for (i = 0; i < csc->num_slab_buffers; i++)
507 p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
508
509 radeon_cs_context_cleanup(&cs->ws->base, csc);
510 }
511
512 /*
513 * Make sure previous submissions of this CS have completed
514 */
515 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
516 {
517 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
518
519 /* Wait for any pending ioctl of this CS to complete. */
520 if (util_queue_is_initialized(&cs->ws->cs_queue))
521 util_queue_fence_wait(&cs->flush_completed);
522 }
523
524 /* Add the given fence to a slab buffer fence list.
525 *
526 * There is a potential race condition when a BO participates in submissions on
527 * two or more threads simultaneously. Since we do not know which of the
528 * submissions will be sent to the GPU first, we have to keep the fences
529 * of all submissions.
530 *
531 * However, fences that belong to submissions that have already returned from
532 * their respective ioctl do not have to be kept, because we know that they
533 * will signal earlier.
534 */
535 static void radeon_bo_slab_fence(struct radeon_winsys *rws, struct radeon_bo *bo,
536 struct radeon_bo *fence)
537 {
538 unsigned dst;
539
540 assert(fence->num_cs_references);
541
542 /* Cleanup older fences */
543 dst = 0;
544 for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
545 if (bo->u.slab.fences[src]->num_cs_references) {
546 bo->u.slab.fences[dst] = bo->u.slab.fences[src];
547 dst++;
548 } else {
549 radeon_ws_bo_reference(rws, &bo->u.slab.fences[src], NULL);
550 }
551 }
552 bo->u.slab.num_fences = dst;
553
554 /* Check available space for the new fence */
555 if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
556 unsigned new_max_fences = bo->u.slab.max_fences + 1;
557 struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
558 bo->u.slab.max_fences * sizeof(*new_fences),
559 new_max_fences * sizeof(*new_fences));
560 if (!new_fences) {
561 fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
562 return;
563 }
564
565 bo->u.slab.fences = new_fences;
566 bo->u.slab.max_fences = new_max_fences;
567 }
568
569 /* Add the new fence */
570 bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
571 radeon_ws_bo_reference(rws, &bo->u.slab.fences[bo->u.slab.num_fences], fence);
572 bo->u.slab.num_fences++;
573 }
574
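/* Descriptive note: flushing pads the IB to the ring's required alignment,
 * attaches a fence when requested (or when slab buffers need one), swaps
 * 'csc' and 'cst' so recording can continue immediately, and submits the
 * previous context either on the winsys thread (cs_queue) or synchronously
 * when the queue isn't available. */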
575 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
576 unsigned flags,
577 struct pipe_fence_handle **pfence)
578 {
579 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
580 struct radeon_cs_context *tmp;
581
582 switch (cs->ip_type) {
583 case AMD_IP_SDMA:
584 /* pad DMA ring to 8 DWs */
585 if (cs->ws->info.gfx_level <= GFX6) {
586 while (rcs->current.cdw & 7)
587 radeon_emit(rcs, 0xf0000000); /* NOP packet */
588 } else {
589 while (rcs->current.cdw & 7)
590 radeon_emit(rcs, 0x00000000); /* NOP packet */
591 }
592 break;
593 case AMD_IP_GFX:
594 /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
595 * r6xx requires at least 4 DW alignment to avoid a hw bug.
596 */
597 if (cs->ws->info.gfx_ib_pad_with_type2) {
598 while (rcs->current.cdw & 7)
599 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
600 } else {
601 while (rcs->current.cdw & 7)
602 radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
603 }
604 break;
605 case AMD_IP_UVD:
606 while (rcs->current.cdw & 15)
607 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
608 break;
609 default:
610 break;
611 }
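   /* Padding note for the switch above (illustrative): "cdw & 7" pads the IB
    * to a multiple of 8 DWs, e.g. a 61-dword IB receives 3 NOPs to reach 64;
    * UVD uses "cdw & 15" for 16-DW alignment. */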
612
613 if (rcs->current.cdw > rcs->current.max_dw) {
614 fprintf(stderr, "radeon: command stream overflowed\n");
615 }
616
617 if (pfence || cs->csc->num_slab_buffers) {
618 struct pipe_fence_handle *fence;
619
620 if (cs->next_fence) {
621 fence = cs->next_fence;
622 cs->next_fence = NULL;
623 } else {
624 fence = radeon_cs_create_fence(rcs);
625 }
626
627 if (fence) {
628 if (pfence)
629 radeon_fence_reference(&cs->ws->base, pfence, fence);
630
631 mtx_lock(&cs->ws->bo_fence_lock);
632 for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
633 struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
634 p_atomic_inc(&bo->num_active_ioctls);
635 radeon_bo_slab_fence(&cs->ws->base, bo, (struct radeon_bo *)fence);
636 }
637 mtx_unlock(&cs->ws->bo_fence_lock);
638
639 radeon_fence_reference(&cs->ws->base, &fence, NULL);
640 }
641 } else {
642 radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
643 }
644
645 radeon_drm_cs_sync_flush(rcs);
646
647 /* Swap command streams. */
648 tmp = cs->csc;
649 cs->csc = cs->cst;
650 cs->cst = tmp;
651
652 /* If the CS is non-empty and hasn't overflowed, emit it in a separate thread. */
653 if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
654 !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
655 unsigned i, num_relocs;
656
657 num_relocs = cs->cst->num_relocs;
658
659 cs->cst->chunks[0].length_dw = rcs->current.cdw;
660
661 for (i = 0; i < num_relocs; i++) {
662 /* Update the number of active asynchronous CS ioctls for the buffer. */
663 p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
664 }
665
666 switch (cs->ip_type) {
667 case AMD_IP_SDMA:
668 cs->cst->flags[0] = 0;
669 cs->cst->flags[1] = RADEON_CS_RING_DMA;
670 cs->cst->cs.num_chunks = 3;
671 if (cs->ws->info.r600_has_virtual_memory) {
672 cs->cst->flags[0] |= RADEON_CS_USE_VM;
673 }
674 break;
675
676 case AMD_IP_UVD:
677 cs->cst->flags[0] = 0;
678 cs->cst->flags[1] = RADEON_CS_RING_UVD;
679 cs->cst->cs.num_chunks = 3;
680 break;
681
682 case AMD_IP_VCE:
683 cs->cst->flags[0] = 0;
684 cs->cst->flags[1] = RADEON_CS_RING_VCE;
685 cs->cst->cs.num_chunks = 3;
686 break;
687
688 default:
689 case AMD_IP_GFX:
690 case AMD_IP_COMPUTE:
691 cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
692 cs->cst->flags[1] = RADEON_CS_RING_GFX;
693 cs->cst->cs.num_chunks = 3;
694
695 if (cs->ws->info.r600_has_virtual_memory) {
696 cs->cst->flags[0] |= RADEON_CS_USE_VM;
697 cs->cst->cs.num_chunks = 3;
698 }
699 if (flags & PIPE_FLUSH_END_OF_FRAME) {
700 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
701 cs->cst->cs.num_chunks = 3;
702 }
703 if (cs->ip_type == AMD_IP_COMPUTE) {
704 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
705 cs->cst->cs.num_chunks = 3;
706 }
707 break;
708 }
709
710 if (util_queue_is_initialized(&cs->ws->cs_queue)) {
711 util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
712 radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
713 if (!(flags & PIPE_FLUSH_ASYNC))
714 radeon_drm_cs_sync_flush(rcs);
715 } else {
716 radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
717 }
718 } else {
719 radeon_cs_context_cleanup(&cs->ws->base, cs->cst);
720 }
721
722 /* Prepare a new CS. */
723 rcs->current.buf = cs->csc->buf;
724 rcs->current.cdw = 0;
725 rcs->used_vram_kb = 0;
726 rcs->used_gart_kb = 0;
727
728 if (cs->ip_type == AMD_IP_GFX)
729 cs->ws->num_gfx_IBs++;
730 else if (cs->ip_type == AMD_IP_SDMA)
731 cs->ws->num_sdma_IBs++;
732 return 0;
733 }
734
735 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
736 {
737 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
738
739 if (!cs)
740 return;
741
742 radeon_drm_cs_sync_flush(rcs);
743 util_queue_fence_destroy(&cs->flush_completed);
744 radeon_cs_context_cleanup(&cs->ws->base, &cs->csc1);
745 radeon_cs_context_cleanup(&cs->ws->base, &cs->csc2);
746 p_atomic_dec(&cs->ws->num_cs);
747 radeon_destroy_cs_context(&cs->ws->base, &cs->csc1);
748 radeon_destroy_cs_context(&cs->ws->base, &cs->csc2);
749 radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
750 FREE(cs);
751 }
752
753 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
754 struct pb_buffer_lean *_buf,
755 unsigned usage)
756 {
757 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
758 struct radeon_bo *bo = (struct radeon_bo*)_buf;
759 int index;
760
761 if (!bo->num_cs_references)
762 return false;
763
764 index = radeon_lookup_buffer(&cs->ws->base, cs->csc, bo);
765 if (index == -1)
766 return false;
767
768 if (!bo->handle)
769 index = cs->csc->slab_buffers[index].u.slab.real_idx;
770
771 if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
772 return true;
773 if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
774 return true;
775
776 return false;
777 }
778
779 /* FENCES */
780
781 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
782 {
783 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
784 struct pb_buffer_lean *fence;
785
786 /* Create a fence, which is a dummy BO. */
787 fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
788 RADEON_DOMAIN_GTT,
789 RADEON_FLAG_NO_SUBALLOC
790 | RADEON_FLAG_NO_INTERPROCESS_SHARING);
791 if (!fence)
792 return NULL;
793
794 /* Add the fence as a dummy relocation. */
795 cs->ws->base.cs_add_buffer(rcs, fence,
796 RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
797 return (struct pipe_fence_handle*)fence;
798 }
799
800 static bool radeon_fence_wait(struct radeon_winsys *ws,
801 struct pipe_fence_handle *fence,
802 uint64_t timeout)
803 {
804 return ws->buffer_wait(ws, (struct pb_buffer_lean*)fence, timeout,
805 RADEON_USAGE_READWRITE);
806 }
807
808 static void radeon_fence_reference(struct radeon_winsys *ws,
809 struct pipe_fence_handle **dst,
810 struct pipe_fence_handle *src)
811 {
812 radeon_bo_reference(ws, (struct pb_buffer_lean**)dst, (struct pb_buffer_lean*)src);
813 }
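/* Illustrative sketch (hypothetical caller): since a fence is just a dummy BO
 * added to the CS, waiting on it blocks until the IB that referenced it has
 * been executed:
 *
 *    struct pipe_fence_handle *fence = NULL;
 *    ws->cs_flush(cs, 0, &fence);
 *    if (fence && ws->fence_wait(ws, fence, OS_TIMEOUT_INFINITE))
 *       ;  // the GPU work submitted in this CS is complete
 *    ws->fence_reference(ws, &fence, NULL);
 */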
814
815 static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
816 {
817 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
818 struct pipe_fence_handle *fence = NULL;
819
820 if (cs->next_fence) {
821 radeon_fence_reference(&cs->ws->base, &fence, cs->next_fence);
822 return fence;
823 }
824
825 fence = radeon_cs_create_fence(rcs);
826 if (!fence)
827 return NULL;
828
829 radeon_fence_reference(&cs->ws->base, &cs->next_fence, fence);
830 return fence;
831 }
832
833 static void
834 radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
835 struct pipe_fence_handle *fence)
836 {
837 /* TODO: Handle the following unlikely multi-threaded scenario:
838 *
839 * Thread 1 / Context 1 Thread 2 / Context 2
840 * -------------------- --------------------
841 * f = cs_get_next_fence()
842 * cs_add_fence_dependency(f)
843 * cs_flush()
844 * cs_flush()
845 *
846 * We currently assume that this does not happen because we don't support
847 * asynchronous flushes on Radeon.
848 */
849 }
850
851 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
852 {
853 ws->base.ctx_create = radeon_drm_ctx_create;
854 ws->base.ctx_destroy = radeon_drm_ctx_destroy;
855 ws->base.ctx_set_sw_reset_status = radeon_drm_ctx_set_sw_reset_status;
856 ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
857 ws->base.cs_create = radeon_drm_cs_create;
858 ws->base.cs_destroy = radeon_drm_cs_destroy;
859 ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
860 ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
861 ws->base.cs_validate = radeon_drm_cs_validate;
862 ws->base.cs_check_space = radeon_drm_cs_check_space;
863 ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
864 ws->base.cs_flush = radeon_drm_cs_flush;
865 ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
866 ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
867 ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
868 ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
869 ws->base.fence_wait = radeon_fence_wait;
870 ws->base.fence_reference = radeon_fence_reference;
871 }
872