/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
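
/* An illustrative call sequence, sketched here only as a comment. It is not
 * code from this file: 'ws', 'rcs' and 'buf' are placeholder names, and the
 * real callers are the pipe drivers that consume struct radeon_winsys.
 *
 *    // Make the buffer resident and account its size against VRAM/GTT.
 *    ws->cs_add_buffer(rcs, buf, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 *
 *    // Check the 80% watermark described above. On failure, the pipe driver
 *    // flushes and retries with only the buffers of the current operation.
 *    if (!ws->cs_validate(rcs)) {
 *       ws->cs_flush(rcs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 *       ws->cs_add_buffer(rcs, buf, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 *       if (!ws->cs_validate(rcs))
 *          return; // drop the operation and report the error
 *    }
 *
 *    // Emit packets, then submit the IB to the kernel.
 *    radeon_emit(rcs, ...);
 *    ws->cs_flush(rcs, 0, NULL);
 */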

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
                                                       enum radeon_ctx_priority priority)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum amd_ip_type ip_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ip_type = ip_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

static void radeon_drm_cs_set_preamble(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib,
                                       unsigned preamble_num_dw, bool preamble_changed)
{
   /* The radeon kernel driver doesn't support preambles. */
   radeon_emit_array(cs, preamble_ib, preamble_num_dw);
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact
       * that the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
         REALLOC(csc->slab_buffers,
                 csc->max_slab_buffers * sizeof(*new_buffers),
                 new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         unsigned usage,
                                         enum radeon_bo_domain domains)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;

   /* The priority must be in [0, 15]. It's used by the kernel memory management. */
   unsigned priority = usage & RADEON_ALL_PRIORITIES;
   unsigned bo_priority = util_last_bit(priority) / 2;
   reloc->flags = MAX2(reloc->flags, bo_priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= priority;
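
   /* Worked example (assuming, as the arithmetic above implies, that the
    * priority usage flags are single bits and each pair of adjacent bits
    * folds onto one kernel priority level): if the highest priority bit set
    * in 'usage' is bit 9, util_last_bit() returns 10 and bo_priority becomes
    * 10 / 2 = 5, which lies within the kernel's [0, 15] range. */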

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
      rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
      rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove the recently-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                 "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS are completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case AMD_IP_GFX:
      /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 DW alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case AMD_IP_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ip_type) {
      case AMD_IP_SDMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case AMD_IP_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case AMD_IP_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case AMD_IP_GFX:
      case AMD_IP_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ip_type == AMD_IP_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ip_type == AMD_IP_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->csc1);
   radeon_cs_context_cleanup(&cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->csc1);
   radeon_destroy_cs_context(&cs->csc2);
   radeon_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    unsigned usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *   Thread 1 / Context 1          Thread 2 / Context 2
    *   --------------------          --------------------
    *   f = cs_get_next_fence()
    *                                 cs_add_fence_dependency(f)
    *                                 cs_flush()
    *   cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_set_preamble = radeon_drm_cs_set_preamble;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}