/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but
    any newly-referenced domains are still accounted for.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
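
/*
    For orientation, a minimal sketch of the add_buffer/validate flow described
    above, as a pipe driver might drive it through the function pointers
    installed by radeon_drm_cs_init_functions() at the end of this file.
    Illustrative only: 'ws', 'cs', 'bo' and 'prio' are hypothetical
    placeholders, not symbols defined here, and real drivers add many buffers
    per operation.

        ws->cs_add_buffer(cs, bo, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM, prio);

        if (!ws->cs_validate(cs)) {
            // Validation failed: the winsys has dropped the newly-added buffers
            // and flushed the CS (see radeon_drm_cs_validate). Re-add the buffer
            // and validate just this one operation against the fresh CS.
            ws->cs_add_buffer(cs, bo, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM, prio);
            if (!ws->cs_validate(cs))
                fprintf(stderr, "radeon: dropping operation, not enough memory\n");
        }
*/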

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
    if (!ctx)
        return NULL;

    ctx->ws = (struct radeon_drm_winsys*)ws;
    ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
    return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx)
{
    struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

    unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

    if (ctx->gpu_reset_counter == latest)
        return PIPE_NO_RESET;

    ctx->gpu_reset_counter = latest;
    return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_cmdbuf *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
    struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* not found or found */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data,
                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= GFX6) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw &&
        !debug_get_option_noop() && !(flags & RADEON_FLUSH_NOOP)) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT,
                                       RADEON_FLAG_NO_SUBALLOC
                                       | RADEON_FLAG_NO_INTERPROCESS_SHARING);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1            Thread 2 / Context 2
     *  --------------------            --------------------
     *  f = cs_get_next_fence()
     *                                  cs_add_fence_dependency(f)
     *                                  cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}