/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply ORed for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been
    removed, because we already specify them in cs_add_buffer.
*/
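
/*
    An illustrative sketch of the intended call sequence (for orientation only;
    the real call sites live in the pipe drivers and may differ in detail):

        ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                          RADEON_DOMAIN_VRAM, priority);
        if (!ws->cs_validate(cs)) {
            // flush and then retry just this one operation
            ws->cs_flush(cs, 0, NULL);
            ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                              RADEON_DOMAIN_VRAM, priority);
            if (!ws->cs_validate(cs)) {
                // give up: drop the operation and complain on stderr
            }
        }
*/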

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

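/* Size of one kernel relocation record in dwords; the relocation chunk length
 * (chunks[1].length_dw) is counted in these units. */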
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

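    /* The kernel CS ioctl consumes an array of chunks: the command buffer (IB)
     * itself, the relocation list, and a small flags chunk. */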
    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

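    /* Two CS contexts are kept so that one (cst) can be submitted to the
     * kernel, possibly from the worker thread, while the next one (csc) is
     * being filled; they are swapped on every flush. */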
    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: the hash slot is either empty or points at this BO. */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * that the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
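        /* Grow geometrically (x1.3, but by at least 16 entries) so that
         * repeated additions stay amortized O(1). */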
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

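        /* Slab (sub-allocated) buffers have no kernel handle of their own;
         * the relocation updated below is the one of the backing real BO. */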
        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
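    /* Leave ~20% of VRAM/GTT as headroom for memory fragmentation (see the
     * comment at the top of this file). */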
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

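    /* Submit the command stream to the kernel; this runs either on the winsys
     * thread queue or synchronously in the flushing thread (see cs_flush). */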
    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a BO participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Clean up older fences. */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence. */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence. */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

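/* RADEON_NOOP=1 makes flushes drop the IB instead of submitting it to the
 * kernel; presumably useful for isolating driver-side CPU overhead. */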
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4-DW alignment to avoid a hardware bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and has not overflowed, submit it (possibly in a
     * separate thread). */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
                                   struct pipe_fence_handle *fence)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1            Thread 2 / Context 2
     *  --------------------            --------------------
     *  f = cs_get_next_fence()
     *                                  cs_add_fence_dependency(f)
     *                                  cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}
