/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 *
 * SPDX-License-Identifier: MIT
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added with
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because they are already specified in cs_add_buffer.
*/
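
/*
    A rough usage sketch of the flow described above, as a hypothetical pipe
    driver might drive it (pseudocode only; the concrete entry points are the
    radeon_winsys callbacks installed at the bottom of this file, and the
    flush() call stands in for whatever flush hook the driver registered):

        cs_add_buffer(cs, buf, read_domain, write_domain);
        if (!cs_validate(cs)) {
            // Buffers don't fit: flush what we have and retry just this op.
            flush(cs);
            cs_add_buffer(cs, buf, read_domain, write_domain);
            if (!cs_validate(cs)) {
                // Still doesn't fit: drop the operation and complain.
                fprintf(stderr, "radeon: buffers don't fit into memory\n");
                return;
            }
        }
        cs_write_reloc(cs, buf);   // emit the relocation for the packet
*/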

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
                                                       enum radeon_ctx_priority priority,
                                                       bool allow_context_lost)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static void
radeon_drm_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
                                   const char *format, ...)
{
   /* TODO: we should do something better here */
   va_list args;

   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset, bool *reset_completed)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      if (reset_completed)
         *reset_completed = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;
   if (reset_completed)
      *reset_completed = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_winsys *rws,
                                      struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_winsys *rws, struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(rws, csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}

static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum amd_ip_type ip_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&ws->base, &cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
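   /* csc is the command stream the driver is currently filling, while cst is
    * the one handed to the submission thread; radeon_drm_cs_flush() swaps the
    * two on every flush. */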
   cs->ip_type = ip_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

int radeon_lookup_buffer(struct radeon_winsys *rws, struct radeon_cs_context *csc,
                         struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* Fast path: the cached index is either empty (-1) or already points at this BO. */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(&cs->ws->base, csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is because the DMA CS
       * checker doesn't use NOP packets for offset patching, but always uses
       * the i-th buffer from the list to patch the i-th offset. If there are
       * N offsets in a DMA CS, there must also be N buffers in the
       * relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&cs->ws->base, &csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(&cs->ws->base, csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&cs->ws->base, &item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer_lean *buf,
                                         unsigned usage,
                                         enum radeon_bo_domain domains)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;

   /* The priority must be in [0, 15]. It's used by the kernel memory management. */
   unsigned priority = usage & RADEON_ALL_PRIORITIES;
   unsigned bo_priority = util_last_bit(priority) / 2;
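   /* util_last_bit() above returns the 1-based position of the highest set
    * priority bit; halving it folds the usage priority bits into the
    * kernel's [0, 15] range. */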
   reloc->flags = MAX2(reloc->flags, bo_priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer_lean *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(&cs->ws->base, cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
         rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
         rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove the recently-added buffers: validation failed with them and
       * the CS is about to be flushed because of that. Keep only the
       * already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->ws->base, &cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(&cs->ws->base, cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_drm_cs *cs = (struct radeon_drm_cs*)job;
   struct radeon_cs_context *csc = cs->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(&cs->ws->base, csc);
}

/*
 * Make sure previous submissions of this CS are completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_winsys *rws, struct radeon_bo *bo,
                                 struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(rws, &bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(rws, &bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case AMD_IP_GFX:
      /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 DW alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case AMD_IP_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(&cs->ws->base, pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(&cs->ws->base, bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&cs->ws->base, &fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;
   /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ip_type) {
      case AMD_IP_SDMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case AMD_IP_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case AMD_IP_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case AMD_IP_GFX:
      case AMD_IP_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ip_type == AMD_IP_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(&cs->ws->base, cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ip_type == AMD_IP_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc1);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc1);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc2);
   radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer_lean *_buf,
                                    unsigned usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(&cs->ws->base, cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer_lean *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer_lean*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   radeon_bo_reference(ws, (struct pb_buffer_lean**)dst, (struct pb_buffer_lean*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&cs->ws->base, &fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->ws->base, &cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_set_sw_reset_status = radeon_drm_ctx_set_sw_reset_status;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}