1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * The above copyright notice and this permission notice (including the
24  * next paragraph) shall be included in all copies or substantial portions
25  * of the Software.
26  */
27 
28 /*
29     This file replaces libdrm's radeon_cs_gem with our own implementation.
30     It's optimized specifically for Radeon DRM.
31     Adding buffers and space checking are faster and simpler than their
32     counterparts in libdrm (the time complexity of all the functions
33     is O(1) in nearly all scenarios, thanks to hashing).
34 
35     It works like this:
36 
37     cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38     also adds the size of 'buf' to the used_gart and used_vram winsys variables
39     based on the domains, which are simply or'd for accounting purposes.
40     The adding is skipped if the reloc is already present in the list, but it
41     still accounts for any newly-referenced domains.
42 
43     cs_validate is then called, which just checks:
44         used_vram/gart < vram/gart_size * 0.8
45     The 0.8 number allows for some memory fragmentation. If the validation
46     fails, the pipe driver flushes the CS and retries the validation,
47     i.e. it validates only that one operation. If it fails again, it drops
48     the operation on the floor and prints some nasty message to stderr.
49     (done in the pipe driver)
50 
51     cs_write_reloc(cs, buf) just writes a reloc that has been added using
52     cs_add_buffer. The read_domain and write_domain parameters have been removed,
53     because we already specify them in cs_add_buffer.
54 */
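
/*
    A minimal usage sketch of the flow described above (illustrative only:
    "ws", "cs" and "buf" stand for the radeon_winsys, radeon_cmdbuf and
    pb_buffer the pipe driver already holds, and "flush"/"flush_ctx" are the
    callback and cookie it passed to cs_create; priority bits may be OR'd
    into the usage flags):

       unsigned reloc_idx = ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ,
                                              RADEON_DOMAIN_VRAM);

       if (!ws->cs_validate(cs)) {
          // The 80% heuristic failed: flush and validate this single
          // operation again (done in the pipe driver, as noted above).
          flush(flush_ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
       }
*/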
55 
56 #include "radeon_drm_cs.h"
57 
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60 
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65 
66 
67 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
68 
69 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
70 static void radeon_fence_reference(struct pipe_fence_handle **dst,
71                                    struct pipe_fence_handle *src);
72 
73 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
74                                                        enum radeon_ctx_priority priority)
75 {
76    struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
77    if (!ctx)
78       return NULL;
79 
80    ctx->ws = (struct radeon_drm_winsys*)ws;
81    ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
82    return (struct radeon_winsys_ctx*)ctx;
83 }
84 
85 static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
86 {
87    FREE(ctx);
88 }
89 
90 static enum pipe_reset_status
91 radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
92                                   bool *needs_reset)
93 {
94    struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;
95 
96    unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);
97 
98    if (ctx->gpu_reset_counter == latest) {
99       if (needs_reset)
100          *needs_reset = false;
101       return PIPE_NO_RESET;
102    }
103 
104    if (needs_reset)
105       *needs_reset = true;
106 
107    ctx->gpu_reset_counter = latest;
108    return PIPE_UNKNOWN_CONTEXT_RESET;
109 }
110 
111 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
112                                    struct radeon_drm_winsys *ws)
113 {
114    int i;
115 
116    csc->fd = ws->fd;
117 
118    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
119    csc->chunks[0].length_dw = 0;
120    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
121    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
122    csc->chunks[1].length_dw = 0;
123    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
124    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
125    csc->chunks[2].length_dw = 2;
126    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
127 
128    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
129    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
130    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
131 
132    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
133 
134    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
135       csc->reloc_indices_hashlist[i] = -1;
136    }
137    return true;
138 }
139 
140 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
141 {
142    unsigned i;
143 
144    for (i = 0; i < csc->num_relocs; i++) {
145       p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
146       radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
147    }
148    for (i = 0; i < csc->num_slab_buffers; ++i) {
149       p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
150       radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
151    }
152 
153    csc->num_relocs = 0;
154    csc->num_validated_relocs = 0;
155    csc->num_slab_buffers = 0;
156    csc->chunks[0].length_dw = 0;
157    csc->chunks[1].length_dw = 0;
158 
159    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
160       csc->reloc_indices_hashlist[i] = -1;
161    }
162 }
163 
164 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
165 {
166    radeon_cs_context_cleanup(csc);
167    FREE(csc->slab_buffers);
168    FREE(csc->relocs_bo);
169    FREE(csc->relocs);
170 }
171 
172 
173 static bool
174 radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
175                      struct radeon_winsys_ctx *ctx,
176                      enum amd_ip_type ip_type,
177                      void (*flush)(void *ctx, unsigned flags,
178                                    struct pipe_fence_handle **fence),
179                      void *flush_ctx,
180                      bool stop_exec_on_failure)
181 {
182    struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
183    struct radeon_drm_cs *cs;
184 
185    cs = CALLOC_STRUCT(radeon_drm_cs);
186    if (!cs) {
187       return false;
188    }
189    util_queue_fence_init(&cs->flush_completed);
190 
191    cs->ws = ws;
192    cs->flush_cs = flush;
193    cs->flush_data = flush_ctx;
194 
195    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
196       FREE(cs);
197       return false;
198    }
199    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
200       radeon_destroy_cs_context(&cs->csc1);
201       FREE(cs);
202       return false;
203    }
204 
205    /* Set the first command buffer as current. */
206    cs->csc = &cs->csc1;
207    cs->cst = &cs->csc2;
208    cs->ip_type = ip_type;
209 
210    memset(rcs, 0, sizeof(*rcs));
211    rcs->current.buf = cs->csc->buf;
212    rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
213    rcs->priv = cs;
214 
215    p_atomic_inc(&ws->num_cs);
216    return true;
217 }
218 
219 static void radeon_drm_cs_set_preamble(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib,
220                                        unsigned preamble_num_dw, bool preamble_changed)
221 {
222    /* The radeon kernel driver doesn't support preambles. */
223    radeon_emit_array(cs, preamble_ib, preamble_num_dw);
224 }
225 
226 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
227 {
228    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
229    struct radeon_bo_item *buffers;
230    unsigned num_buffers;
231    int i = csc->reloc_indices_hashlist[hash];
232 
233    if (bo->handle) {
234       buffers = csc->relocs_bo;
235       num_buffers = csc->num_relocs;
236    } else {
237       buffers = csc->slab_buffers;
238       num_buffers = csc->num_slab_buffers;
239    }
240 
241    /* Either nothing is cached for this hash (-1), or the cached index is this BO. */
242    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
243       return i;
244 
245    /* Hash collision, look for the BO in the list of relocs linearly. */
246    for (i = num_buffers - 1; i >= 0; i--) {
247       if (buffers[i].bo == bo) {
248          /* Put this reloc in the hash list.
249           * This will prevent additional hash collisions if there are
250           * several consecutive lookup_buffer calls for the same buffer.
251           *
252           * Example: Assuming buffers A,B,C collide in the hash list,
253           * the following sequence of relocs:
254           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
255           * will collide here: ^ and here:   ^,
256           * meaning that we should get very few collisions in the end. */
257          csc->reloc_indices_hashlist[hash] = i;
258          return i;
259       }
260    }
261    return -1;
262 }
263 
264 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
265                                                  struct radeon_bo *bo)
266 {
267    struct radeon_cs_context *csc = cs->csc;
268    struct drm_radeon_cs_reloc *reloc;
269    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
270    int i = -1;
271 
272    i = radeon_lookup_buffer(csc, bo);
273 
274    if (i >= 0) {
275       /* For async DMA, every add_buffer call must add a buffer to the list
276        * no matter how many duplicates there are. This is because
277        * the DMA CS checker doesn't use NOP packets for offset patching,
278        * but always uses the i-th buffer from the list to patch the i-th
279        * offset. If there are N offsets in a DMA CS, there must also be N
280        * buffers in the relocation list.
281        *
282        * This doesn't have to be done if virtual memory is enabled,
283        * because there is no offset patching with virtual memory.
284        */
285       if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
286          return i;
287       }
288    }
289 
290    /* New relocation, check if the backing array is large enough. */
291    if (csc->num_relocs >= csc->max_relocs) {
292       uint32_t size;
293       csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
294 
295       size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
296       csc->relocs_bo = realloc(csc->relocs_bo, size);
297 
298       size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
299       csc->relocs = realloc(csc->relocs, size);
300 
301       csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
302    }
303 
304    /* Initialize the new relocation. */
305    csc->relocs_bo[csc->num_relocs].bo = NULL;
306    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
307    radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
308    p_atomic_inc(&bo->num_cs_references);
309    reloc = &csc->relocs[csc->num_relocs];
310    reloc->handle = bo->handle;
311    reloc->read_domains = 0;
312    reloc->write_domain = 0;
313    reloc->flags = 0;
314 
315    csc->reloc_indices_hashlist[hash] = csc->num_relocs;
316 
317    csc->chunks[1].length_dw += RELOC_DWORDS;
318 
319    return csc->num_relocs++;
320 }
321 
322 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
323                                             struct radeon_bo *bo)
324 {
325    struct radeon_cs_context *csc = cs->csc;
326    unsigned hash;
327    struct radeon_bo_item *item;
328    int idx;
329    int real_idx;
330 
331    idx = radeon_lookup_buffer(csc, bo);
332    if (idx >= 0)
333       return idx;
334 
335    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
336 
337    /* Check if the backing array is large enough. */
338    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
339       unsigned new_max = MAX2(csc->max_slab_buffers + 16,
340                               (unsigned)(csc->max_slab_buffers * 1.3));
341       struct radeon_bo_item *new_buffers =
342             REALLOC(csc->slab_buffers,
343                     csc->max_slab_buffers * sizeof(*new_buffers),
344                     new_max * sizeof(*new_buffers));
345       if (!new_buffers) {
346          fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
347          return -1;
348       }
349 
350       csc->max_slab_buffers = new_max;
351       csc->slab_buffers = new_buffers;
352    }
353 
354    /* Initialize the new relocation. */
355    idx = csc->num_slab_buffers++;
356    item = &csc->slab_buffers[idx];
357 
358    item->bo = NULL;
359    item->u.slab.real_idx = real_idx;
360    radeon_ws_bo_reference(&item->bo, bo);
361    p_atomic_inc(&bo->num_cs_references);
362 
363    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
364    csc->reloc_indices_hashlist[hash] = idx;
365 
366    return idx;
367 }
368 
369 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
370                                          struct pb_buffer *buf,
371                                          unsigned usage,
372                                          enum radeon_bo_domain domains)
373 {
374    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
375    struct radeon_bo *bo = (struct radeon_bo*)buf;
376    enum radeon_bo_domain added_domains;
377 
378    /* If VRAM is just stolen system memory, allow both VRAM and
379     * GTT, whichever has free space. If a buffer is evicted from
380     * VRAM to GTT, it will stay there.
381     */
382    if (!cs->ws->info.has_dedicated_vram)
383       domains |= RADEON_DOMAIN_GTT;
384 
385    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
386    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
387    struct drm_radeon_cs_reloc *reloc;
388    int index;
389 
390    if (!bo->handle) {
391       index = radeon_lookup_or_add_slab_buffer(cs, bo);
392       if (index < 0)
393          return 0;
394 
395       index = cs->csc->slab_buffers[index].u.slab.real_idx;
396    } else {
397       index = radeon_lookup_or_add_real_buffer(cs, bo);
398    }
399 
400    reloc = &cs->csc->relocs[index];
401    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
402    reloc->read_domains |= rd;
403    reloc->write_domain |= wd;
404 
405    /* The priority must be in [0, 15]. It's used by the kernel memory management. */
406    unsigned priority = usage & RADEON_ALL_PRIORITIES;
407    unsigned bo_priority = util_last_bit(priority) / 2;
408    reloc->flags = MAX2(reloc->flags, bo_priority);
409    cs->csc->relocs_bo[index].u.real.priority_usage |= priority;
410 
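   /* Account the size only against newly referenced domains; when both VRAM
    * and GTT are newly added, it is counted against VRAM only. */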
411    if (added_domains & RADEON_DOMAIN_VRAM)
412       rcs->used_vram_kb += bo->base.size / 1024;
413    else if (added_domains & RADEON_DOMAIN_GTT)
414       rcs->used_gart_kb += bo->base.size / 1024;
415 
416    return index;
417 }
418 
419 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
420                                        struct pb_buffer *buf)
421 {
422    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
423 
424    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
425 }
426 
427 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
428 {
429    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
430    bool status =
431          rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
432          rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;
433 
434    if (status) {
435       cs->csc->num_validated_relocs = cs->csc->num_relocs;
436    } else {
437       /* Remove recently-added buffers. The validation failed with them
438        * and the CS is about to be flushed because of that. Keep only
439        * the already-validated buffers. */
440       unsigned i;
441 
442       for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
443          p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
444          radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
445       }
446       cs->csc->num_relocs = cs->csc->num_validated_relocs;
447 
448       /* Flush if there are any relocs. Clean up otherwise. */
449       if (cs->csc->num_relocs) {
450          cs->flush_cs(cs->flush_data,
451                       RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
452       } else {
453          radeon_cs_context_cleanup(cs->csc);
454          rcs->used_vram_kb = 0;
455          rcs->used_gart_kb = 0;
456 
457          assert(rcs->current.cdw == 0);
458          if (rcs->current.cdw != 0) {
459             fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
460          }
461       }
462    }
463    return status;
464 }
465 
466 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
467 {
468    assert(rcs->current.cdw <= rcs->current.max_dw);
469    return rcs->current.max_dw - rcs->current.cdw >= dw;
470 }
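
/*
 * Illustrative only: a typical pipe-driver pattern is to reserve space before
 * emitting a fixed-size packet and to flush when the current IB cannot hold it
 * ("ndw" and "header" are hypothetical; "flush"/"flush_ctx" are the callback
 * and cookie passed to cs_create):
 *
 *    if (!ws->cs_check_space(cs, ndw))
 *       flush(flush_ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 *    radeon_emit(cs, header);
 */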
471 
472 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
473                                               struct radeon_bo_list_item *list)
474 {
475    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
476    int i;
477 
478    if (list) {
479       for (i = 0; i < cs->csc->num_relocs; i++) {
480          list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
481          list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
482          list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
483       }
484    }
485    return cs->csc->num_relocs;
486 }
487 
488 void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
489 {
490    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
491    unsigned i;
492    int r;
493 
494    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
495                            &csc->cs, sizeof(struct drm_radeon_cs));
496    if (r) {
497       if (r == -ENOMEM)
498          fprintf(stderr, "radeon: Not enough memory for command submission.\n");
499       else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
500          unsigned i;
501 
502          fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
503          for (i = 0; i < csc->chunks[0].length_dw; i++) {
504             fprintf(stderr, "0x%08X\n", csc->buf[i]);
505          }
506       } else {
507          fprintf(stderr, "radeon: The kernel rejected CS, "
508                          "see dmesg for more information (%i).\n", r);
509       }
510    }
511 
512    for (i = 0; i < csc->num_relocs; i++)
513       p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
514    for (i = 0; i < csc->num_slab_buffers; i++)
515       p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
516 
517    radeon_cs_context_cleanup(csc);
518 }
519 
520 /*
521  * Make sure previous submissions of this CS have completed
522  */
523 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
524 {
525    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
526 
527    /* Wait for any pending ioctl of this CS to complete. */
528    if (util_queue_is_initialized(&cs->ws->cs_queue))
529       util_queue_fence_wait(&cs->flush_completed);
530 }
531 
532 /* Add the given fence to a slab buffer fence list.
533  *
534  * There is a potential race condition when a BO participates in submissions on
535  * two or more threads simultaneously. Since we do not know which of the
536  * submissions will be sent to the GPU first, we have to keep the fences
537  * of all submissions.
538  *
539  * However, fences that belong to submissions that have already returned from
540  * their respective ioctl do not have to be kept, because we know that they
541  * will signal earlier.
542  */
543 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
544 {
545    unsigned dst;
546 
547    assert(fence->num_cs_references);
548 
549    /* Cleanup older fences */
550    dst = 0;
551    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
552       if (bo->u.slab.fences[src]->num_cs_references) {
553          bo->u.slab.fences[dst] = bo->u.slab.fences[src];
554          dst++;
555       } else {
556          radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
557       }
558    }
559    bo->u.slab.num_fences = dst;
560 
561    /* Check available space for the new fence */
562    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
563       unsigned new_max_fences = bo->u.slab.max_fences + 1;
564       struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
565                                               bo->u.slab.max_fences * sizeof(*new_fences),
566                                               new_max_fences * sizeof(*new_fences));
567       if (!new_fences) {
568          fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
569          return;
570       }
571 
572       bo->u.slab.fences = new_fences;
573       bo->u.slab.max_fences = new_max_fences;
574    }
575 
576    /* Add the new fence */
577    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
578    radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
579    bo->u.slab.num_fences++;
580 }
581 
582 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
583                                unsigned flags,
584                                struct pipe_fence_handle **pfence)
585 {
586    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
587    struct radeon_cs_context *tmp;
588 
589    switch (cs->ip_type) {
590    case AMD_IP_SDMA:
591       /* pad DMA ring to 8 DWs */
592       if (cs->ws->info.gfx_level <= GFX6) {
593          while (rcs->current.cdw & 7)
594             radeon_emit(rcs, 0xf0000000); /* NOP packet */
595       } else {
596          while (rcs->current.cdw & 7)
597             radeon_emit(rcs, 0x00000000); /* NOP packet */
598       }
599       break;
600    case AMD_IP_GFX:
601    /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
602     * r6xx requires at least 4 DW alignment to avoid a hw bug.
603        */
604       if (cs->ws->info.gfx_ib_pad_with_type2) {
605          while (rcs->current.cdw & 7)
606             radeon_emit(rcs, 0x80000000); /* type2 nop packet */
607       } else {
608          while (rcs->current.cdw & 7)
609             radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
610       }
611       break;
612    case AMD_IP_UVD:
613       while (rcs->current.cdw & 15)
614          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
615       break;
616    default:
617       break;
618    }
619 
620    if (rcs->current.cdw > rcs->current.max_dw) {
621       fprintf(stderr, "radeon: command stream overflowed\n");
622    }
623 
624    if (pfence || cs->csc->num_slab_buffers) {
625       struct pipe_fence_handle *fence;
626 
627       if (cs->next_fence) {
628          fence = cs->next_fence;
629          cs->next_fence = NULL;
630       } else {
631          fence = radeon_cs_create_fence(rcs);
632       }
633 
634       if (fence) {
635          if (pfence)
636             radeon_fence_reference(pfence, fence);
637 
638          mtx_lock(&cs->ws->bo_fence_lock);
639          for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
640             struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
641             p_atomic_inc(&bo->num_active_ioctls);
642             radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
643          }
644          mtx_unlock(&cs->ws->bo_fence_lock);
645 
646          radeon_fence_reference(&fence, NULL);
647       }
648    } else {
649       radeon_fence_reference(&cs->next_fence, NULL);
650    }
651 
652    radeon_drm_cs_sync_flush(rcs);
653 
654    /* Swap command streams. */
655    tmp = cs->csc;
656    cs->csc = cs->cst;
657    cs->cst = tmp;
658 
659    /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
660    if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
661        !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
662       unsigned i, num_relocs;
663 
664       num_relocs = cs->cst->num_relocs;
665 
666       cs->cst->chunks[0].length_dw = rcs->current.cdw;
667 
668       for (i = 0; i < num_relocs; i++) {
669          /* Update the number of active asynchronous CS ioctls for the buffer. */
670          p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
671       }
672 
673       switch (cs->ip_type) {
674       case AMD_IP_SDMA:
675          cs->cst->flags[0] = 0;
676          cs->cst->flags[1] = RADEON_CS_RING_DMA;
677          cs->cst->cs.num_chunks = 3;
678          if (cs->ws->info.r600_has_virtual_memory) {
679             cs->cst->flags[0] |= RADEON_CS_USE_VM;
680          }
681          break;
682 
683       case AMD_IP_UVD:
684          cs->cst->flags[0] = 0;
685          cs->cst->flags[1] = RADEON_CS_RING_UVD;
686          cs->cst->cs.num_chunks = 3;
687          break;
688 
689       case AMD_IP_VCE:
690          cs->cst->flags[0] = 0;
691          cs->cst->flags[1] = RADEON_CS_RING_VCE;
692          cs->cst->cs.num_chunks = 3;
693          break;
694 
695       default:
696       case AMD_IP_GFX:
697       case AMD_IP_COMPUTE:
698          cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
699          cs->cst->flags[1] = RADEON_CS_RING_GFX;
700          cs->cst->cs.num_chunks = 3;
701 
702          if (cs->ws->info.r600_has_virtual_memory) {
703             cs->cst->flags[0] |= RADEON_CS_USE_VM;
704             cs->cst->cs.num_chunks = 3;
705          }
706          if (flags & PIPE_FLUSH_END_OF_FRAME) {
707             cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
708             cs->cst->cs.num_chunks = 3;
709          }
710          if (cs->ip_type == AMD_IP_COMPUTE) {
711             cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
712             cs->cst->cs.num_chunks = 3;
713          }
714          break;
715       }
716 
717       if (util_queue_is_initialized(&cs->ws->cs_queue)) {
718          util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
719                             radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
720          if (!(flags & PIPE_FLUSH_ASYNC))
721             radeon_drm_cs_sync_flush(rcs);
722       } else {
723          radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
724       }
725    } else {
726       radeon_cs_context_cleanup(cs->cst);
727    }
728 
729    /* Prepare a new CS. */
730    rcs->current.buf = cs->csc->buf;
731    rcs->current.cdw = 0;
732    rcs->used_vram_kb = 0;
733    rcs->used_gart_kb = 0;
734 
735    if (cs->ip_type == AMD_IP_GFX)
736       cs->ws->num_gfx_IBs++;
737    else if (cs->ip_type == AMD_IP_SDMA)
738       cs->ws->num_sdma_IBs++;
739    return 0;
740 }
741 
742 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
743 {
744    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
745 
746    if (!cs)
747       return;
748 
749    radeon_drm_cs_sync_flush(rcs);
750    util_queue_fence_destroy(&cs->flush_completed);
751    radeon_cs_context_cleanup(&cs->csc1);
752    radeon_cs_context_cleanup(&cs->csc2);
753    p_atomic_dec(&cs->ws->num_cs);
754    radeon_destroy_cs_context(&cs->csc1);
755    radeon_destroy_cs_context(&cs->csc2);
756    radeon_fence_reference(&cs->next_fence, NULL);
757    FREE(cs);
758 }
759 
760 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
761                                     struct pb_buffer *_buf,
762                                     unsigned usage)
763 {
764    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
765    struct radeon_bo *bo = (struct radeon_bo*)_buf;
766    int index;
767 
768    if (!bo->num_cs_references)
769       return false;
770 
771    index = radeon_lookup_buffer(cs->csc, bo);
772    if (index == -1)
773       return false;
774 
775    if (!bo->handle)
776       index = cs->csc->slab_buffers[index].u.slab.real_idx;
777 
778    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
779       return true;
780    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
781       return true;
782 
783    return false;
784 }
785 
786 /* FENCES */
787 
788 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
789 {
790    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
791    struct pb_buffer *fence;
792 
793    /* Create a fence, which is a dummy BO. */
794    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
795                                       RADEON_DOMAIN_GTT,
796                                       RADEON_FLAG_NO_SUBALLOC
797                                       | RADEON_FLAG_NO_INTERPROCESS_SHARING);
798    if (!fence)
799       return NULL;
800 
801    /* Add the fence as a dummy relocation. */
802    cs->ws->base.cs_add_buffer(rcs, fence,
803                               RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
804    return (struct pipe_fence_handle*)fence;
805 }
806 
807 static bool radeon_fence_wait(struct radeon_winsys *ws,
808                               struct pipe_fence_handle *fence,
809                               uint64_t timeout)
810 {
811    return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
812                           RADEON_USAGE_READWRITE);
813 }
814 
815 static void radeon_fence_reference(struct pipe_fence_handle **dst,
816                                    struct pipe_fence_handle *src)
817 {
818    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
819 }
820 
821 static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
822 {
823    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
824    struct pipe_fence_handle *fence = NULL;
825 
826    if (cs->next_fence) {
827       radeon_fence_reference(&fence, cs->next_fence);
828       return fence;
829    }
830 
831    fence = radeon_cs_create_fence(rcs);
832    if (!fence)
833       return NULL;
834 
835    radeon_fence_reference(&cs->next_fence, fence);
836    return fence;
837 }
838 
839 static void
840 radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
841                                    struct pipe_fence_handle *fence,
842                                    unsigned dependency_flags)
843 {
844    /* TODO: Handle the following unlikely multi-threaded scenario:
845     *
846     *  Thread 1 / Context 1                   Thread 2 / Context 2
847     *  --------------------                   --------------------
848     *  f = cs_get_next_fence()
849     *                                         cs_add_fence_dependency(f)
850     *                                         cs_flush()
851     *  cs_flush()
852     *
853     * We currently assume that this does not happen because we don't support
854     * asynchronous flushes on Radeon.
855     */
856 }
857 
858 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
859 {
860    ws->base.ctx_create = radeon_drm_ctx_create;
861    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
862    ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
863    ws->base.cs_create = radeon_drm_cs_create;
864    ws->base.cs_set_preamble = radeon_drm_cs_set_preamble;
865    ws->base.cs_destroy = radeon_drm_cs_destroy;
866    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
867    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
868    ws->base.cs_validate = radeon_drm_cs_validate;
869    ws->base.cs_check_space = radeon_drm_cs_check_space;
870    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
871    ws->base.cs_flush = radeon_drm_cs_flush;
872    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
873    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
874    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
875    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
876    ws->base.fence_wait = radeon_fence_wait;
877    ws->base.fence_reference = radeon_fence_reference;
878 }
879