/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amdgpu_cs.h"
#include "util/detect_os.h"
#include "amdgpu_winsys.h"
#include "util/os_time.h"
#include <stdio.h>
#include <stdlib.h>
#include "amd/common/sid.h"

/* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
 * codes in the kernel).
 */
#if DETECT_OS_OPENBSD
#define ENODATA ENOTSUP
#elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
#define ENODATA ECONNREFUSED
#endif

/* FENCES */

void amdgpu_fence_destroy(struct amdgpu_fence *fence)
{
   ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);

   if (fence->ctx)
      amdgpu_ctx_reference(&fence->ctx, NULL);

   util_queue_fence_destroy(&fence->submitted);
   FREE(fence);
}

static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_cs *cs)
{
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   struct amdgpu_ctx *ctx = cs->ctx;

   fence->reference.count = 1;
   fence->aws = ctx->aws;
   amdgpu_ctx_reference(&fence->ctx, ctx);
   fence->ctx = ctx;
   fence->ip_type = cs->ip_type;
   if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
      free(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   util_queue_fence_reset(&fence->submitted);
   fence->queue_index = cs->queue_index;
   return (struct pipe_fence_handle *)fence;
}

static struct pipe_fence_handle *
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   int r;

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->aws = aws;
   fence->ip_type = 0xffffffff;

   r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   fence->imported = true;

   return (struct pipe_fence_handle*)fence;
}

static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->aws = aws;
   /* fence->ctx == NULL means that the fence is syncobj-based. */

   /* Convert sync_file into syncobj. */
   int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
   if (r) {
      ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   fence->imported = true;

   return (struct pipe_fence_handle*)fence;
}

static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                         struct pipe_fence_handle *pfence)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
   int fd, r;

   util_queue_fence_wait(&fence->submitted);

   /* Convert syncobj into sync_file. */
   r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
   return r ?
-1 : fd; } static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws) { struct amdgpu_winsys *aws = amdgpu_winsys(rws); uint32_t syncobj; int fd = -1; int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &syncobj); if (r) { return -1; } r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd); if (r) { fd = -1; } ac_drm_cs_destroy_syncobj(aws->fd, syncobj); return fd; } static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, uint64_t seq_no, uint64_t *user_fence_cpu_address) { struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; afence->seq_no = seq_no; afence->user_fence_cpu_address = user_fence_cpu_address; util_queue_fence_signal(&afence->submitted); } static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) { struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; afence->signalled = true; util_queue_fence_signal(&afence->submitted); } bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, bool absolute) { struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; int64_t abs_timeout; uint64_t *user_fence_cpu; if (afence->signalled) return true; if (absolute) abs_timeout = timeout; else abs_timeout = os_time_get_absolute_timeout(timeout); /* The fence might not have a number assigned if its IB is being * submitted in the other thread right now. Wait until the submission * is done. */ if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout)) return false; user_fence_cpu = afence->user_fence_cpu_address; if (user_fence_cpu) { if (*user_fence_cpu >= afence->seq_no) { afence->signalled = true; return true; } /* No timeout, just query: no need for the ioctl. */ if (!absolute && !timeout) return false; } if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE) abs_timeout = INT64_MAX; if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1, abs_timeout, 0, NULL)) return false; /* Check that guest-side syncobj agrees with the user fence. 
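 * In other words, once the syncobj has signalled, the sequence number the GPU wrote
 * through the user fence must already be at least afence->seq_no. The check below is
 * limited to virtio, where the syncobj state is reported by the host side, so this
 * ordering is worth sanity-checking.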
*/ if (user_fence_cpu && afence->aws->info.is_virtio) assert(afence->seq_no <= *user_fence_cpu); afence->signalled = true; return true; } static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws, struct pipe_fence_handle *fence, uint64_t timeout) { return amdgpu_fence_wait(fence, timeout, false); } static struct pipe_fence_handle * amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct pipe_fence_handle *fence = NULL; if (cs->noop) return NULL; if (cs->next_fence) { amdgpu_fence_reference(&fence, cs->next_fence); return fence; } fence = amdgpu_fence_create(cs); if (!fence) return NULL; amdgpu_fence_reference(&cs->next_fence, fence); return fence; } /* CONTEXTS */ static uint32_t radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority) { switch (radeon_priority) { case RADEON_CTX_PRIORITY_REALTIME: return AMDGPU_CTX_PRIORITY_VERY_HIGH; case RADEON_CTX_PRIORITY_HIGH: return AMDGPU_CTX_PRIORITY_HIGH; case RADEON_CTX_PRIORITY_MEDIUM: return AMDGPU_CTX_PRIORITY_NORMAL; case RADEON_CTX_PRIORITY_LOW: return AMDGPU_CTX_PRIORITY_LOW; default: unreachable("Invalid context priority"); } } static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws, enum radeon_ctx_priority priority, bool allow_context_lost) { struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx); int r; struct amdgpu_bo_alloc_request alloc_buffer = {}; uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority); ac_drm_device *dev; ac_drm_bo buf_handle; if (!ctx) return NULL; ctx->aws = amdgpu_winsys(rws); ctx->reference.count = 1; ctx->allow_context_lost = allow_context_lost; dev = ctx->aws->dev; r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle); if (r) { fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r); goto error_create; } alloc_buffer.alloc_size = ctx->aws->info.gart_page_size; alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size; alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT; r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle); if (r) { fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r); goto error_user_fence_alloc; } ctx->user_fence_cpu_address_base = NULL; r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base); if (r) { fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r); goto error_user_fence_map; } memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size); ctx->user_fence_bo = buf_handle; ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle); return (struct radeon_winsys_ctx*)ctx; error_user_fence_map: ac_drm_bo_free(dev, buf_handle); error_user_fence_alloc: ac_drm_cs_ctx_free(dev, ctx->ctx_handle); error_create: FREE(ctx); return NULL; } static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; amdgpu_ctx_reference(&ctx, NULL); } static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type, uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space) { unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask; unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask; if (unaligned_dw) { int remaining = pad_dw_mask + 1 - unaligned_dw; /* Only pad by 1 dword with the type-2 NOP if necessary. */ if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) { ib[(*num_dw)++] = PKT2_NOP_PAD; } else { /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized * packet. 
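 * For illustration: with pad_dw_mask == 7 and 3 dwords left to fill, the code below
 * emits PKT3(PKT3_NOP, 1, 0) and advances *num_dw by 3 in total (one header dword
 * plus a two-dword body).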
The size of the packet body after the header is always count + 1. * If count == -1, there is no packet body. NOP is the only packet that can have * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1). */ ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0); *num_dw += remaining - 1; } } assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0); } static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx) { struct amdgpu_bo_alloc_request request = {0}; struct drm_amdgpu_bo_list_in bo_list_in; struct drm_amdgpu_cs_chunk_ib ib_in = {0}; ac_drm_bo bo; amdgpu_va_handle va_handle = NULL; struct drm_amdgpu_cs_chunk chunks[2]; struct drm_amdgpu_bo_list_entry list; unsigned noop_dw_size; void *cpu = NULL; uint64_t seq_no; uint64_t va; int r; /* Older amdgpu doesn't report if the reset is complete or not. Detect * it by submitting a no-op job. If it reports an error, then assume * that the reset is not complete. */ uint32_t temp_ctx_handle; r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle); if (r) return r; request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM; request.alloc_size = 4096; request.phys_alignment = 4096; r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo); if (r) goto destroy_ctx; r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general, request.alloc_size, request.phys_alignment, 0, &va, &va_handle, AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH); if (r) goto destroy_bo; uint32_t kms_handle; ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle); r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va, AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_MAP); if (r) goto destroy_bo; r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu); if (r) goto destroy_bo; noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1; ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0); ac_drm_bo_cpu_unmap(ctx->aws->dev, bo); list.bo_handle = kms_handle; ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &list.bo_handle); list.bo_priority = 0; bo_list_in.list_handle = ~0; bo_list_in.bo_number = 1; bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list; ib_in.ip_type = AMD_IP_GFX; ib_in.ib_bytes = noop_dw_size * 4; ib_in.va_start = va; chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES; chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4; chunks[0].chunk_data = (uintptr_t)&bo_list_in; chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB; chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; chunks[1].chunk_data = (uintptr_t)&ib_in; r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no); destroy_bo: if (va_handle) ac_drm_va_range_free(va_handle); ac_drm_bo_free(ctx->aws->dev, bo); destroy_ctx: ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle); return r; } static void amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status, const char *format, ...) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; /* Don't overwrite the last reset status. */ if (ctx->sw_status != PIPE_NO_RESET) return; ctx->sw_status = status; if (!ctx->allow_context_lost) { va_list args; va_start(args, format); vfprintf(stderr, format, args); va_end(args); /* Non-robust contexts are allowed to terminate the process. 
The only alternative is * to skip command submission, which would look like a freeze because nothing is drawn, * which looks like a hang without any reset. */ abort(); } } static enum pipe_reset_status amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only, bool *needs_reset, bool *reset_completed) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; if (needs_reset) *needs_reset = false; if (reset_completed) *reset_completed = false; /* Return a failure due to a GPU hang. */ uint64_t flags; if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) { /* If the caller is only interested in full reset (= wants to ignore soft * recoveries), we can use the rejected cs count as a quick first check. */ return PIPE_NO_RESET; } /* * ctx->sw_status is updated on alloc/ioctl failures. * * We only rely on amdgpu_cs_query_reset_state2 to tell us * that the context reset is complete. */ if (ctx->sw_status != PIPE_NO_RESET) { int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags); if (!r) { if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) { if (reset_completed) { /* The ARB_robustness spec says: * * If a reset status other than NO_ERROR is returned and subsequent * calls return NO_ERROR, the context reset was encountered and * completed. If a reset status is repeatedly returned, the context may * be in the process of resetting. * * Starting with drm_minor >= 54 amdgpu reports if the reset is complete, * so don't do anything special. On older kernels, submit a no-op cs. If it * succeeds then assume the reset is complete. */ if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)) *reset_completed = true; if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics) *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0; } } } else { fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r); } /* Return a failure due to SW issues. */ if (needs_reset) *needs_reset = true; return ctx->sw_status; } if (needs_reset) *needs_reset = false; return PIPE_NO_RESET; } /* COMMAND SUBMISSION */ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs) { return acs->ip_type == AMD_IP_GFX || acs->ip_type == AMD_IP_COMPUTE || acs->ip_type == AMD_IP_SDMA; } static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs) { if (cs->has_chaining) return 4; /* for chaining */ return 0; } static struct amdgpu_cs_buffer * amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, struct amdgpu_buffer_list *list) { int num_buffers = list->num_buffers; struct amdgpu_cs_buffer *buffers = list->buffers; unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1); int i = cs->buffer_indices_hashlist[hash]; /* not found or found */ if (i < 0) return NULL; if (i < num_buffers && buffers[i].bo == bo) return &buffers[i]; /* Hash collision, look for the BO in the list of buffers linearly. */ for (int i = num_buffers - 1; i >= 0; i--) { if (buffers[i].bo == bo) { /* Put this buffer in the hash list. * This will prevent additional hash collisions if there are * several consecutive lookup_buffer calls for the same buffer. * * Example: Assuming buffers A,B,C collide in the hash list, * the following sequence of buffers: * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC * will collide here: ^ and here: ^, * meaning that we should get very few collisions in the end. 
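 * (The hash is simply bo->unique_id masked with BUFFER_HASHLIST_SIZE - 1, so repeated
 * lookups of the most recently added or found BO resolve without walking the list.)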
*/ cs->buffer_indices_hashlist[hash] = i & 0x7fff; return &buffers[i]; } } return NULL; } struct amdgpu_cs_buffer * amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo) { return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]); } static struct amdgpu_cs_buffer * amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, struct amdgpu_buffer_list *list, bool add_ref) { /* New buffer, check if the backing array is large enough. */ if (unlikely(list->num_buffers >= list->max_buffers)) { unsigned new_max = MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3)); struct amdgpu_cs_buffer *new_buffers; new_buffers = (struct amdgpu_cs_buffer *) REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers), new_max * sizeof(*new_buffers)); if (!new_buffers) { fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n"); return NULL; } list->max_buffers = new_max; list->buffers = new_buffers; } unsigned idx = list->num_buffers++; struct amdgpu_cs_buffer *buffer = &list->buffers[idx]; if (add_ref) p_atomic_inc(&bo->base.reference.count); buffer->bo = bo; buffer->usage = 0; unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1); cs->buffer_indices_hashlist[hash] = idx & 0x7fff; return buffer; } static struct amdgpu_cs_buffer * amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo, struct amdgpu_buffer_list *list, bool add_ref) { struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list); return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref); } static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs, struct pb_buffer_lean *buf, unsigned usage, enum radeon_bo_domain domains) { /* Don't use the "domains" parameter. Amdgpu doesn't support changing * the buffer placement during command submission. */ struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc; struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; struct amdgpu_cs_buffer *buffer; /* Fast exit for no-op calls. * This is very effective with suballocators and linear uploaders that * are outside of the winsys. */ if (bo == cs->last_added_bo && (usage & cs->last_added_bo_usage) == usage) return 0; buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true); if (!buffer) return 0; buffer->usage |= usage; cs->last_added_bo_usage = buffer->usage; cs->last_added_bo = bo; return 0; } static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws, struct amdgpu_ib *main_ib, struct amdgpu_cs *cs) { struct pb_buffer_lean *pb; uint8_t *mapped; unsigned buffer_size; /* Always create a buffer that is at least as large as the maximum seen IB size, * aligned to a power of two. */ buffer_size = util_next_power_of_two(main_ib->max_ib_bytes); /* Multiply by 4 to reduce internal fragmentation if chaining is not available.*/ if (!cs->has_chaining) buffer_size *= 4; const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024); /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */ const unsigned max_size = 2 * 1024 * 1024; buffer_size = MIN2(buffer_size, max_size); buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */ /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU. * The speed of writing to GTT WC is somewhere between no difference and very slow, while * VRAM being very slow a lot more often. * * Bypass GL2 because command buffers are read only once. 
Bypassing GL2 has better latency * and doesn't have to wait for cached GL2 requests to be processed. */ enum radeon_bo_domain domain = RADEON_DOMAIN_GTT; unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_GL2_BYPASS; if (cs->ip_type == AMD_IP_GFX || cs->ip_type == AMD_IP_COMPUTE || cs->ip_type == AMD_IP_SDMA) { /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor * on Navi 14 */ flags |= RADEON_FLAG_32BIT; } pb = amdgpu_bo_create(aws, buffer_size, aws->info.gart_page_size, domain, (radeon_bo_flag)flags); if (!pb) return false; mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE); if (!mapped) { radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL); return false; } radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb); radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL); main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer); main_ib->big_buffer_cpu_ptr = mapped; main_ib->used_ib_space = 0; return true; } static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs, struct amdgpu_ib *main_ib, struct amdgpu_cs *cs) { struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN]; /* This is the minimum size of a contiguous IB. */ unsigned ib_size = 16 * 1024; /* Always allocate at least the size of the biggest cs_check_space call, * because precisely the last call might have requested this size. */ ib_size = MAX2(ib_size, main_ib->max_check_space_size); if (!cs->has_chaining) { ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes), IB_MAX_SUBMIT_BYTES)); } /* Decay the IB buffer size over time, so that memory usage decreases after * a temporary peak. */ main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32; rcs->prev_dw = 0; rcs->num_prev = 0; rcs->current.cdw = 0; rcs->current.buf = NULL; /* Allocate a new buffer for IBs if the current buffer is all used. */ if (!main_ib->big_buffer || main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) { if (!amdgpu_ib_new_buffer(aws, main_ib, cs)) return false; } chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space; chunk_ib->ib_bytes = 0; /* ib_bytes is in dwords and the conversion to bytes will be done before * the CS ioctl. 
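 * (While recording, amdgpu_set_ib_size() stores the dword count through ptr_ib_size;
 * amdgpu_cs_flush() multiplies chunk_ib[IB_MAIN].ib_bytes by 4 right before queuing
 * the submission job.)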
*/ main_ib->ptr_ib_size = &chunk_ib->ib_bytes; main_ib->is_chained_ib = false; amdgpu_cs_add_buffer(rcs, main_ib->big_buffer, (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB), (radeon_bo_domain)0); rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space); cs->csc->ib_main_addr = rcs->current.buf; ib_size = main_ib->big_buffer->size - main_ib->used_ib_space; rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs); return true; } static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib) { if (ib->is_chained_ib) { *ib->ptr_ib_size = rcs->current.cdw | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL); } else { *ib->ptr_ib_size = rcs->current.cdw; } } static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib, enum amd_ip_type ip_type) { amdgpu_set_ib_size(rcs, ib); ib->used_ib_space += rcs->current.cdw * 4; ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment); ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4); } static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs, enum amd_ip_type ip_type) { for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) { cs->chunk_ib[i].ip_type = ip_type; cs->chunk_ib[i].flags = 0; if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) { /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation * is the beginning of IBs because completion of an IB doesn't care about the state of * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be * executed in parallel, so draw calls from the current IB can finish after the next IB * starts drawing, and so the cache flush at the end of IBs is usually late and thus * useless. 
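 * Setting AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE below therefore asks the kernel to only
 * write the caches back at the end of the IB instead of also invalidating them.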
*/ cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE; } } cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE; cs->last_added_bo = NULL; return true; } static void cleanup_fence_list(struct amdgpu_fence_list *fences) { for (unsigned i = 0; i < fences->num; i++) amdgpu_fence_drop_reference(fences->list[i]); fences->num = 0; } static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) { for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) { struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers; unsigned num_buffers = cs->buffer_lists[i].num_buffers; for (unsigned j = 0; j < num_buffers; j++) amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo); cs->buffer_lists[i].num_buffers = 0; } } static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) { cs->seq_no_dependencies.valid_fence_mask = 0; cleanup_fence_list(&cs->syncobj_dependencies); cleanup_fence_list(&cs->syncobj_to_signal); amdgpu_fence_reference(&cs->fence, NULL); cs->last_added_bo = NULL; } static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs) { amdgpu_cs_context_cleanup_buffers(aws, cs); amdgpu_cs_context_cleanup(aws, cs); for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) FREE(cs->buffer_lists[i].buffers); FREE(cs->syncobj_dependencies.list); FREE(cs->syncobj_to_signal.list); } static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); return cs->ip_type; } static bool ip_uses_alt_fence(enum amd_ip_type ip_type) { /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */ return ip_type == AMD_IP_VCN_DEC || ip_type == AMD_IP_VCN_ENC || ip_type == AMD_IP_VCN_JPEG; } static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); if (!cs) return; amdgpu_cs_sync_flush(rcs); util_queue_fence_destroy(&cs->flush_completed); p_atomic_dec(&cs->aws->num_cs); radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL); radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL); FREE(rcs->prev); amdgpu_destroy_cs_context(cs->aws, &cs->csc1); amdgpu_destroy_cs_context(cs->aws, &cs->csc2); amdgpu_fence_reference(&cs->next_fence, NULL); FREE(cs); } static bool amdgpu_cs_create(struct radeon_cmdbuf *rcs, struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), void *flush_ctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; struct amdgpu_cs *cs; cs = CALLOC_STRUCT(amdgpu_cs); if (!cs) { return false; } util_queue_fence_init(&cs->flush_completed); cs->aws = ctx->aws; cs->ctx = ctx; cs->flush_cs = flush; cs->flush_data = flush_ctx; cs->ip_type = ip_type; cs->noop = ctx->aws->noop_cs; cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE); /* Compute the queue index by counting the IPs that have queues. 
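 * IPs that use alt_fence (the VCN IPs) are skipped, so e.g. with GFX, COMPUTE and SDMA
 * all reporting queues the indices come out as GFX = 0, COMPUTE = 1, SDMA = 2
 * (illustrative; the exact values depend on which IPs have queues).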
*/ assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip)); assert(ctx->aws->info.ip[ip_type].num_queues); if (ip_uses_alt_fence(ip_type)) { cs->queue_index = INT_MAX; cs->uses_alt_fence = true; } else { cs->queue_index = 0; for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) { if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i)) continue; if (i == ip_type) break; cs->queue_index++; } assert(cs->queue_index < AMDGPU_MAX_QUEUES); } ac_drm_cs_chunk_fence_info_to_data(cs->ctx->user_fence_bo_kms_handle, cs->ip_type * 4, (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk); if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) { FREE(cs); return false; } if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) { amdgpu_destroy_cs_context(ctx->aws, &cs->csc1); FREE(cs); return false; } memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); /* Set the first submission context as current. */ rcs->csc = cs->csc = &cs->csc1; cs->cst = &cs->csc2; /* Assign to both amdgpu_cs_context; only csc will use it. */ cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist; cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist; cs->csc1.aws = ctx->aws; cs->csc2.aws = ctx->aws; p_atomic_inc(&ctx->aws->num_cs); if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs)) goto fail; /* Currently only gfx, compute and sdma queues supports user queue. */ if (cs->aws->info.use_userq && ip_type <= AMD_IP_SDMA) { if (!amdgpu_userq_init(cs->aws, &cs->aws->queues[cs->queue_index].userq, ip_type)) goto fail; } rcs->priv = cs; return true; fail: amdgpu_cs_destroy(rcs); return false; } static bool amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, unsigned preamble_num_dw) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_winsys *aws = cs->aws; struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2}; unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment); struct pb_buffer_lean *preamble_bo; uint32_t *map; /* Create the preamble IB buffer. */ preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment, RADEON_DOMAIN_VRAM, (radeon_bo_flag) (RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_GTT_WC)); if (!preamble_bo) return false; map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL, (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY)); if (!map) { radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL); return false; } /* Upload the preamble IB. */ memcpy(map, preamble_ib, preamble_num_dw * 4); /* Pad the IB. 
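 * amdgpu_pad_gfx_compute_ib() updates preamble_num_dw in place, so the IB_PREAMBLE
 * ib_bytes value set below already includes the padding.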
*/ amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0); amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo); for (unsigned i = 0; i < 2; i++) { csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo); csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4; csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT; } assert(!cs->preamble_ib_bo); cs->preamble_ib_bo = preamble_bo; amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0); return true; } static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs) { return true; } static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_ib *main_ib = &cs->main_ib; assert(rcs->current.cdw <= rcs->current.max_dw); unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw; if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES) return false; if (rcs->current.max_dw - rcs->current.cdw >= dw) return true; unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs); unsigned need_byte_size = (dw + cs_epilog_dw) * 4; /* 125% of the size for IB epilog. */ unsigned safe_byte_size = need_byte_size + need_byte_size / 4; main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size); main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4); if (!cs->has_chaining) return false; /* Allocate a new chunk */ if (rcs->num_prev >= rcs->max_prev) { unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev); struct radeon_cmdbuf_chunk *new_prev; new_prev = (struct radeon_cmdbuf_chunk*) REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev, sizeof(*new_prev) * new_max_prev); if (!new_prev) return false; rcs->prev = new_prev; rcs->max_prev = new_max_prev; } if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs)) return false; assert(main_ib->used_ib_space == 0); uint64_t va = main_ib->gpu_address; /* This space was originally reserved. */ rcs->current.max_dw += cs_epilog_dw; /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. 
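 * Those 4 dwords are written right below: the INDIRECT_BUFFER header, the low and high
 * halves of the new chunk's GPU address, and one dword reserved for the chained IB size
 * that amdgpu_set_ib_size() later fills in through new_ptr_ib_size.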
*/ amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4); radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); radeon_emit(rcs, va); radeon_emit(rcs, va >> 32); uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++]; assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0); assert(rcs->current.cdw <= rcs->current.max_dw); amdgpu_set_ib_size(rcs, main_ib); main_ib->ptr_ib_size = new_ptr_ib_size; main_ib->is_chained_ib = true; /* Hook up the new chunk */ rcs->prev[rcs->num_prev].buf = rcs->current.buf; rcs->prev[rcs->num_prev].cdw = rcs->current.cdw; rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */ rcs->num_prev++; rcs->prev_dw += rcs->current.cdw; rcs->current.cdw = 0; rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space); rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw; amdgpu_cs_add_buffer(rcs, main_ib->big_buffer, RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0); return true; } static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs) { unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers; struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers; for (unsigned i = 0; i < num_buffers; i++) { struct amdgpu_cs_buffer *slab_buffer = &buffers[i]; struct amdgpu_cs_buffer *real_buffer = amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b, &cs->buffer_lists[AMDGPU_BO_REAL], true); /* We need to set the usage because it determines the BO priority. * * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its * BO fences to fence dependencies. Only the slab entries should do that. */ real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED; } } static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs, struct radeon_bo_list_item *list) { struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc; /* We do this in the CS thread, but since we need to return the final usage of all buffers * here, do it here too. There is no harm in doing it again in the CS thread. */ amdgpu_add_slab_backing_buffers(cs); struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL]; unsigned num_real_buffers = real_buffers->num_buffers; #if HAVE_AMDGPU_VIRTIO assert(!cs->ws->info.is_virtio); #endif if (list) { for (unsigned i = 0; i < num_real_buffers; i++) { list[i].bo_size = real_buffers->buffers[i].bo->base.size; list[i].vm_address = amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle); list[i].priority_usage = real_buffers->buffers[i].usage; } } return num_real_buffers; } static void add_fence_to_list(struct amdgpu_fence_list *fences, struct amdgpu_fence *fence) { unsigned idx = fences->num++; if (idx >= fences->max) { unsigned size; const unsigned increment = 8; fences->max = idx + increment; size = fences->max * sizeof(fences->list[0]); fences->list = (struct pipe_fence_handle**)realloc(fences->list, size); } amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence); } static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs, struct pipe_fence_handle *pfence) { struct amdgpu_cs *acs = amdgpu_cs(rcs); struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; util_queue_fence_wait(&fence->submitted); if (!fence->imported) { /* Ignore idle fences. This will only check the user fence in memory. 
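 * (amdgpu_fence_wait() with a zero timeout returns immediately; if the fence has already
 * signalled there is nothing to depend on, otherwise its queue/sequence-number pair is
 * recorded below.)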
*/ if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) { add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index, fence->queue_seq_no); } } else add_fence_to_list(&cs->syncobj_dependencies, fence); } static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs, unsigned queue_index_bit, struct amdgpu_seq_no_fences *dependencies, struct amdgpu_winsys_bo *bo, unsigned usage) { if (usage & RADEON_USAGE_SYNCHRONIZED) { /* Add BO fences from queues other than 'queue_index' to dependencies. */ u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) { add_seq_no_to_list(ws, dependencies, other_queue_idx, bo->fences.seq_no[other_queue_idx]); } if (bo->alt_fence) add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence); } } static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo, uint_seq_no new_queue_seq_no) { bo->fences.seq_no[queue_index] = new_queue_seq_no; bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index); } static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry, struct amdgpu_winsys_bo *bo, unsigned usage) { bo_entry->bo_handle = get_real_bo(bo)->kms_handle; bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2; } static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws, struct pipe_fence_handle *fence) { struct amdgpu_cs *acs = amdgpu_cs(rws); struct amdgpu_cs_context *cs = acs->csc; add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence); } static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs, unsigned num_real_buffers, struct drm_amdgpu_bo_list_entry *bo_list_real, uint64_t *seq_no) { struct amdgpu_winsys *aws = acs->aws; struct amdgpu_cs_context *cs = acs->cst; struct drm_amdgpu_bo_list_in bo_list_in; struct drm_amdgpu_cs_chunk chunks[8]; unsigned num_chunks = 0; /* BO list */ bo_list_in.operation = ~0; bo_list_in.list_handle = ~0; bo_list_in.bo_number = num_real_buffers; bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real; chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4; chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in; num_chunks++; /* Syncobj dependencies. */ unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num; if (num_syncobj_dependencies) { struct drm_amdgpu_cs_chunk_sem *sem_chunk = (struct drm_amdgpu_cs_chunk_sem *) alloca(num_syncobj_dependencies * sizeof(sem_chunk[0])); for (unsigned i = 0; i < num_syncobj_dependencies; i++) { struct amdgpu_fence *fence = (struct amdgpu_fence*)cs->syncobj_dependencies.list[i]; assert(util_queue_fence_is_signalled(&fence->submitted)); sem_chunk[i].handle = fence->syncobj; } chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN; chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies; chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; num_chunks++; } /* Syncobj signals. 
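 * One extra slot is reserved for the syncobj of this submission's own fence; it goes
 * into the last entry of the chunk built below.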
*/ unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num; struct drm_amdgpu_cs_chunk_sem *sem_chunk = (struct drm_amdgpu_cs_chunk_sem *) alloca(num_syncobj_to_signal * sizeof(sem_chunk[0])); for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) { struct amdgpu_fence *fence = (struct amdgpu_fence*)cs->syncobj_to_signal.list[i]; sem_chunk[i].handle = fence->syncobj; } sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj; chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT; chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal; chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; num_chunks++; if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) { chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4; chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk; num_chunks++; } /* Fence */ if (amdgpu_cs_has_user_fence(acs)) { chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4; chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk; num_chunks++; } /* IB */ if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) { chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE]; num_chunks++; } /* IB */ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN]; num_chunks++; if (cs->secure) { cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE; cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE; } else { cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE; cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE; } assert(num_chunks <= 8); /* Submit the command buffer. * * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites * quite often, but it eventually succeeds after enough attempts. This happens frequently * with dEQP using NGG streamout. */ int r = 0; do { /* Wait 1 ms and try again. */ if (r == -ENOMEM) os_time_sleep(1000); r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no); } while (r == -ENOMEM); return r; } static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq, struct amdgpu_cs_context *cs, uint64_t num_fences, struct drm_amdgpu_userq_fence_info *fence_info) { amdgpu_pkt_begin(); if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) { if (num_fences) { unsigned num_fences_in_iter; /* FENCE_WAIT_MULTI packet supports max 32 fenes */ for (unsigned i = 0; i < num_fences; i = i + 32) { num_fences_in_iter = (i + 32 > num_fences) ? 
num_fences - i : 32; amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0)); amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1)); for (unsigned j = 0; j < num_fences_in_iter; j++) { amdgpu_pkt_add_dw(fence_info[i + j].va); amdgpu_pkt_add_dw(fence_info[i + j].va >> 32); amdgpu_pkt_add_dw(fence_info[i + j].value); amdgpu_pkt_add_dw(fence_info[i + j].value >> 32); } } } amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0)); amdgpu_pkt_add_dw(0x0); amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0)); amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start); amdgpu_pkt_add_dw(cs->chunk_ib[IB_MAIN].va_start >> 32); if (userq->ip_type == AMD_IP_GFX) amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1)); else amdgpu_pkt_add_dw((cs->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) | S_3F3_INHERIT_VMID_MQD_COMPUTE(1)); /* Add 8 for release mem packet and 2 for protected fence signal packet. * Calculcating userq_fence_seq_num this way to match with kernel fence that is * returned in userq_wait iotl. */ userq->user_fence_seq_num = *userq->wptr_bo_map + __num_dw_written + 8 + 2; /* add release mem for user fence */ amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0)); amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) | S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) | S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3)); amdgpu_pkt_add_dw(S_030358_DATA_SEL(2)); amdgpu_pkt_add_dw(userq->user_fence_va); amdgpu_pkt_add_dw(userq->user_fence_va >> 32); amdgpu_pkt_add_dw(userq->user_fence_seq_num); amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32); amdgpu_pkt_add_dw(0); /* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer * is only accessible from kernel through VMID 0. */ amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0)); amdgpu_pkt_add_dw(0); } else { fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type); } amdgpu_pkt_end(); } static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq, struct amdgpu_cs *acs, uint32_t *shared_buf_kms_handles_write, unsigned num_shared_buf_write, uint32_t *shared_buf_kms_handles_read, unsigned num_shared_buf_read, uint64_t *seq_no, uint64_t vm_timeline_point) { int r = 0; struct amdgpu_winsys *aws = acs->aws; struct amdgpu_cs_context *cs = acs->cst; /* Syncobj dependencies. */ unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num; uint32_t *syncobj_dependencies_list = (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t)); /* Currently only 1 vm timeline syncobj can be a dependency. */ uint16_t num_syncobj_timeline_dependencies = 1; uint32_t syncobj_timeline_dependency; uint64_t syncobj_timeline_dependency_point; if (num_syncobj_dependencies) { for (unsigned i = 0; i < num_syncobj_dependencies; i++) { struct amdgpu_fence *fence = (struct amdgpu_fence*)cs->syncobj_dependencies.list[i]; assert(util_queue_fence_is_signalled(&fence->submitted)); syncobj_dependencies_list[i] = fence->syncobj; } } syncobj_timeline_dependency = aws->vm_timeline_syncobj; syncobj_timeline_dependency_point = vm_timeline_point; /* Syncobj signals. Adding 1 for cs submission fence. 
*/
   unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num + 1;
   uint32_t *syncobj_signal_list =
      (uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));

   for (unsigned i = 0; i < cs->syncobj_to_signal.num; i++) {
      struct amdgpu_fence *fence = (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
      syncobj_signal_list[i] = fence->syncobj;
   }
   syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)cs->fence)->syncobj;

   struct drm_amdgpu_userq_fence_info *fence_info;
   struct drm_amdgpu_userq_wait userq_wait_data = {
      .syncobj_handles = (uintptr_t)syncobj_dependencies_list,
      .syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
      .syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
      .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
      .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
      .num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
      .num_fences = 0,
      .num_syncobj_handles = num_syncobj_dependencies,
      .num_bo_read_handles = num_shared_buf_read,
      .num_bo_write_handles = num_shared_buf_write,
      .out_fences = (uintptr_t)NULL,
   };

   /* Buffer sharing synchronization follows these rules:
    * - read-only buffers wait for all previous writes to complete
    * - write-only (and read-write) buffers wait for all previous reads to complete
    * To implement this strategy, we use amdgpu_userq_wait() before submitting
    * a job, and amdgpu_userq_signal() after to indicate completion.
    */
   r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
   if (r)
      fprintf(stderr, "amdgpu: getting wait num_fences failed\n");

   fence_info = (struct drm_amdgpu_userq_fence_info*)
      alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
   userq_wait_data.out_fences = (uintptr_t)fence_info;

   r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
   if (r)
      fprintf(stderr, "amdgpu: getting wait fences failed\n");

   simple_mtx_lock(&userq->lock);
   amdgpu_cs_add_userq_packets(userq, cs, userq_wait_data.num_fences, fence_info);

   struct drm_amdgpu_userq_signal userq_signal_data = {
      .queue_id = userq->userq_handle,
      .syncobj_handles = (uintptr_t)syncobj_signal_list,
      .num_syncobj_handles = num_syncobj_to_signal,
      .bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
      .bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
      .num_bo_read_handles = num_shared_buf_read,
      .num_bo_write_handles = num_shared_buf_write,
   };

   r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
   if (!r)
      userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = *userq->wptr_bo_map;

   *seq_no = userq->user_fence_seq_num;
   simple_mtx_unlock(&userq->lock);

   return r;
}

enum queue_type {
   KERNELQ,
   KERNELQ_ALT_FENCE,
   USERQ,
};

/* The template parameter determines whether the queue should skip code used by the default queue
 * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
 * for all BOs.
 */
template<enum queue_type queue_type>
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
{
   struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
   struct amdgpu_winsys *aws = acs->aws;
   struct amdgpu_cs_context *cs = acs->cst;
   int r;
   uint64_t seq_no = 0;
   bool has_user_fence = amdgpu_cs_has_user_fence(acs);

   /* The maximum timeline point of VM updates for all BOs used in this submit.
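 * Only the userqueue path uses it: before execution, the submit waits on
 * aws->vm_timeline_syncobj at this timeline point so that the VM updates for these BOs
 * have completed.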
*/ uint64_t vm_timeline_point = 0; simple_mtx_lock(&aws->bo_fence_lock); unsigned queue_index; struct amdgpu_queue *queue; uint_seq_no prev_seq_no, next_seq_no; if (queue_type != KERNELQ_ALT_FENCE) { queue_index = acs->queue_index; queue = &aws->queues[queue_index]; prev_seq_no = queue->latest_seq_no; /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno, * but the values aren't related. */ next_seq_no = prev_seq_no + 1; /* Wait for the oldest fence to signal. This should always check the user fence, then wait * via the ioctl. We have to do this because we are going to release the oldest fence and * replace it with the latest fence in the ring. */ struct pipe_fence_handle **oldest_fence = &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE]; if (*oldest_fence) { if (!amdgpu_fence_wait(*oldest_fence, 0, false)) { /* Take the reference because the fence can be released by other threads after we * unlock the mutex. */ struct pipe_fence_handle *tmp_fence = NULL; amdgpu_fence_reference(&tmp_fence, *oldest_fence); /* Unlock the mutex before waiting. */ simple_mtx_unlock(&aws->bo_fence_lock); amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false); amdgpu_fence_reference(&tmp_fence, NULL); simple_mtx_lock(&aws->bo_fence_lock); } /* Remove the idle fence from the ring. */ amdgpu_fence_reference(oldest_fence, NULL); } } /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest * sequence number per queue and removes all older ones. */ struct amdgpu_seq_no_fences seq_no_dependencies; memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies)); if (queue_type != KERNELQ_ALT_FENCE) { /* Add a fence dependency on the previous IB if the IP has multiple physical queues to * make it appear as if it had only 1 queue, or if the previous IB comes from a different * context. The reasons are: * - Our BO fence tracking only supports 1 queue per IP. * - IBs from different contexts must wait for each other and can't execute in a random order. */ struct amdgpu_fence *prev_fence = (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE]; /* Add a dependency on a previous fence, unless we can determine that * it's useless because the execution order is guaranteed. */ if (prev_fence) { bool same_ctx = queue->last_ctx == acs->ctx; /* userqueue submission mode uses a single queue per process. */ bool same_queue = aws->info.ip[acs->ip_type].num_queues > 1 && queue_type != USERQ; if (!same_ctx || !same_queue) add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no); } } /* Since the kernel driver doesn't synchronize execution between different * rings automatically, we have to add fence dependencies manually. This gathers sequence * numbers from BOs and sets the next sequence number in the BOs. */ /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */ struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers; unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers; unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ? 
0 : BITFIELD_BIT(queue_index); for (unsigned i = 0; i < num_slab_entry_buffers; i++) { struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, buffer->usage); if (queue_type == KERNELQ_ALT_FENCE) amdgpu_fence_reference(&bo->alt_fence, cs->fence); else amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); /* We didn't add any slab entries into the real buffer list that will be submitted * to the kernel. Do it now. */ struct amdgpu_cs_buffer *real_buffer = amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b, &cs->buffer_lists[AMDGPU_BO_REAL], false); /* We need to set the usage because it determines the BO priority. */ real_buffer->usage |= buffer->usage; } /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */ unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers; unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers; bool out_of_memory = false; for (unsigned i = 0; i < num_sparse_buffers; i++) { struct amdgpu_cs_buffer *buffer = &sparse_buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, buffer->usage); if (queue_type == KERNELQ_ALT_FENCE) amdgpu_fence_reference(&bo->alt_fence, cs->fence); else amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); /* Add backing buffers of sparse buffers to the buffer list. * * This is done late, during submission, to keep the buffer list short before * submit, and to avoid managing fences for the backing buffers. */ struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo); if (queue_type == USERQ) { uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point); vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point); } simple_mtx_lock(&sparse_bo->commit_lock); list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) { /* We can directly add the buffer here, because we know that each * backing buffer occurs only once. */ struct amdgpu_cs_buffer *real_buffer = amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true); if (!real_buffer) { fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__); simple_mtx_unlock(&sparse_bo->commit_lock); r = -ENOMEM; out_of_memory = true; } real_buffer->usage = buffer->usage; } simple_mtx_unlock(&sparse_bo->commit_lock); } /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */ unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers; struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers; struct drm_amdgpu_bo_list_entry *bo_list; /* BO dependency management depends on the queue mode: * - kernel queue: BO used by the submit are passed to the kernel in a * drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled * automatically by the kernel; intra-process sync is handled by Mesa. * - user queue: intra-process sync is similar. Inter-process sync is handled * using timeline points, amdgpu_userq_wait (before a submit) and * amdgpu_userq_signal (after a submit). */ unsigned num_shared_buf_write; unsigned num_shared_buf_read; /* Store write handles in the begining and read handles at the end in shared_buf_kms_handles. 
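 * For example (illustrative): with 6 real buffers of which two are shared writes and two
 * are shared reads, the array ends up as [W, W, -, -, R, R] and the read handles are
 * passed as &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read].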
* If usage is read and write then store the handle in write list. */ uint32_t *shared_buf_kms_handles; if (queue_type != USERQ) { bo_list = (struct drm_amdgpu_bo_list_entry *) alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry)); } else { num_shared_buf_write = 0; num_shared_buf_read = 0; shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t)); } unsigned i; for (i = 0; i < initial_num_real_buffers; i++) { struct amdgpu_cs_buffer *buffer = &real_buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo, buffer->usage); if (queue_type == KERNELQ_ALT_FENCE) amdgpu_fence_reference(&bo->alt_fence, cs->fence); else amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); if (queue_type != USERQ) { amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage); } else { vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point); if (!get_real_bo(bo)->is_shared) continue; if (buffer->usage & RADEON_USAGE_WRITE) { shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle; num_shared_buf_write++; } else { num_shared_buf_read++; shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = get_real_bo(bo)->kms_handle; } } } /* These are backing buffers of slab entries. Don't add their fence dependencies. */ for (; i < num_real_buffers_except_sparse; i++) { struct amdgpu_cs_buffer *buffer = &real_buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; if (queue_type == KERNELQ_ALT_FENCE) get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true; else amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no); if (queue_type != USERQ) { amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage); } else { vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point); if (!get_real_bo(bo)->is_shared) continue; if (buffer->usage & RADEON_USAGE_WRITE) { shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle; num_shared_buf_write++; } else { num_shared_buf_read++; shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = get_real_bo(bo)->kms_handle; } } } /* Sparse backing BOs are last. Don't update their fences because we don't use them. */ for (; i < num_real_buffers; ++i) { struct amdgpu_cs_buffer *buffer = &real_buffers[i]; if (queue_type != USERQ) { amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage); } else { if (!get_real_bo(buffer->bo)->is_shared) continue; if (buffer->usage & RADEON_USAGE_WRITE) { shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(buffer->bo)->kms_handle; num_shared_buf_write++; } else { num_shared_buf_read++; shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] = get_real_bo(buffer->bo)->kms_handle; } } } #if 0 /* Debug code. */ printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no); /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. 
*/ for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) { if (i == acs->queue_index) continue; struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE]; if (!fence) { if (i <= 1) printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no); continue; } bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i); uint_seq_no old = seq_no_dependencies.seq_no[i]; add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no); uint_seq_no new = seq_no_dependencies.seq_no[i]; if (!valid) printf(" missing dependency on queue=%u, seq_no=%u\n", i, new); else if (old != new) printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new); else printf(" has dependency on queue=%u, seq_no=%u\n", i, old); } #endif /* Convert the sequence numbers we gathered to fence dependencies. */ u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) { struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i); if (fence) { /* If it's idle, don't add it to the list of dependencies. */ if (amdgpu_fence_wait(*fence, 0, false)) amdgpu_fence_reference(fence, NULL); else add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence); } } if (queue_type != KERNELQ_ALT_FENCE) { /* Finally, add the IB fence into the fence ring of the queue. */ amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence); queue->latest_seq_no = next_seq_no; ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no; /* Update the last used context in the queue. */ amdgpu_ctx_reference(&queue->last_ctx, acs->ctx); } simple_mtx_unlock(&aws->bo_fence_lock); #if MESA_DEBUG /* Prepare the buffer list. */ if (aws->debug_all_bos) { /* The buffer list contains all buffers. This is a slow path that * ensures that no buffer is missing in the BO list. */ simple_mtx_lock(&aws->global_bo_list_lock); if (queue_type != USERQ) { bo_list = (struct drm_amdgpu_bo_list_entry *) alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry)); num_real_buffers = 0; list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) { bo_list[num_real_buffers].bo_handle = bo->kms_handle; bo_list[num_real_buffers].bo_priority = 0; ++num_real_buffers; } } else { shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t)); num_shared_buf_write = 0; num_shared_buf_read = 0; list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) { shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle; num_shared_buf_write++; } } simple_mtx_unlock(&aws->global_bo_list_lock); } #endif if (acs->ip_type == AMD_IP_GFX) aws->gfx_bo_list_counter += num_real_buffers; if (out_of_memory) { r = -ENOMEM; } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) { r = -ECANCELED; } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) { r = 0; } else { if (queue_type != USERQ) { /* Submit the command buffer. * * The kernel returns -ENOMEM with many parallel processes using GDS such as test suites * quite often, but it eventually succeeds after enough attempts. This happens frequently * with dEQP using NGG streamout. */ r = 0; do { /* Wait 1 ms and try again. */ if (r == -ENOMEM) os_time_sleep(1000); r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no); } while (r == -ENOMEM); if (!r) { /* Success. 
   if (out_of_memory) {
      r = -ENOMEM;
   } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
      r = -ECANCELED;
   } else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
      r = 0;
   } else {
      if (queue_type != USERQ) {
         /* Submit the command buffer.
          *
          * The kernel returns -ENOMEM quite often with many parallel processes using GDS,
          * such as test suites, but it eventually succeeds after enough attempts. This
          * happens frequently with dEQP using NGG streamout.
          */
         r = 0;

         do {
            /* Wait 1 ms and try again. */
            if (r == -ENOMEM)
               os_time_sleep(1000);

            r = amdgpu_cs_submit_ib_kernelq(acs, num_real_buffers, bo_list, &seq_no);
         } while (r == -ENOMEM);

         if (!r) {
            /* Success. */
            uint64_t *user_fence = NULL;

            /* Need to reserve 4 QWORDs for the user fence:
             *   QWORD[0]: completed fence
             *   QWORD[1]: preempted fence
             *   QWORD[2]: reset fence
             *   QWORD[3]: preempted then reset
             */
            if (has_user_fence)
               user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;

            amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
         }
      } else {
         struct amdgpu_userq *userq = &queue->userq;

         r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
                                       &shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
                                       num_shared_buf_read, &seq_no, vm_timeline_point);
         if (!r) {
            /* Success. */
            amdgpu_fence_submitted(cs->fence, seq_no, userq->user_fence_ptr);
         }
      }
   }

   if (unlikely(r)) {
      if (r == -ECANCELED) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
      } else if (r == -ENODATA) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a soft recovery.\n");
      } else if (r == -ETIME) {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
                                        "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery.\n");
      } else {
         amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_UNKNOWN_CONTEXT_RESET,
                                        "amdgpu: The CS has been rejected, "
                                        "see dmesg for more information (%i).\n", r);
      }
   }

   /* If there was an error, signal the fence, because it won't be signalled
    * by the hardware. */
   if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
      amdgpu_fence_signalled(cs->fence);

   if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
      acs->mcbp_fw_shadow_chunk.flags = 0;

   cs->error_code = r;

   /* Clear the buffer lists. */
   for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
      struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
      unsigned num_buffers = cs->buffer_lists[list].num_buffers;

      if (list == AMDGPU_BO_REAL) {
         /* Only decrement num_active_ioctls and unref where we incremented them.
          * We did both for regular real BOs. We only incremented the refcount for sparse
          * backing BOs.
          */
         /* Regular real BOs. */
         for (unsigned i = 0; i < initial_num_real_buffers; i++) {
            p_atomic_dec(&buffers[i].bo->num_active_ioctls);
            amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
         }

         /* Do nothing for slab BOs. */

         /* Sparse backing BOs. */
         for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
            amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
      } else {
         for (unsigned i = 0; i < num_buffers; i++) {
            p_atomic_dec(&buffers[i].bo->num_active_ioctls);
            amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
         }
      }

      cs->buffer_lists[list].num_buffers = 0;
   }

   amdgpu_cs_context_cleanup(aws, cs);
}
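
/* amdgpu_cs_submit_ib runs asynchronously in the winsys thread (it is queued on
 * aws->cs_queue by amdgpu_cs_flush below). Its result is stored in the submitted
 * context's error_code and is only safe to read after waiting on flush_completed,
 * e.g. via amdgpu_cs_sync_flush().
 */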

/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   util_queue_fence_wait(&cs->flush_completed);
}

static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, unsigned flags,
                           struct pipe_fence_handle **fence)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *aws = cs->aws;
   int error_code = 0;
   uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;

   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);

   /* Pad the IB according to the mask. */
   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      if (aws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, SDMA_NOP_PAD);
      }
      break;
   case AMD_IP_GFX:
   case AMD_IP_COMPUTE:
      amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
      if (cs->ip_type == AMD_IP_GFX)
         aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
      break;
   case AMD_IP_UVD:
   case AMD_IP_UVD_ENC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   case AMD_IP_VCN_JPEG:
      if (rcs->current.cdw % 2)
         assert(0);
      while (rcs->current.cdw & ib_pad_dw_mask) {
         radeon_emit(rcs, 0x60000000); /* nop packet */
         radeon_emit(rcs, 0x00000000);
      }
      break;
   case AMD_IP_VCN_DEC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "amdgpu: command stream overflowed\n");
   }

   /* Submit only if the CS is not empty, hasn't overflowed, and isn't a no-op flush. */
   if (likely(radeon_emitted(rcs, 0) &&
              rcs->current.cdw <= rcs->current.max_dw &&
              !(flags & RADEON_FLUSH_NOOP))) {
      struct amdgpu_cs_context *cur = cs->csc;

      /* Set IB sizes. */
      amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);

      /* Create a fence. */
      amdgpu_fence_reference(&cur->fence, NULL);
      if (cs->next_fence) {
         /* Just move the reference. */
         cur->fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         cur->fence = amdgpu_fence_create(cs);
      }
      if (fence)
         amdgpu_fence_reference(fence, cur->fence);

      for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
         unsigned num_buffers = cur->buffer_lists[i].num_buffers;
         struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;

         for (unsigned j = 0; j < num_buffers; j++)
            p_atomic_inc(&buffers[j].bo->num_active_ioctls);
      }

      amdgpu_cs_sync_flush(rcs);

      cur->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */

      if (cs->noop && cs->ip_type == AMD_IP_GFX) {
         /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
         unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
         assert(cur->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);

         cur->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
         cur->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
      }

      /* Swap command streams. "cst" is going to be submitted. */
      rcs->csc = cs->csc = cs->cst;
      cs->cst = cur;
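      /* The submission job below runs on the winsys thread; the callback is chosen per
       * queue type: user queues on chips that support them, otherwise the kernel queue,
       * with a separate path for kernel queues that track buffers with alternative fences.
       */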
      /* Only gfx, compute and sdma queues are supported in userqueues. */
      if (aws->info.use_userq && cs->ip_type <= AMD_IP_SDMA) {
         util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
                            amdgpu_cs_submit_ib<USERQ>, NULL, 0);
      } else {
         util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
                            cs->uses_alt_fence ? amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
                                               : amdgpu_cs_submit_ib<KERNELQ>, NULL, 0);
      }

      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->cst->secure;
      else
         cs->csc->secure = cs->cst->secure;

      if (!(flags & PIPE_FLUSH_ASYNC)) {
         amdgpu_cs_sync_flush(rcs);
         error_code = cur->error_code;
      }
   } else {
      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->csc->secure;

      amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
      amdgpu_cs_context_cleanup(aws, cs->csc);
   }

   memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);

   if (cs->preamble_ib_bo) {
      amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
                           RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
   }

   if (cs->ip_type == AMD_IP_GFX)
      aws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      aws->num_sdma_IBs++;

   return error_code;
}

static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer_lean *_buf,
                                    unsigned usage)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;

   return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
}

static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
                                                uint64_t csa_va)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
   cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
   cs->mcbp_fw_shadow_chunk.gds_va = 0;
   cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
}

static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
                                          struct pipe_fence_handle **dst,
                                          struct pipe_fence_handle *src)
{
   amdgpu_fence_reference(dst, src);
}

void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
{
   sws->base.ctx_create = amdgpu_ctx_create;
   sws->base.ctx_destroy = amdgpu_ctx_destroy;
   sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
   sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
   sws->base.cs_create = amdgpu_cs_create;
   sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
   sws->base.cs_destroy = amdgpu_cs_destroy;
   sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
   sws->base.cs_validate = amdgpu_cs_validate;
   sws->base.cs_check_space = amdgpu_cs_check_space;
   sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
   sws->base.cs_flush = amdgpu_cs_flush;
   sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
   sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
   sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
   sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
   sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
   sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
   sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
   sws->base.fence_reference = amdgpu_winsys_fence_reference;
   sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
   sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
   sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
   sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;

   if (sws->aws->info.has_fw_based_shadowing)
      sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
}
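
#if 0 /* Usage sketch (never compiled): how a driver might drive the entry points installed
       * above. "rws" and "rcs" are assumed to be a radeon_winsys and a radeon_cmdbuf created
       * elsewhere; the function name is illustrative only. Flags and timeouts follow the
       * existing winsys conventions (PIPE_FLUSH_ASYNC, OS_TIMEOUT_INFINITE). */
static void example_flush_and_wait(struct radeon_winsys *rws, struct radeon_cmdbuf *rcs)
{
   struct pipe_fence_handle *fence = NULL;

   /* Queue the submission asynchronously and get a fence for it. */
   rws->cs_flush(rcs, PIPE_FLUSH_ASYNC, &fence);

   /* Block until the GPU has consumed the IB, then drop the fence reference. */
   rws->fence_wait(rws, fence, OS_TIMEOUT_INFINITE);
   rws->fence_reference(rws, &fence, NULL);
}
#endif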