/* * Copyright 2018 Advanced Micro Devices, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* This file implements tests on the si_clearbuffer function. */ #include "si_pipe.h" #include "si_query.h" #define MIN_SIZE 512 #define MAX_SIZE (128 * 1024 * 1024) #define SIZE_SHIFT 1 #define NUM_RUNS 128 static double get_MBps_rate(unsigned num_bytes, unsigned ns) { return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); } void si_test_dma_perf(struct si_screen *sscreen) { struct pipe_screen *screen = &sscreen->b; struct pipe_context *ctx = screen->context_create(screen, NULL, 0); struct si_context *sctx = (struct si_context *)ctx; const uint32_t clear_value = 0x12345678; static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16}; #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) #define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) static const char *method_str[] = { "CP MC ", "CP L2 ", "CP L2 ", }; static const char *placement_str[] = { /* Clear */ "fill->VRAM", "fill->GTT ", /* Copy */ "VRAM->VRAM", "VRAM->GTT ", "GTT ->VRAM", }; printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); printf("Heap ,Method ,L2p,Wa,"); for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { if (size >= 1024) printf("%6uKB,", size / 1024); else printf(" %6uB,", size); } printf("\n"); /* results[log2(size)][placement][method][] */ struct si_result { bool is_valid; bool is_cp; bool is_cs; unsigned cache_policy; unsigned dwords_per_thread; unsigned waves_per_sh; unsigned score; unsigned index; /* index in results[x][y][index] */ } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; /* Run benchmarks. */ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { bool is_copy = placement >= 2; printf("-----------,--------,---,--,"); for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) printf("--------,"); printf("\n"); for (unsigned method = 0; method < NUM_METHODS; method++) { bool test_cp = method <= 2; bool test_cs = method >= 3; unsigned cs_method = method - 3; unsigned cs_waves_per_sh = test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0; cs_method %= 3 * NUM_SHADERS; unsigned cache_policy = test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0; unsigned cs_dwords_per_thread = test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; if (sctx->gfx_level == GFX6) { /* GFX6 doesn't support CP DMA operations through L2. */ if (test_cp && cache_policy != L2_BYPASS) continue; /* WAVES_PER_SH is in multiples of 16 on GFX6. */ if (test_cs && cs_waves_per_sh % 16 != 0) continue; } /* SI_RESOURCE_FLAG_GL2_BYPASS setting RADEON_FLAG_GL2_BYPASS doesn't affect * chips before gfx9. */ if (test_cs && cache_policy && sctx->gfx_level < GFX9) continue; printf("%s ,", placement_str[placement]); if (test_cs) { printf("CS x%-4u,%3s,", cs_dwords_per_thread, cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : ""); } else { printf("%s,%3s,", method_str[method], method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : ""); } if (test_cs && cs_waves_per_sh) printf("%2u,", cs_waves_per_sh); else printf(" ,"); void *compute_shader = NULL; if (test_cs) { compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, cache_policy == L2_STREAM, is_copy); } double score = 0; for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { /* Don't test bigger sizes if it's too slow. Print 0. */ if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) { printf("%7.0f ,", 0.0); continue; } enum pipe_resource_usage dst_usage, src_usage; struct pipe_resource *dst, *src; unsigned query_type = PIPE_QUERY_TIME_ELAPSED; unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_GL2_BYPASS : 0; if (placement == 0 || placement == 2 || placement == 4) dst_usage = PIPE_USAGE_DEFAULT; else dst_usage = PIPE_USAGE_STREAM; if (placement == 2 || placement == 3) src_usage = PIPE_USAGE_DEFAULT; else src_usage = PIPE_USAGE_STREAM; dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256); src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL; /* Wait for idle before testing, so that other processes don't mess up the results. */ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB; sctx->emit_cache_flush(sctx, &sctx->gfx_cs); struct pipe_query *q = ctx->create_query(ctx, query_type, 0); ctx->begin_query(ctx, q); /* Run tests. */ for (unsigned iter = 0; iter < NUM_RUNS; iter++) { if (test_cp) { /* CP DMA */ if (is_copy) { si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE, cache_policy); } else { si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value, SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE, cache_policy); } } else { /* Compute */ /* The memory accesses are coalesced, meaning that the 1st instruction writes * the 1st contiguous block of data for the whole wave, the 2nd instruction * writes the 2nd contiguous block of data, etc. */ unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; unsigned dwords_per_wave = cs_dwords_per_thread * 64; unsigned num_dwords = size / 4; unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); struct pipe_grid_info info = {}; info.block[0] = MIN2(64, num_instructions); info.block[1] = 1; info.block[2] = 1; info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); info.grid[1] = 1; info.grid[2] = 1; struct pipe_shader_buffer sb[2] = {}; sb[0].buffer = dst; sb[0].buffer_size = size; if (is_copy) { sb[1].buffer = src; sb[1].buffer_size = size; } else { for (unsigned i = 0; i < 4; i++) sctx->cs_user_data[i] = clear_value; } ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); ctx->bind_compute_state(ctx, compute_shader); sctx->cs_max_waves_per_sh = cs_waves_per_sh; ctx->launch_grid(ctx, &info); ctx->bind_compute_state(ctx, NULL); sctx->cs_max_waves_per_sh = 0; /* disable the limit */ } /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */ sctx->flags |= SI_CONTEXT_INV_VCACHE | (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) | SI_CONTEXT_CS_PARTIAL_FLUSH; sctx->emit_cache_flush(sctx, &sctx->gfx_cs); } ctx->end_query(ctx, q); ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); pipe_resource_reference(&dst, NULL); pipe_resource_reference(&src, NULL); /* Get results. */ union pipe_query_result result; ctx->get_query_result(ctx, q, true, &result); ctx->destroy_query(ctx, q); score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS); printf("%7.0f ,", score); fflush(stdout); struct si_result *r = &results[util_logbase2(size)][placement][method]; r->is_valid = true; r->is_cp = test_cp; r->is_cs = test_cs; r->cache_policy = cache_policy; r->dwords_per_thread = cs_dwords_per_thread; r->waves_per_sh = cs_waves_per_sh; r->score = score; r->index = method; } puts(""); if (compute_shader) ctx->delete_compute_state(ctx, compute_shader); } } puts(""); puts("static struct si_method"); printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool " "cached)\n", sctx->screen->info.name); puts("{"); puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); /* Analyze results and find the best methods. */ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { if (placement == 0) puts(" if (dst == RADEON_DOMAIN_VRAM) {"); else if (placement == 1) puts(" } else { /* GTT */"); else if (placement == 2) { puts("}"); puts(""); puts("static struct si_method"); printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", sctx->screen->info.name); printf(" uint64_t size64, bool async, bool cached)\n"); puts("{"); puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); } else if (placement == 3) puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); else puts(" } else { /* GTT -> VRAM */"); for (unsigned mode = 0; mode < 3; mode++) { bool async = mode == 0; bool cached = mode == 1; if (async) puts(" if (async) { /* async compute */"); else if (cached) puts(" if (cached) { /* gfx ring */"); else puts(" } else { /* gfx ring - uncached */"); /* The list of best chosen methods. */ struct si_result *methods[32]; unsigned method_max_size[32]; unsigned num_methods = 0; for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { /* Find the best method. */ struct si_result *best = NULL; for (unsigned i = 0; i < NUM_METHODS; i++) { struct si_result *r = &results[util_logbase2(size)][placement][i]; if (!r->is_valid) continue; /* Ban CP DMA clears via MC on <= GFX8. They are super slow * on GTT, which we can get due to BO evictions. */ if (sctx->gfx_level <= GFX8 && placement == 1 && r->is_cp && r->cache_policy == L2_BYPASS) continue; if (async) { /* The following constraints for compute IBs try to limit * resource usage so as not to decrease the performance * of gfx IBs too much. */ /* Don't use CP DMA on asynchronous rings, because * the engine is shared with gfx IBs. */ if (r->is_cp) continue; /* Don't use L2 caching on asynchronous rings to minimize * L2 usage. */ if (r->cache_policy == L2_LRU) continue; /* Asynchronous compute recommends waves_per_sh != 0 * to limit CU usage. */ if (r->is_cs && r->waves_per_sh == 0) continue; } else { if (cached && r->cache_policy == L2_BYPASS) continue; if (!cached && r->cache_policy == L2_LRU) continue; } if (!best) { best = r; continue; } /* Assume some measurement error. Earlier methods occupy fewer * resources, so the next method is always more greedy, and we * don't want to select it due to a measurement error. */ double min_improvement = 1.03; if (best->score * min_improvement < r->score) best = r; } if (num_methods > 0) { unsigned prev_index = num_methods - 1; struct si_result *prev = methods[prev_index]; struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index]; /* If the best one is also the best for the previous size, * just bump the size for the previous one. * * If there is no best, it means all methods were too slow * for this size and were not tested. Use the best one for * the previous size. */ if (!best || /* If it's the same method as for the previous size: */ (prev->is_cp == best->is_cp && prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy && prev->dwords_per_thread == best->dwords_per_thread && prev->waves_per_sh == best->waves_per_sh) || /* If the method for the previous size is also the best * for this size: */ (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) { method_max_size[prev_index] = size; continue; } } /* Add it to the list. */ assert(num_methods < ARRAY_SIZE(methods)); methods[num_methods] = best; method_max_size[num_methods] = size; num_methods++; } for (unsigned i = 0; i < num_methods; i++) { struct si_result *best = methods[i]; unsigned size = method_max_size[i]; /* The size threshold is between the current benchmarked * size and the next benchmarked size. */ if (i < num_methods - 1) printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); else if (i > 0) printf(" else "); else printf(" "); printf("return "); assert(best); const char *cache_policy_str = best->cache_policy == L2_BYPASS ? "L2_BYPASS" : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"; if (best->is_cp) { printf("CP_DMA(%s);\n", cache_policy_str); } if (best->is_cs) { printf("COMPUTE(%s, %u, %u);\n", cache_policy_str, best->dwords_per_thread, best->waves_per_sh); } } } puts(" }"); } puts(" }"); puts("}"); ctx->destroy(ctx); exit(0); }