• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* This file implements tests on the si_clearbuffer function. */
8 
9 #include "si_pipe.h"
10 #include "si_query.h"
11 
12 #define MIN_SIZE   512
13 #define MAX_SIZE   (128 * 1024 * 1024)
14 #define SIZE_SHIFT 1
15 #define NUM_RUNS   128
16 
/**
 * Convert a transfer size and elapsed time into a MB/s rate.
 *
 * \param num_bytes  number of bytes transferred in one run
 * \param ns         elapsed time in nanoseconds; taken as double because the
 *                   caller passes an averaged (fractional) per-run time
 *                   (result.u64 / NUM_RUNS) — an unsigned parameter would
 *                   truncate it and wrap for times above UINT_MAX ns (~4.3 s)
 * \return transfer rate in MB/s (1 MB = 1024 * 1024 bytes)
 */
static double get_MBps_rate(unsigned num_bytes, double ns)
{
   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
21 
si_test_dma_perf(struct si_screen * sscreen)22 void si_test_dma_perf(struct si_screen *sscreen)
23 {
24    struct pipe_screen *screen = &sscreen->b;
25    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
26    struct si_context *sctx = (struct si_context *)ctx;
27    const uint32_t clear_value = 0x12345678;
28    static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
29    static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
30 
31 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
32 #define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
33 
34    static const char *method_str[] = {
35       "CP MC   ",
36       "CP L2   ",
37       "CP L2   ",
38    };
39    static const char *placement_str[] = {
40       /* Clear */
41       "fill->VRAM",
42       "fill->GTT ",
43       /* Copy */
44       "VRAM->VRAM",
45       "VRAM->GTT ",
46       "GTT ->VRAM",
47    };
48 
49    printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
50    printf("Heap       ,Method  ,L2p,Wa,");
51    for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
52       if (size >= 1024)
53          printf("%6uKB,", size / 1024);
54       else
55          printf(" %6uB,", size);
56    }
57    printf("\n");
58 
59    /* results[log2(size)][placement][method][] */
60    struct si_result {
61       bool is_valid;
62       bool is_cp;
63       bool is_cs;
64       unsigned cache_policy;
65       unsigned dwords_per_thread;
66       unsigned waves_per_sh;
67       unsigned score;
68       unsigned index; /* index in results[x][y][index] */
69    } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
70 
71    /* Run benchmarks. */
72    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
73       bool is_copy = placement >= 2;
74 
75       printf("-----------,--------,---,--,");
76       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
77          printf("--------,");
78       printf("\n");
79 
80       for (unsigned method = 0; method < NUM_METHODS; method++) {
81          bool test_cp = method <= 2;
82          bool test_cs = method >= 3;
83          unsigned cs_method = method - 3;
84          unsigned cs_waves_per_sh =
85             test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
86          cs_method %= 3 * NUM_SHADERS;
87          unsigned cache_policy =
88             test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
89          unsigned cs_dwords_per_thread =
90             test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
91 
92          if (sctx->gfx_level == GFX6) {
93             /* GFX6 doesn't support CP DMA operations through L2. */
94             if (test_cp && cache_policy != L2_BYPASS)
95                continue;
96             /* WAVES_PER_SH is in multiples of 16 on GFX6. */
97             if (test_cs && cs_waves_per_sh % 16 != 0)
98                continue;
99          }
100 
101          /* SI_RESOURCE_FLAG_GL2_BYPASS setting RADEON_FLAG_GL2_BYPASS doesn't affect
102           * chips before gfx9.
103           */
104          if (test_cs && cache_policy && sctx->gfx_level < GFX9)
105             continue;
106 
107          printf("%s ,", placement_str[placement]);
108          if (test_cs) {
109             printf("CS x%-4u,%3s,", cs_dwords_per_thread,
110                    cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
111          } else {
112             printf("%s,%3s,", method_str[method],
113                    method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
114          }
115          if (test_cs && cs_waves_per_sh)
116             printf("%2u,", cs_waves_per_sh);
117          else
118             printf("  ,");
119 
120          void *compute_shader = NULL;
121          if (test_cs) {
122             compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
123                                               cache_policy == L2_STREAM, is_copy);
124          }
125 
126          double score = 0;
127          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
128             /* Don't test bigger sizes if it's too slow. Print 0. */
129             if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
130                printf("%7.0f ,", 0.0);
131                continue;
132             }
133 
134             enum pipe_resource_usage dst_usage, src_usage;
135             struct pipe_resource *dst, *src;
136             unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
137             unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_GL2_BYPASS : 0;
138 
139             if (placement == 0 || placement == 2 || placement == 4)
140                dst_usage = PIPE_USAGE_DEFAULT;
141             else
142                dst_usage = PIPE_USAGE_STREAM;
143 
144             if (placement == 2 || placement == 3)
145                src_usage = PIPE_USAGE_DEFAULT;
146             else
147                src_usage = PIPE_USAGE_STREAM;
148 
149             dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
150             src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
151 
152             /* Wait for idle before testing, so that other processes don't mess up the results. */
153             sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
154                            SI_CONTEXT_FLUSH_AND_INV_CB |
155                            SI_CONTEXT_FLUSH_AND_INV_DB;
156             si_emit_cache_flush_direct(sctx);
157 
158             struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
159             ctx->begin_query(ctx, q);
160 
161             /* Run tests. */
162             for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
163                if (test_cp) {
164                   /* CP DMA */
165                   if (is_copy) {
166                      si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER,
167                                            SI_COHERENCY_NONE, cache_policy);
168                   } else {
169                      si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value,
170                                             SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE,
171                                             cache_policy);
172                   }
173                } else {
174                   /* Compute */
175                   /* The memory accesses are coalesced, meaning that the 1st instruction writes
176                    * the 1st contiguous block of data for the whole wave, the 2nd instruction
177                    * writes the 2nd contiguous block of data, etc.
178                    */
179                   unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
180                   unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
181                   unsigned dwords_per_wave = cs_dwords_per_thread * 64;
182 
183                   unsigned num_dwords = size / 4;
184                   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
185 
186                   struct pipe_grid_info info = {};
187                   info.block[0] = MIN2(64, num_instructions);
188                   info.block[1] = 1;
189                   info.block[2] = 1;
190                   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
191                   info.grid[1] = 1;
192                   info.grid[2] = 1;
193 
194                   struct pipe_shader_buffer sb[2] = {};
195                   sb[0].buffer = dst;
196                   sb[0].buffer_size = size;
197 
198                   if (is_copy) {
199                      sb[1].buffer = src;
200                      sb[1].buffer_size = size;
201                   } else {
202                      for (unsigned i = 0; i < 4; i++)
203                         sctx->cs_user_data[i] = clear_value;
204                   }
205 
206                   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
207                   ctx->bind_compute_state(ctx, compute_shader);
208                   sctx->cs_max_waves_per_sh = cs_waves_per_sh;
209 
210                   ctx->launch_grid(ctx, &info);
211 
212                   ctx->bind_compute_state(ctx, NULL);
213                   sctx->cs_max_waves_per_sh = 0; /* disable the limit */
214                }
215 
216                /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
217                sctx->flags |= SI_CONTEXT_INV_VCACHE |
218                               (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
219                               SI_CONTEXT_CS_PARTIAL_FLUSH;
220                si_emit_cache_flush_direct(sctx);
221             }
222 
223             ctx->end_query(ctx, q);
224             ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
225 
226             pipe_resource_reference(&dst, NULL);
227             pipe_resource_reference(&src, NULL);
228 
229             /* Get results. */
230 
231             union pipe_query_result result;
232 
233             ctx->get_query_result(ctx, q, true, &result);
234             ctx->destroy_query(ctx, q);
235 
236             score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
237             printf("%7.0f ,", score);
238             fflush(stdout);
239 
240             struct si_result *r = &results[util_logbase2(size)][placement][method];
241             r->is_valid = true;
242             r->is_cp = test_cp;
243             r->is_cs = test_cs;
244             r->cache_policy = cache_policy;
245             r->dwords_per_thread = cs_dwords_per_thread;
246             r->waves_per_sh = cs_waves_per_sh;
247             r->score = score;
248             r->index = method;
249          }
250          puts("");
251 
252          if (compute_shader)
253             ctx->delete_compute_state(ctx, compute_shader);
254       }
255    }
256 
257    puts("");
258    puts("static struct si_method");
259    printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
260           "cached)\n",
261           sctx->screen->info.name);
262    puts("{");
263    puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
264 
265    /* Analyze results and find the best methods. */
266    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
267       if (placement == 0)
268          puts("   if (dst == RADEON_DOMAIN_VRAM) {");
269       else if (placement == 1)
270          puts("   } else { /* GTT */");
271       else if (placement == 2) {
272          puts("}");
273          puts("");
274          puts("static struct si_method");
275          printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
276                 sctx->screen->info.name);
277          printf("                     uint64_t size64, bool async, bool cached)\n");
278          puts("{");
279          puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
280          puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
281       } else if (placement == 3)
282          puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
283       else
284          puts("   } else { /* GTT -> VRAM */");
285 
286       for (unsigned mode = 0; mode < 3; mode++) {
287          bool async = mode == 0;
288          bool cached = mode == 1;
289 
290          if (async)
291             puts("      if (async) { /* async compute */");
292          else if (cached)
293             puts("      if (cached) { /* gfx ring */");
294          else
295             puts("      } else { /* gfx ring - uncached */");
296 
297          /* The list of best chosen methods. */
298          struct si_result *methods[32];
299          unsigned method_max_size[32];
300          unsigned num_methods = 0;
301 
302          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
303             /* Find the best method. */
304             struct si_result *best = NULL;
305 
306             for (unsigned i = 0; i < NUM_METHODS; i++) {
307                struct si_result *r = &results[util_logbase2(size)][placement][i];
308 
309                if (!r->is_valid)
310                   continue;
311 
312                /* Ban CP DMA clears via MC on <= GFX8. They are super slow
313                 * on GTT, which we can get due to BO evictions.
314                 */
315                if (sctx->gfx_level <= GFX8 && placement == 1 && r->is_cp &&
316                    r->cache_policy == L2_BYPASS)
317                   continue;
318 
319                if (async) {
320                   /* The following constraints for compute IBs try to limit
321                    * resource usage so as not to decrease the performance
322                    * of gfx IBs too much.
323                    */
324 
325                   /* Don't use CP DMA on asynchronous rings, because
326                    * the engine is shared with gfx IBs.
327                    */
328                   if (r->is_cp)
329                      continue;
330 
331                   /* Don't use L2 caching on asynchronous rings to minimize
332                    * L2 usage.
333                    */
334                   if (r->cache_policy == L2_LRU)
335                      continue;
336 
337                   /* Asynchronous compute recommends waves_per_sh != 0
338                    * to limit CU usage. */
339                   if (r->is_cs && r->waves_per_sh == 0)
340                      continue;
341                } else {
342                   if (cached && r->cache_policy == L2_BYPASS)
343                      continue;
344                   if (!cached && r->cache_policy == L2_LRU)
345                      continue;
346                }
347 
348                if (!best) {
349                   best = r;
350                   continue;
351                }
352 
353                /* Assume some measurement error. Earlier methods occupy fewer
354                 * resources, so the next method is always more greedy, and we
355                 * don't want to select it due to a measurement error.
356                 */
357                double min_improvement = 1.03;
358 
359                if (best->score * min_improvement < r->score)
360                   best = r;
361             }
362 
363             if (num_methods > 0) {
364                unsigned prev_index = num_methods - 1;
365                struct si_result *prev = methods[prev_index];
366                struct si_result *prev_this_size =
367                   &results[util_logbase2(size)][placement][prev->index];
368 
369                /* If the best one is also the best for the previous size,
370                 * just bump the size for the previous one.
371                 *
372                 * If there is no best, it means all methods were too slow
373                 * for this size and were not tested. Use the best one for
374                 * the previous size.
375                 */
376                if (!best ||
377                    /* If it's the same method as for the previous size: */
378                    (prev->is_cp == best->is_cp &&
379                     prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
380                     prev->dwords_per_thread == best->dwords_per_thread &&
381                     prev->waves_per_sh == best->waves_per_sh) ||
382                    /* If the method for the previous size is also the best
383                     * for this size: */
384                    (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
385                   method_max_size[prev_index] = size;
386                   continue;
387                }
388             }
389 
390             /* Add it to the list. */
391             assert(num_methods < ARRAY_SIZE(methods));
392             methods[num_methods] = best;
393             method_max_size[num_methods] = size;
394             num_methods++;
395          }
396 
397          for (unsigned i = 0; i < num_methods; i++) {
398             struct si_result *best = methods[i];
399             unsigned size = method_max_size[i];
400 
401             /* The size threshold is between the current benchmarked
402              * size and the next benchmarked size. */
403             if (i < num_methods - 1)
404                printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
405             else if (i > 0)
406                printf("         else                   ");
407             else
408                printf("         ");
409             printf("return ");
410 
411             assert(best);
412             const char *cache_policy_str =
413                best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
414                best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";
415 
416             if (best->is_cp) {
417                printf("CP_DMA(%s);\n", cache_policy_str);
418             }
419             if (best->is_cs) {
420                printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
421                       best->dwords_per_thread, best->waves_per_sh);
422             }
423          }
424       }
425       puts("      }");
426    }
427    puts("   }");
428    puts("}");
429 
430    ctx->destroy(ctx);
431    exit(0);
432 }
433