1 /*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 */
25
/* This file implements a benchmark of buffer clears and copies done with
 * CP DMA and with compute shaders.
 */
27
28 #include "si_pipe.h"
29 #include "si_query.h"
30
/* Smallest and largest buffer sizes that are benchmarked, in bytes. */
#define MIN_SIZE 512
#define MAX_SIZE (128 * 1024 * 1024)
/* Each benchmarked size is the previous one shifted left by this many bits. */
#define SIZE_SHIFT 1
/* Number of back-to-back operations that are timed by a single query. */
#define NUM_RUNS 128
35
/**
 * Convert a byte count and a duration into a transfer rate.
 *
 * \param num_bytes  number of bytes transferred
 * \param ns         elapsed time in nanoseconds
 * \return the rate in MB/s (1 MB = 1024 * 1024 bytes), or 0 if \p ns is 0
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   /* A zero duration means there is no usable measurement. Return 0 instead
    * of dividing by zero, which would produce an infinite rate that cannot
    * be stored in an integer score.
    */
   if (ns == 0)
      return 0.0;

   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
40
si_test_dma_perf(struct si_screen * sscreen)41 void si_test_dma_perf(struct si_screen *sscreen)
42 {
43 struct pipe_screen *screen = &sscreen->b;
44 struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
45 struct si_context *sctx = (struct si_context *)ctx;
46 const uint32_t clear_value = 0x12345678;
47 static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
48 static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
49
50 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
51 #define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
52
53 static const char *method_str[] = {
54 "CP MC ",
55 "CP L2 ",
56 "CP L2 ",
57 };
58 static const char *placement_str[] = {
59 /* Clear */
60 "fill->VRAM",
61 "fill->GTT ",
62 /* Copy */
63 "VRAM->VRAM",
64 "VRAM->GTT ",
65 "GTT ->VRAM",
66 };
67
68 printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
69 printf("Heap ,Method ,L2p,Wa,");
70 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
71 if (size >= 1024)
72 printf("%6uKB,", size / 1024);
73 else
74 printf(" %6uB,", size);
75 }
76 printf("\n");
77
78 /* results[log2(size)][placement][method][] */
79 struct si_result {
80 bool is_valid;
81 bool is_cp;
82 bool is_cs;
83 unsigned cache_policy;
84 unsigned dwords_per_thread;
85 unsigned waves_per_sh;
86 unsigned score;
87 unsigned index; /* index in results[x][y][index] */
88 } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
89
90 /* Run benchmarks. */
91 for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
92 bool is_copy = placement >= 2;
93
94 printf("-----------,--------,---,--,");
95 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
96 printf("--------,");
97 printf("\n");
98
99 for (unsigned method = 0; method < NUM_METHODS; method++) {
100 bool test_cp = method <= 2;
101 bool test_cs = method >= 3;
102 unsigned cs_method = method - 3;
103 unsigned cs_waves_per_sh =
104 test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
105 cs_method %= 3 * NUM_SHADERS;
106 unsigned cache_policy =
107 test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
108 unsigned cs_dwords_per_thread =
109 test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
110
111 if (sctx->chip_class == GFX6) {
112 /* GFX6 doesn't support CP DMA operations through L2. */
113 if (test_cp && cache_policy != L2_BYPASS)
114 continue;
115 /* WAVES_PER_SH is in multiples of 16 on GFX6. */
116 if (test_cs && cs_waves_per_sh % 16 != 0)
117 continue;
118 }
119
120 /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
121 * chips before gfx9.
122 */
123 if (test_cs && cache_policy && sctx->chip_class < GFX9)
124 continue;
125
126 printf("%s ,", placement_str[placement]);
127 if (test_cs) {
128 printf("CS x%-4u,%3s,", cs_dwords_per_thread,
129 cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
130 } else {
131 printf("%s,%3s,", method_str[method],
132 method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
133 }
134 if (test_cs && cs_waves_per_sh)
135 printf("%2u,", cs_waves_per_sh);
136 else
137 printf(" ,");
138
139 void *compute_shader = NULL;
140 if (test_cs) {
141 compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
142 cache_policy == L2_STREAM, is_copy);
143 }
144
145 double score = 0;
146 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
147 /* Don't test bigger sizes if it's too slow. Print 0. */
148 if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
149 printf("%7.0f ,", 0.0);
150 continue;
151 }
152
153 enum pipe_resource_usage dst_usage, src_usage;
154 struct pipe_resource *dst, *src;
155 unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
156 unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
157
158 if (placement == 0 || placement == 2 || placement == 4)
159 dst_usage = PIPE_USAGE_DEFAULT;
160 else
161 dst_usage = PIPE_USAGE_STREAM;
162
163 if (placement == 2 || placement == 3)
164 src_usage = PIPE_USAGE_DEFAULT;
165 else
166 src_usage = PIPE_USAGE_STREAM;
167
168 dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
169 src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
170
171 /* Wait for idle before testing, so that other processes don't mess up the results. */
172 sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
173 SI_CONTEXT_FLUSH_AND_INV_CB |
174 SI_CONTEXT_FLUSH_AND_INV_DB;
175 sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
176
177 struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
178 ctx->begin_query(ctx, q);
179
180 /* Run tests. */
181 for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
182 if (test_cp) {
183 /* CP DMA */
184 if (is_copy) {
185 si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER,
186 SI_COHERENCY_NONE, cache_policy);
187 } else {
188 si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value,
189 SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE,
190 cache_policy);
191 }
192 } else {
193 /* Compute */
194 /* The memory accesses are coalesced, meaning that the 1st instruction writes
195 * the 1st contiguous block of data for the whole wave, the 2nd instruction
196 * writes the 2nd contiguous block of data, etc.
197 */
198 unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
199 unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
200 unsigned dwords_per_wave = cs_dwords_per_thread * 64;
201
202 unsigned num_dwords = size / 4;
203 unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
204
205 struct pipe_grid_info info = {};
206 info.block[0] = MIN2(64, num_instructions);
207 info.block[1] = 1;
208 info.block[2] = 1;
209 info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
210 info.grid[1] = 1;
211 info.grid[2] = 1;
212
213 struct pipe_shader_buffer sb[2] = {};
214 sb[0].buffer = dst;
215 sb[0].buffer_size = size;
216
217 if (is_copy) {
218 sb[1].buffer = src;
219 sb[1].buffer_size = size;
220 } else {
221 for (unsigned i = 0; i < 4; i++)
222 sctx->cs_user_data[i] = clear_value;
223 }
224
225 ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
226 ctx->bind_compute_state(ctx, compute_shader);
227 sctx->cs_max_waves_per_sh = cs_waves_per_sh;
228
229 ctx->launch_grid(ctx, &info);
230
231 ctx->bind_compute_state(ctx, NULL);
232 sctx->cs_max_waves_per_sh = 0; /* disable the limit */
233 }
234
235 /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
236 sctx->flags |= SI_CONTEXT_INV_VCACHE |
237 (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
238 SI_CONTEXT_CS_PARTIAL_FLUSH;
239 sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
240 }
241
242 ctx->end_query(ctx, q);
243 ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
244
245 pipe_resource_reference(&dst, NULL);
246 pipe_resource_reference(&src, NULL);
247
248 /* Get results. */
249
250 union pipe_query_result result;
251
252 ctx->get_query_result(ctx, q, true, &result);
253 ctx->destroy_query(ctx, q);
254
255 score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
256 printf("%7.0f ,", score);
257 fflush(stdout);
258
259 struct si_result *r = &results[util_logbase2(size)][placement][method];
260 r->is_valid = true;
261 r->is_cp = test_cp;
262 r->is_cs = test_cs;
263 r->cache_policy = cache_policy;
264 r->dwords_per_thread = cs_dwords_per_thread;
265 r->waves_per_sh = cs_waves_per_sh;
266 r->score = score;
267 r->index = method;
268 }
269 puts("");
270
271 if (compute_shader)
272 ctx->delete_compute_state(ctx, compute_shader);
273 }
274 }
275
276 puts("");
277 puts("static struct si_method");
278 printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
279 "cached)\n",
280 sctx->screen->info.name);
281 puts("{");
282 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
283
284 /* Analyze results and find the best methods. */
285 for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
286 if (placement == 0)
287 puts(" if (dst == RADEON_DOMAIN_VRAM) {");
288 else if (placement == 1)
289 puts(" } else { /* GTT */");
290 else if (placement == 2) {
291 puts("}");
292 puts("");
293 puts("static struct si_method");
294 printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
295 sctx->screen->info.name);
296 printf(" uint64_t size64, bool async, bool cached)\n");
297 puts("{");
298 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
299 puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
300 } else if (placement == 3)
301 puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
302 else
303 puts(" } else { /* GTT -> VRAM */");
304
305 for (unsigned mode = 0; mode < 3; mode++) {
306 bool async = mode == 0;
307 bool cached = mode == 1;
308
309 if (async)
310 puts(" if (async) { /* async compute */");
311 else if (cached)
312 puts(" if (cached) { /* gfx ring */");
313 else
314 puts(" } else { /* gfx ring - uncached */");
315
316 /* The list of best chosen methods. */
317 struct si_result *methods[32];
318 unsigned method_max_size[32];
319 unsigned num_methods = 0;
320
321 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
322 /* Find the best method. */
323 struct si_result *best = NULL;
324
325 for (unsigned i = 0; i < NUM_METHODS; i++) {
326 struct si_result *r = &results[util_logbase2(size)][placement][i];
327
328 if (!r->is_valid)
329 continue;
330
331 /* Ban CP DMA clears via MC on <= GFX8. They are super slow
332 * on GTT, which we can get due to BO evictions.
333 */
334 if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
335 r->cache_policy == L2_BYPASS)
336 continue;
337
338 if (async) {
339 /* The following constraints for compute IBs try to limit
340 * resource usage so as not to decrease the performance
341 * of gfx IBs too much.
342 */
343
344 /* Don't use CP DMA on asynchronous rings, because
345 * the engine is shared with gfx IBs.
346 */
347 if (r->is_cp)
348 continue;
349
350 /* Don't use L2 caching on asynchronous rings to minimize
351 * L2 usage.
352 */
353 if (r->cache_policy == L2_LRU)
354 continue;
355
356 /* Asynchronous compute recommends waves_per_sh != 0
357 * to limit CU usage. */
358 if (r->is_cs && r->waves_per_sh == 0)
359 continue;
360 } else {
361 if (cached && r->cache_policy == L2_BYPASS)
362 continue;
363 if (!cached && r->cache_policy == L2_LRU)
364 continue;
365 }
366
367 if (!best) {
368 best = r;
369 continue;
370 }
371
372 /* Assume some measurement error. Earlier methods occupy fewer
373 * resources, so the next method is always more greedy, and we
374 * don't want to select it due to a measurement error.
375 */
376 double min_improvement = 1.03;
377
378 if (best->score * min_improvement < r->score)
379 best = r;
380 }
381
382 if (num_methods > 0) {
383 unsigned prev_index = num_methods - 1;
384 struct si_result *prev = methods[prev_index];
385 struct si_result *prev_this_size =
386 &results[util_logbase2(size)][placement][prev->index];
387
388 /* If the best one is also the best for the previous size,
389 * just bump the size for the previous one.
390 *
391 * If there is no best, it means all methods were too slow
392 * for this size and were not tested. Use the best one for
393 * the previous size.
394 */
395 if (!best ||
396 /* If it's the same method as for the previous size: */
397 (prev->is_cp == best->is_cp &&
398 prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
399 prev->dwords_per_thread == best->dwords_per_thread &&
400 prev->waves_per_sh == best->waves_per_sh) ||
401 /* If the method for the previous size is also the best
402 * for this size: */
403 (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
404 method_max_size[prev_index] = size;
405 continue;
406 }
407 }
408
409 /* Add it to the list. */
410 assert(num_methods < ARRAY_SIZE(methods));
411 methods[num_methods] = best;
412 method_max_size[num_methods] = size;
413 num_methods++;
414 }
415
416 for (unsigned i = 0; i < num_methods; i++) {
417 struct si_result *best = methods[i];
418 unsigned size = method_max_size[i];
419
420 /* The size threshold is between the current benchmarked
421 * size and the next benchmarked size. */
422 if (i < num_methods - 1)
423 printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
424 else if (i > 0)
425 printf(" else ");
426 else
427 printf(" ");
428 printf("return ");
429
430 assert(best);
431 const char *cache_policy_str =
432 best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
433 best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM";
434
435 if (best->is_cp) {
436 printf("CP_DMA(%s);\n", cache_policy_str);
437 }
438 if (best->is_cs) {
439 printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
440 best->dwords_per_thread, best->waves_per_sh);
441 }
442 }
443 }
444 puts(" }");
445 }
446 puts(" }");
447 puts("}");
448
449 ctx->destroy(ctx);
450 exit(0);
451 }
452