/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

/* This file implements tests on the si_clearbuffer function. */

#include "si_pipe.h"
#include "si_query.h"

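/* Benchmark sweep parameters: buffer sizes go from MIN_SIZE to MAX_SIZE,
 * doubling each step (SIZE_SHIFT is the per-step left-shift amount), and each
 * measurement is averaged over NUM_RUNS iterations.
 */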
#define MIN_SIZE 512
#define MAX_SIZE (128 * 1024 * 1024)
#define SIZE_SHIFT 1
#define NUM_RUNS 128

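/* Convert a transfer of num_bytes bytes that took ns nanoseconds into a
 * throughput in MB/s.
 */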
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}

void si_test_dma_perf(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context *)ctx;
   const uint32_t clear_value = 0x12345678;
   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
   static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};

#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

   static const char *method_str[] = {
      "CP MC   ",
      "CP L2   ",
      "CP L2   ",
   };
   static const char *placement_str[] = {
      /* Clear */
      "fill->VRAM",
      "fill->GTT ",
      /* Copy */
      "VRAM->VRAM",
      "VRAM->GTT ",
      "GTT ->VRAM",
   };

   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
   printf("Heap       ,Method  ,L2p,Wa,");
   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
      if (size >= 1024)
         printf("%6uKB,", size / 1024);
      else
         printf(" %6uB,", size);
   }
   printf("\n");

   /* results[log2(size)][placement][method] */
   struct si_result {
      bool is_valid;
      bool is_cp;
      bool is_cs;
      unsigned cache_policy;
      unsigned dwords_per_thread;
      unsigned waves_per_sh;
      unsigned score;
      unsigned index; /* index in results[x][y][index] */
   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

   /* Run benchmarks. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      bool is_copy = placement >= 2;

      printf("-----------,--------,---,--,");
      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
         printf("--------,");
      printf("\n");

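      /* Methods 0..2 are CP DMA, one per cache policy. Methods 3+ enumerate
       * every compute shader variant: for each entry in cs_waves_per_sh_list,
       * all three cache policies times NUM_SHADERS dwords-per-thread values.
       * The index arithmetic below decodes that layout.
       */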
      for (unsigned method = 0; method < NUM_METHODS; method++) {
         bool test_cp = method <= 2;
         bool test_cs = method >= 3;
         unsigned cs_method = method - 3;
         unsigned cs_waves_per_sh =
            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
         cs_method %= 3 * NUM_SHADERS;
         unsigned cache_policy =
            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
         unsigned cs_dwords_per_thread =
            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

         if (sctx->gfx_level == GFX6) {
            /* GFX6 doesn't support CP DMA operations through L2. */
            if (test_cp && cache_policy != L2_BYPASS)
               continue;
            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
            if (test_cs && cs_waves_per_sh % 16 != 0)
               continue;
         }

         /* SI_RESOURCE_FLAG_GL2_BYPASS setting RADEON_FLAG_GL2_BYPASS doesn't affect
          * chips before gfx9.
          */
         if (test_cs && cache_policy && sctx->gfx_level < GFX9)
            continue;

         printf("%s ,", placement_str[placement]);
         if (test_cs) {
            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
         } else {
            printf("%s,%3s,", method_str[method],
                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
         }
         if (test_cs && cs_waves_per_sh)
            printf("%2u,", cs_waves_per_sh);
         else
            printf("  ,");

         void *compute_shader = NULL;
         if (test_cs) {
            compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);
         }

         double score = 0;
         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Don't test bigger sizes if it's too slow. Print 0. */
            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
               printf("%7.0f ,", 0.0);
               continue;
            }

            enum pipe_resource_usage dst_usage, src_usage;
            struct pipe_resource *dst, *src;
            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_GL2_BYPASS : 0;

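            /* PIPE_USAGE_DEFAULT is expected to place a buffer in VRAM and
             * PIPE_USAGE_STREAM in GTT, matching placement_str: the dst is in
             * VRAM for placements 0, 2, 4, and the src (copies only) is in
             * VRAM for placements 2 and 3.
             */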
            if (placement == 0 || placement == 2 || placement == 4)
               dst_usage = PIPE_USAGE_DEFAULT;
            else
               dst_usage = PIPE_USAGE_STREAM;

            if (placement == 2 || placement == 3)
               src_usage = PIPE_USAGE_DEFAULT;
            else
               src_usage = PIPE_USAGE_STREAM;

            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;

            /* Wait for idle before testing, so that other processes don't mess up the results. */
            sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                           SI_CONTEXT_FLUSH_AND_INV_CB |
                           SI_CONTEXT_FLUSH_AND_INV_DB;
            si_emit_cache_flush_direct(sctx);

            struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
            ctx->begin_query(ctx, q);

            /* Run tests. */
            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               if (test_cp) {
                  /* CP DMA */
                  if (is_copy) {
                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER,
                                           SI_COHERENCY_NONE, cache_policy);
                  } else {
                     si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value,
                                            SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE,
                                            cache_policy);
                  }
               } else {
                  /* Compute */
                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
                   * writes the 2nd contiguous block of data, etc.
                   */
                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;
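                  /* Worked example for cs_dwords_per_thread = 16: each thread
                   * issues 4 instructions of 4 dwords each, and one 64-thread
                   * wave covers 16 * 64 = 1024 dwords (4 KB).
                   */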

                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;

                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, compute_shader);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
               }

               /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
               sctx->flags |= SI_CONTEXT_INV_VCACHE |
                              (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                              SI_CONTEXT_CS_PARTIAL_FLUSH;
               si_emit_cache_flush_direct(sctx);
            }

            ctx->end_query(ctx, q);
            ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);

            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */

            union pipe_query_result result;

            ctx->get_query_result(ctx, q, true, &result);
            ctx->destroy_query(ctx, q);

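            /* PIPE_QUERY_TIME_ELAPSED reports the total GPU time in nanoseconds
             * for all NUM_RUNS iterations, so divide before converting to MB/s.
             */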
            score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");

         if (compute_shader)
            ctx->delete_compute_state(ctx, compute_shader);
      }
   }

   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
          "cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
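
   /* The code printed below has this overall shape (the sizes and methods here
    * are hypothetical example values; the real ones come from the measurements
    * above):
    *
    *    static struct si_method
    *    get_best_clear_for_<family>(enum radeon_bo_domain dst, uint64_t size64,
    *                                bool async, bool cached)
    *    {
    *       unsigned size = MIN2(size64, UINT_MAX);
    *
    *       if (dst == RADEON_DOMAIN_VRAM) {
    *          if (async) {
    *             if (size <= 262144) return COMPUTE(L2_STREAM, 8, 8);
    *             else                return COMPUTE(L2_STREAM, 64, 16);
    *          } else if (cached) {
    *             ...
    *          }
    *       } else {
    *          ...
    *       }
    *    }
    */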

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts("   } else { /* GTT */");
      else if (placement == 2) {
         puts("   }");
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf("                     uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts("   } else { /* GTT -> VRAM */");

      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts("      if (async) { /* async compute */");
         else if (cached)
            puts("      } else if (cached) { /* gfx ring */");
         else
            puts("      } else { /* gfx ring - uncached */");

         /* The list of best chosen methods; method_max_size[i] is the largest
          * benchmarked size for which methods[i] is selected.
          */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->gfx_level <= GFX8 && placement == 1 && r->is_cp &&
                   r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size =
                  &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp &&
                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
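            /* E.g. with SIZE_SHIFT == 1, the threshold between the 4 MB and
             * 8 MB steps is (4 MB + 8 MB) / 2 = 6291456 bytes.
             */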
            if (i < num_methods - 1)
               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf("         else                   ");
            else
               printf("         ");
            printf("return ");

            assert(best);
            const char *cache_policy_str =
               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
               best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";

            if (best->is_cp) {
               printf("CP_DMA(%s);\n", cache_policy_str);
            }
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
                      best->dwords_per_thread, best->waves_per_sh);
            }
         }
      }
      puts("      }");
   }
   puts("   }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}
433