/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample count before and after the renderpass
 *   in GPU memory.
 * - To store these values each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on the tu_renderpass_result is added to the
 *   tu_renderpass_history entry which aggregates results for a
 *   given renderpass.
 * - On submission:
 *   - Process results whose fence has been signalled.
 *   - Free per-submission data which is no longer needed.
 *
 *   - Create a command stream which writes a fence value, so we
 *     know when the results can be safely read.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result
 *       entries to the queue.
 *
 * Since the command buffers could be recorded on different threads,
 * the history table requires some amount of locking. However, the table
 * is only modified from a single thread at submission time, so in most
 * cases there is no contention.
 */

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner finishes;
 * could be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Used to determine when to delete old history entries */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
};

static bool
fence_before(uint32_t a, uint32_t b)
{
   /* essentially a < b, but handle wrapped values */
   return (int32_t)(a - b) < 0;
}

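/* Read the fence value most recently written by the GPU into the global BO;
 * results whose fence is not newer than this value are safe to read.
 */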
static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   return at->device->global_bo_map->autotune_fence;
}

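/* Grab a tu_submission_data from the free pool (or allocate a new one) and
 * record a small CS that makes the GPU write this submission's fence value
 * into the global BO once the submitted work has finished.
 */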
static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at,
                       uint32_t fence)
{
   struct tu_submission_data *submission_data = NULL;
   if (!list_is_empty(&at->submission_data_pool)) {
      submission_data = list_first_entry(&at->submission_data_pool,
                                         struct tu_submission_data, node);
      list_del(&submission_data->node);
   } else {
      submission_data = (struct tu_submission_data *) calloc(
         1, sizeof(struct tu_submission_data));
      tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
   }
   submission_data->fence = fence;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, fence);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
finish_submission_data(struct tu_autotune *at,
                       struct tu_submission_data *data)
{
   list_del(&data->node);
   list_addtail(&data->node, &at->submission_data_pool);
   tu_cs_reset(&data->fence_cs);
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

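/* Build the key identifying a renderpass "instance": the renderpass itself
 * (pass->autotune_hash is used as the hash seed) plus the framebuffer
 * dimensions and per-attachment properties. See tu_autotune_use_bypass()
 * for why pointers are not used as the key.
 */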
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd) {
   uint32_t data[3 + pass->attachment_count * 5];
   uint32_t *ptr = data;

   *ptr++ = framebuffer->width;
   *ptr++ = framebuffer->height;
   *ptr++ = framebuffer->layers;

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      *ptr++ = cmd->state.attachments[i]->view.width;
      *ptr++ = cmd->state.attachments[i]->view.height;
      *ptr++ = cmd->state.attachments[i]->image->vk.format;
      *ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
      *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
   }

   return XXH64(data, sizeof(data), pass->autotune_hash);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

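/* Look up the average sample count recorded for a renderpass key.
 * Returns false if there is no history for the key yet.
 */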
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is observed in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result =
      (struct tu_renderpass_result *) calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

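/* Attach a processed result to its history entry, trim the list to
 * MAX_HISTORY_RESULTS entries, and recompute the rolling average read
 * by tu_autotune_use_bypass().
 */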
static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

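/* Retire everything the GPU has finished with: for pending results whose
 * fence has been signalled, compute samples_passed and move them into their
 * history entry, then recycle the matching per-submission data. Both lists
 * are ordered by fence, so we can stop at the first still-pending entry.
 */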
static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (fence_before(current_fence, result->fence))
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (fence_before(current_fence, submission_data->fence))
         break;

      finish_submission_data(at, submission_data);
   }
}

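/* Move a command buffer's results onto the autotuner's pending list.
 * For ONE_TIME_SUBMIT buffers the list is simply stolen; otherwise each
 * result is copied (taking a reference on its BO) so the command buffer
 * can be submitted again.
 */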
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
      VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy =
            (struct tu_renderpass_result *) malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

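/* Called once per queue submission (single-threaded): process results whose
 * fence has already been signalled, bind this submission's results to their
 * history entries under a new fence value, queue the results, and prune
 * history entries not seen for MAX_HISTORY_LIFETIME submissions. Returns the
 * CS that writes the new fence value for this submission.
 */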
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);
   const uint32_t new_fence = at->fence_counter++;

   process_results(at, gpu_fence);

   /* Create history entries here to minimize the work and locking
    * done at renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history =
               (struct tu_renderpass_history *) calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at, new_fence);

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption
    * here is that the application doesn't hold many old unsubmitted
    * command buffers, otherwise this table may grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME))
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016" PRIx64 "", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

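/* The renderpass key is itself a hash, so just use its low 32 bits. */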
static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);
   list_inithead(&at->submission_data_pool);

   /* start from 1 because tu6_global::autotune_fence is initialized to 0 */
   at->fence_counter = 1;

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history =
            (struct tu_renderpass_history *) entry->data;

         mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->submission_data_pool, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

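/* A fence CS is only needed when at least one command buffer in the
 * submission recorded autotune results.
 */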
bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

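/* Heuristic used when autotuning is disabled or there is no history yet:
 * prefer sysmem (bypass) only for renderpasses with few draw calls and
 * no MSAA.
 */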
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

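/* Estimate the memory traffic generated by the renderpass's draw calls:
 * the average per-sample bandwidth of the recorded draws multiplied by the
 * renderpass's historic average sample count.
 */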
static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* sample count times drawcall_bandwidth_per_sample */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

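/* Decide between sysmem (bypass) and GMEM rendering for the current
 * renderpass by comparing the estimated memory bandwidth of each path,
 * using the average number of samples passed by previous instances of
 * this renderpass. Also creates the tu_renderpass_result used to track
 * this instance when autotuning is possible.
 */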
bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   /* If a feedback loop in the subpass caused one of the pipelines used to set
    * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even
    * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased
    * sysmem bandwidth (though we haven't quantified it).
    */
   if (cmd_buffer->state.rp.sysmem_single_prim_mode)
      return false;

   /* If the user is using a fragment density map, then this will cause fewer
    * FS invocations with GMEM, which has a hard-to-measure impact on
    * performance because it depends on how heavy the FS is in addition to how
    * many invocations there were and the density. Let's assume the user knows
    * what they're doing when they added the map, because if sysmem is
    * actually faster then they could've just not used the fragment density
    * map.
    */
   if (pass->has_fdm)
      return false;

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * the results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected when
    * it shouldn't have been, and with 64 bits that is extremely unlikely.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely. The state changes between tiles also have an
       * overhead. The magic numbers of 11 and 10 are randomly chosen.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                   renderpass_key,
                   cmd_buffer->state.rp.drawcall_count,
                   select_sysmem ? "sysmem" : "gmem");
         mesa_logi("  avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
                   avg_samples,
                   drawcall_bandwidth_per_sample,
                   total_draw_call_bandwidth);
         mesa_logi("  render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
                   extent->width, extent->height,
                   pass->sysmem_bandwidth_per_pixel,
                   pass->gmem_bandwidth_per_pixel);
         mesa_logi("  sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
                   sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

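/* Suballocate GPU memory for this renderpass's sample counters, then point
 * RB_SAMPLE_COUNT_ADDR at it and emit ZPASS_DONE so the sample count at the
 * start of the renderpass is written there.
 */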
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples =
      (struct tu_renderpass_samples *) tu_suballoc_bo_map(
         &autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
   /* A7XX TODO: Fixup ZPASS_DONE */
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

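/* Emit the matching end-of-renderpass sample-count snapshot into
 * samples_end. Skipped if the begin-side allocation failed.
 */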
void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
      offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   /* A7XX TODO: Fixup ZPASS_DONE */
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}