/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the number before and after in GPU memory.
 * - To store the values each command buffer holds GPU memory which
 *   expands with more renderpasses being written.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on the tu_renderpass_result is added to the
 *   tu_renderpass_history entry which aggregates results for a
 *   given renderpass.
 * - On submission:
 *   - Process results whose fence was signalled.
 *   - Free per-submission data which is no longer needed.
 *
 *   - Create a command stream to write a fence value. This way we know
 *     when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result entries
 *       to the queue.
 *
 * Since the command buffers could be recorded on different threads
 * we have to maintain some amount of locking around the history table,
 * however the table is only modified by a single thread at submission
 * time, so in most cases there is no lock contention.
 */
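/*
 * Roughly, the expected call sequence from the rest of the driver (a sketch
 * based on the functions defined in this file, not an exhaustive list of
 * call sites):
 *
 *   tu_autotune_use_bypass()       - at renderpass begin, choose sysmem vs
 *                                    GMEM and create a result entry
 *   tu_autotune_begin_renderpass() - emit the opening sample-count snapshot
 *   tu_autotune_end_renderpass()   - emit the closing sample-count snapshot
 *   tu_autotune_on_submit()        - at submission, attach results to the
 *                                    history table and emit the fence cs
 */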

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner is finished;
 * can be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Fence of the last submission in which this renderpass was seen;
    * used to expire old history entries.
    */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
   uint32_t buffers_count;
};

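/* Read the fence value the GPU last wrote into the global BO; results with
 * a fence at or below this value are safe to read back.
 */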
static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   const struct tu6_global *global = at->device->global_bo->map;
   return global->autotune_fence;
}

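/* Allocate per-submission bookkeeping and record a small command stream
 * that writes this submission's fence value into the global BO (via a
 * CACHE_FLUSH_TS event), so we later know when its results are readable.
 */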
static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
{
   struct tu_submission_data *submission_data =
      calloc(1, sizeof(struct tu_submission_data));
   submission_data->fence = at->fence_counter;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, at->fence_counter);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));

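/* Build the 64-bit key identifying a renderpass "instance": framebuffer
 * dimensions, attachment descriptions plus per-attachment image properties,
 * and the subpass layout. This keys the history table independently of the
 * VkRenderPass/VkFramebuffer object pointers (see the comment in
 * tu_autotune_use_bypass for why pointers are not used).
 */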
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd) {
   XXH64_state_t hash_state;
   XXH64_reset(&hash_state, 0);

   APPEND_TO_HASH(&hash_state, framebuffer->width);
   APPEND_TO_HASH(&hash_state, framebuffer->height);
   APPEND_TO_HASH(&hash_state, framebuffer->layers);

   APPEND_TO_HASH(&hash_state, pass->attachment_count);
   XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels);
   }

   APPEND_TO_HASH(&hash_state, pass->subpass_count);
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
   }

   return XXH64_digest(&hash_state);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

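/* Look up the aggregated history for a renderpass key under the read lock.
 * Returns true and the average sample count if we have at least one
 * processed result for it.
 */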
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild -
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result = calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

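/* Attach a processed result to its history entry, evicting the oldest
 * result once we are over MAX_HISTORY_RESULTS, and recompute the running
 * average sample count consumed by tu_autotune_use_bypass.
 */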
static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

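/* Consume everything whose fence the GPU has already reached: fold pending
 * results into their history entries and free the per-submission data. Both
 * lists are ordered by fence, so we can stop at the first entry that is
 * still in flight.
 */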
static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (submission_data->fence > current_fence)
         break;

      free_submission_data(submission_data);
   }
}

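/* Hand a command buffer's results over to the autotuner. For ONE_TIME_SUBMIT
 * buffers the list can simply be stolen; otherwise each result is copied and
 * the underlying BO referenced, since the command buffer may be submitted
 * again.
 */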
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
      VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy = malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

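/* Called once per queue submission (single-threaded): process results that
 * are already available, assign the new fence to this submission's results,
 * create history entries for renderpasses we haven't seen yet, queue the
 * pending results, and expire stale history entries. Returns the
 * fence-writing cs for the caller to include in the submission.
 */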
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);

   process_results(at, gpu_fence);

   /* pre-increment so zero isn't valid fence */
   uint32_t new_fence = ++at->fence_counter;
   uint32_t result_buffers = 0;

   /* Create history entries here to minimize work and locking being
    * done on renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history = calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }

      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
         result_buffers++;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at);
   submission_data->buffers_count = result_buffers;

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption
    * here is that the application doesn't hold many old unsubmitted
    * command buffers, otherwise this table may grow big.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->last_fence == 0 ||
          gpu_fence < history->last_fence ||
          (gpu_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016"PRIx64"", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

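/* Hash table callbacks for the uint64_t renderpass keys. The key already
 * comes from XXH64, so truncating it to 32 bits is good enough for the
 * table's hash.
 */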
static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history = entry->data;

         mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

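/* The fence-writing cs only needs to be part of a submission if any of its
 * command buffers actually carry autotune results to read back.
 */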
bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

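/* Heuristic used when we have no GPU-measured history for a renderpass (or
 * autotuning is disabled): prefer sysmem (bypass) only for single-sampled
 * renderpasses with very few draw calls.
 */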
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

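/* Estimate the total memory traffic of the renderpass' draw calls as the
 * average sample count of past instances times the average
 * bandwidth-per-sample accumulated over the recorded draws.
 */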
static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* sample count times drawcall_bandwidth_per_sample */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

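/* Decide between sysmem (bypass) and GMEM rendering for the renderpass being
 * recorded into cmd_buffer, and create the tu_renderpass_result that the
 * begin/end hooks below will fill in.
 *
 * The comparison below boils down to (illustrative numbers, not measured):
 * with 1000 avg samples, an average of 4 bandwidth-per-sample, a 100x100
 * render area and per-pixel costs of 8 (sysmem) and 4 (gmem), we'd compare
 *   sysmem: 8 * 10000 + 1000 * 4              = 84000
 *   gmem:   (4 * 10000 * 11 + 1000 * 4) / 10  = 44400
 * and pick GMEM.
 */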
bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      const struct tu_subpass *subpass = &pass->subpasses[i];
      /* GMEM works much faster in this case */
      if (subpass->raster_order_attachment_access)
         return false;

      /* Would be very slow in sysmem mode because we have to enable
       * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE)
       */
      if (subpass->feedback_loop_color || subpass->feedback_loop_ds)
         return false;
   }

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64bit hash as the key since we don't fear a rare hash
    * collision; the worst that would happen is sysmem being selected when
    * it should not have been, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely. The state changes between tiles also have an
       * overhead. The magic numbers of 11 and 10 are chosen arbitrarily.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                   renderpass_key,
                   cmd_buffer->state.rp.drawcall_count,
                   select_sysmem ? "sysmem" : "gmem");
         mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
                   avg_samples,
                   drawcall_bandwidth_per_sample,
                   total_draw_call_bandwidth);
         mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
                   extent->width, extent->height,
                   pass->sysmem_bandwidth_per_pixel,
                   pass->gmem_bandwidth_per_pixel);
         mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
                   sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

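/* Allocate the GPU buffer for this renderpass' sample counters and emit the
 * commands that snapshot the sample counter into samples_start when the
 * renderpass begins. If the suballocation fails we simply skip autotuning
 * for this renderpass.
 */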
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

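/* Emit the matching end-of-renderpass snapshot into samples_end. Skipped if
 * the buffer allocation failed in tu_autotune_begin_renderpass.
 */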
void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}