/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample count before and after the renderpass in
 *   GPU memory.
 * - To store the values each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 *   - Later on the tu_renderpass_result is added to the
 *     tu_renderpass_history entry which aggregates results for a
 *     given renderpass.
 * - On submission:
 *   - Process results whose fence has been signalled.
 *   - Free per-submission data which is no longer needed.
 *
 *   - Create a command stream to write a fence value, so we know
 *     when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result
 *       entries to the queue.
 *
 * Since command buffers could be recorded on different threads we have
 * to maintain some locking around the history table. However, the table
 * is only modified from a single thread at submission time, so in most
 * cases there is no contention.
 */

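/* A rough sketch of how this is driven from the rest of the driver
 * (illustrative only, not the exact call sites or variable names):
 *
 *    // when starting a renderpass:
 *    use_sysmem = tu_autotune_use_bypass(at, cmd, &autotune_result);
 *    tu_autotune_begin_renderpass(cmd, cs, autotune_result);
 *    ... draws ...
 *    tu_autotune_end_renderpass(cmd, cs, autotune_result);
 *
 *    // at queue submission time:
 *    if (tu_autotune_submit_requires_fence(cmd_buffers, count))
 *       fence_cs = tu_autotune_on_submit(dev, at, cmd_buffers, count);
 */
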
void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);
46 
47 #define TU_AUTOTUNE_DEBUG_LOG 0
48 /* Dump history entries on autotuner finish,
49  * could be used to gather data from traces.
50  */
51 #define TU_AUTOTUNE_LOG_AT_FINISH 0
52 
53 /* How many last renderpass stats are taken into account. */
54 #define MAX_HISTORY_RESULTS 5
55 /* For how many submissions we store renderpass stats. */
56 #define MAX_HISTORY_LIFETIME 128
57 
58 
/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Fence of the last submission that used this entry, used to
    * expire old history entries.
    */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
   uint32_t buffers_count;
};

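/* Read back the fence value last written by the GPU through the
 * per-submission fence_cs (see create_submission_data()).
 */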
static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   const struct tu6_global *global = at->device->global_bo->map;
   return global->autotune_fence;
}

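/* Allocate per-submission data and record a small command stream that
 * asks the GPU to write the current fence value into the global
 * autotune fence, so we later know when this submission's results can
 * safely be read back.
 */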
static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
{
   struct tu_submission_data *submission_data =
      calloc(1, sizeof(struct tu_submission_data));
   submission_data->fence = at->fence_counter;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, at->fence_counter);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));

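/* Hash the properties of a renderpass instance (framebuffer size,
 * attachment descriptions and dimensions, subpass configuration) into
 * the 64-bit key used to track its history across submissions.
 */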
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd) {
   XXH64_state_t hash_state;
   XXH64_reset(&hash_state, 0);

   APPEND_TO_HASH(&hash_state, framebuffer->width);
   APPEND_TO_HASH(&hash_state, framebuffer->height);
   APPEND_TO_HASH(&hash_state, framebuffer->layers);

   APPEND_TO_HASH(&hash_state, pass->attachment_count);
   XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels);
   }

   APPEND_TO_HASH(&hash_state, pass->subpass_count);
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
   }

   return XXH64_digest(&hash_state);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

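/* Look up the average sample count recorded for a renderpass key.
 * Returns false if there is no usable history yet.
 */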
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result = calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

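/* Add a processed result to its history entry, dropping the oldest
 * result once MAX_HISTORY_RESULTS is exceeded, and update the average
 * sample count consumed by tu_autotune_use_bypass().
 */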
static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                      struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

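/* Retire all pending results and per-submission data whose fence has
 * already been signalled by the GPU.
 */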
static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (submission_data->fence > current_fence)
         break;

      free_submission_data(submission_data);
   }
}

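/* Hand a command buffer's results over to the queue: steal the list
 * for ONE_TIME_SUBMIT command buffers, otherwise copy each result so
 * the command buffer can be resubmitted or destroyed independently.
 */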
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                        &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                              &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy = malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

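/* Called once per queue submission (from a single thread): processes
 * already-signalled results, tags the new results with a fresh fence,
 * queues them for later processing, expires stale history entries, and
 * returns the command stream that writes the fence.
 */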
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);

   process_results(at, gpu_fence);

   /* Pre-increment so zero isn't a valid fence value */
   uint32_t new_fence = ++at->fence_counter;
   uint32_t result_buffers = 0;

   /* Create history entries here to minimize work and locking being
    * done on renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                          &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history = calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }

      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
         result_buffers++;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at);
   submission_data->buffers_count = result_buffers;

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption
    * here is that the application doesn't hold many old unsubmitted
    * command buffers; otherwise this table could grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->last_fence == 0 ||
          gpu_fence < history->last_fence ||
          (gpu_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016"PRIx64"", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history = entry->data;

         mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

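/* Heuristic used when the autotuner is disabled or no history exists
 * yet: pick sysmem only for renderpasses with few drawcalls and no
 * multisampling.
 */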
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

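/* Estimate the memory traffic generated by the renderpass' drawcalls:
 * the expected number of samples passed times the average bandwidth a
 * drawcall consumes per sample.
 */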
static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* Average sample count times the average drawcall_bandwidth_per_sample */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

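/* Decide between sysmem (bypass) and GMEM rendering by comparing the
 * estimated memory bandwidth of both paths, using the averaged sample
 * count from previous instances of the same renderpass.
 */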
bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      const struct tu_subpass *subpass = &pass->subpasses[i];
      /* GMEM works much faster in this case */
      if (subpass->raster_order_attachment_access)
         return false;

      /* Would be very slow in sysmem mode because we have to enable
       * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE)
       */
      if (subpass->feedback_loop_color || subpass->feedback_loop_ds)
         return false;
   }

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * the results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected
    * when it shouldn't be, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely.  The state changes between tiles also have an
       * overhead.  The magic numbers of 11 and 10 are randomly chosen.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
               renderpass_key,
               cmd_buffer->state.rp.drawcall_count,
               select_sysmem ? "sysmem" : "gmem");
         mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
               avg_samples,
               drawcall_bandwidth_per_sample,
               total_draw_call_bandwidth);
         mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
               extent->width, extent->height,
               pass->sysmem_bandwidth_per_pixel,
               pass->gmem_bandwidth_per_pixel);
         mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
               sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

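/* Emit commands that copy the sample counter into this result's GPU
 * memory (samples_start) at the beginning of the renderpass.
 */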
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

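/* Emit commands that copy the sample counter into samples_end; the
 * difference to samples_start, computed in process_results(), is the
 * number of samples passed during the renderpass.
 */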
void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}