/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample count before and after the renderpass in
 *   GPU memory.
 * - To store the values, each command buffer holds GPU memory which
 *   is expanded as more renderpasses are written.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 *   - Later on, the tu_renderpass_result is added to the
 *     tu_renderpass_history entry which aggregates results for a
 *     given renderpass.
 * - On submission:
 *   - Process results whose fence has been signalled.
 *   - Free per-submission data we no longer need.
 *
 *   - Create a command stream to write a fence value. This way we
 *     know when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result entries
 *       to the queue.
 *
 * Since the command buffers could be recorded on different threads,
 * we have to maintain some amount of locking around the history table.
 * However, the table is only modified from a single thread at submission
 * time, so in most cases there will be no contention.
 *
 * See the illustrative lifecycle sketch below.
 */
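
/* A rough lifecycle sketch of a single renderpass through the autotuner
 * (simplified pseudo-code for illustration only; the actual call sites live
 * elsewhere in the driver):
 *
 *    // While recording a renderpass into a command buffer:
 *    result = create_history_result(at, hash_renderpass_instance(...));
 *    tu_autotune_begin_renderpass(cmd, cs, result);  // GPU writes samples_start
 *    ... draw calls ...
 *    tu_autotune_end_renderpass(cmd, cs, result);    // GPU writes samples_end
 *
 *    // At submission time:
 *    fence_cs = tu_autotune_on_submit(dev, at, cmd_buffers, count);
 *    // fence_cs is appended to the submission and writes this submission's
 *    // fence value once the GPU is done.
 *
 *    // On a later submission, process_results() reads the signalled
 *    // samples_end - samples_start deltas and folds them into
 *    // tu_renderpass_history, which tu_autotune_use_bypass() then consults.
 */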

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner finishes;
 * can be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Last fence the renderpass was submitted with; used to prune old
    * history entries.
    */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds the per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
};

static bool
fence_before(uint32_t a, uint32_t b)
{
   /* essentially a < b, but handle wrapped values */
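   /* e.g. (for illustration) fence_before(0xfffffff0, 0x10) is true: the
    * unsigned subtraction wraps to a negative int32_t, so fences issued just
    * before a counter wrap still compare as older than fences issued after it.
    */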
   return (int32_t)(a - b) < 0;
}

static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   return at->device->global_bo_map->autotune_fence;
}

static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at,
                       uint32_t fence)
{
   struct tu_submission_data *submission_data = NULL;
   if (!list_is_empty(&at->submission_data_pool)) {
      submission_data = list_first_entry(&at->submission_data_pool,
                                         struct tu_submission_data, node);
      list_del(&submission_data->node);
   } else {
      submission_data = (struct tu_submission_data *) calloc(
         1, sizeof(struct tu_submission_data));
      tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
   }
   submission_data->fence = fence;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_begin(fence_cs);

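   /* A CACHE_FLUSH_TS event flushes caches and then writes the 32-bit fence
    * value to the autotune_fence slot in the global BO, so the CPU can later
    * tell which pending results are safe to read.
    */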
   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, fence);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
finish_submission_data(struct tu_autotune *at,
                       struct tu_submission_data *data)
{
   list_del(&data->node);
   list_addtail(&data->node, &at->submission_data_pool);
   tu_cs_reset(&data->fence_cs);
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd) {
   uint32_t data[3 + pass->attachment_count * 5];
   uint32_t *ptr = data;

   *ptr++ = framebuffer->width;
   *ptr++ = framebuffer->height;
   *ptr++ = framebuffer->layers;

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      *ptr++ = cmd->state.attachments[i]->view.width;
      *ptr++ = cmd->state.attachments[i]->view.height;
      *ptr++ = cmd->state.attachments[i]->image->vk.format;
      *ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
      *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
   }

   return XXH64(data, sizeof(data), pass->autotune_hash);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever found in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result =
      (struct tu_renderpass_result *) calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
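   /* The average is recomputed over the kept results (at most
    * MAX_HISTORY_RESULTS of them); e.g. with three stored results of 100,
    * 200 and 300 samples, avg_samples becomes 200.
    */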
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

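   /* Results and submission data are appended in submission order, so their
    * fences are monotonically increasing and we can stop at the first entry
    * the GPU has not signalled yet.
    */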
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (fence_before(current_fence, result->fence))
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (fence_before(current_fence, submission_data->fence))
         break;

      finish_submission_data(at, submission_data);
   }
}

static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy =
            (struct tu_renderpass_result *) malloc(sizeof(*result));
         *copy = *result;
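         /* Take a reference on the result's GPU memory so it stays valid even
          * if the command buffer is destroyed or reset before the results are
          * processed.
          */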
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

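   /* gpu_fence is the last value the GPU wrote to the global BO, i.e. the
    * newest submission whose results are ready to be read; new_fence is the
    * value this submission's fence cs will write when it completes.
    */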
   const uint32_t gpu_fence = get_autotune_fence(at);
   const uint32_t new_fence = at->fence_counter++;

   process_results(at, gpu_fence);

   /* Create history entries here to minimize the work and locking
    * done on renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history =
               (struct tu_renderpass_history *) calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at, new_fence);

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Cleanup old entries from the history table. The assumption here is
    * that the application doesn't hold many old unsubmitted command
    * buffers, otherwise this table may grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
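      /* Keep entries that were used within the last MAX_HISTORY_LIFETIME
       * submissions; anything older gets dropped.
       */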
      if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME))
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016" PRIx64 "", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

static uint32_t
renderpass_key_hash(const void *_a)
{
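   /* The key is already an XXH64 hash, so truncating to the low 32 bits is
    * sufficient for the hash table.
    */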
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);
   list_inithead(&at->submission_data_pool);

   /* start from 1 because tu6_global::autotune_fence is initialized to 0 */
   at->fence_counter = 1;

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history =
            (struct tu_renderpass_history *) entry->data;

         mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->submission_data_pool, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
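   /* Without any history we fall back to a simple heuristic: use sysmem
    * (bypass) only for small renderpasses, i.e. few draw calls and no
    * multisampling.
    */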
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* average sample count times the average drawcall_bandwidth_per_sample */
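   /* e.g. (illustrative numbers) an average of 1,000,000 samples with an
    * average drawcall_bandwidth_per_sample of 4 gives an estimate of
    * 4,000,000.
    */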
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   /* If a feedback loop in the subpass caused one of the pipelines used to set
    * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even
    * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased
    * sysmem bandwidth (though we haven't quantified it).
    */
   if (cmd_buffer->state.rp.sysmem_single_prim_mode)
      return false;

   /* If the user is using a fragment density map, then this will cause fewer
    * FS invocations with GMEM, which has a hard-to-measure impact on
    * performance because it depends on how heavy the FS is in addition to how
    * many invocations there were and the density. Let's assume the user knows
    * what they're doing when they added the map, because if sysmem is
    * actually faster then they could've just not used the fragment density
    * map.
    */
   if (pass->has_fdm)
      return false;

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected when it
    * shouldn't have been, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely.  The state changes between tiles also have an
       * overhead.  The magic numbers of 11 and 10 are randomly chosen.
       */
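      /* i.e. gmem pays ~10% extra plus the draw call bandwidth scaled down
       * by 10x; e.g. (illustrative numbers) gmem_bandwidth = 1000 and
       * total_draw_call_bandwidth = 500 becomes (1000 * 11 + 500) / 10 = 1150.
       */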
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                   renderpass_key,
                   cmd_buffer->state.rp.drawcall_count,
                   select_sysmem ? "sysmem" : "gmem");
         mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
                   avg_samples,
                   drawcall_bandwidth_per_sample,
                   total_draw_call_bandwidth);
         mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
                   extent->width, extent->height,
                   pass->sysmem_bandwidth_per_pixel,
                   pass->gmem_bandwidth_per_pixel);
         mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
                   sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples =
      (struct tu_renderpass_samples *) tu_suballoc_bo_map(
         &autotune_result->bo);

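   /* ZPASS_DONE below makes the GPU copy the current samples-passed counter
    * to the address programmed in RB_SAMPLE_COUNT_ADDR. Here that is
    * samples_start; the matching event in tu_autotune_end_renderpass() writes
    * samples_end, and process_results() takes the difference.
    */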
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
   /* A7XX TODO: Fixup ZPASS_DONE */
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   /* A7XX TODO: Fixup ZPASS_DONE */
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}