/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without having some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      if there is a sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But high draw count where there is not much overdraw can actually be
 *      faster in bypass mode if it is pushing a lot of state change, due to
 *      not having to go through the state changes per-tile[1]
 *
 * The approach taken is to measure the samples-passed for the batch to
 * estimate the amount of overdraw, to detect cases where the number of
 * pixels touched is low.
 *
 * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
struct tu_autotune {

   /* We may have to disable autotuner if there are too many
    * renderpasses in-flight.
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * processed submission results.
    * This could happen after command buffers which were in the submission
    * are destroyed.
    */
   struct list_head pending_submission_data;

   /* NOTE(review): internal bookkeeping counters; fence_counter presumably
    * generates the values stored in tu_renderpass_result::fence below —
    * confirm against tu_autotune.c.
    */
   uint32_t fence_counter;
   uint32_t idx_counter;
};

/**
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
 * may force us to revisit that.
 */
struct tu_renderpass_samples {
   uint64_t samples_start;
   /* hw requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

/**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a later
 * time, when the GPU has finished writing the results, we fill samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};

VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

struct tu_autotune_results_buffer;

void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */