1 /* 2 * Copyright © 2021 Google, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24 #ifndef FREEDRENO_AUTOTUNE_H 25 #define FREEDRENO_AUTOTUNE_H 26 27 #include "util/hash_table.h" 28 #include "util/list.h" 29 30 #include "freedreno_util.h" 31 32 struct fd_autotune_results; 33 34 /** 35 * "autotune" our decisions about bypass vs GMEM rendering, based on historical 36 * data about a given render target. 37 * 38 * In deciding which path to take there are tradeoffs, including some that 39 * are not reasonably estimateable without having some additional information: 40 * 41 * (1) If you know you are touching every pixel (ie. there is a glClear()), 42 * then the GMEM path will at least not cost more memory bandwidth than 43 * sysmem[1] 44 * 45 * (2) If there is no clear, GMEM could potentially cost *more* bandwidth 46 * due to sysmem->GMEM restore pass. 47 * 48 * (3) If you see a high draw count, that is an indication that there will be 49 * enough pixels accessed multiple times to benefit from the reduced 50 * memory bandwidth that GMEM brings 51 * 52 * (4) But high draw count where there is not much overdraw can actually be 53 * faster in bypass mode if it is pushing a lot of state change, due to 54 * not having to go thru the state changes per-tile[2] 55 * 56 * The approach taken is to measure the samples-passed for the batch to estimate 57 * the amount of overdraw to detect cases where the number of pixels touched is 58 * low. 59 * 60 * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE} 61 * performance countables, which give a more direct measurement of what we want 62 * to know (ie. is framebuffer memory access high enough to prefer GMEM), but 63 * with the downside of consuming half of the available RB counters. With the 64 * additional complication that external perfcntr collection (fdperf, perfetto) 65 * and the drive could be stomping on each other's feet. (Also reading the 66 * perfcntrs accurately requires a WFI.) 67 * 68 * [1] ignoring UBWC 69 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/ 70 * most of the tiles late in the tile-pass can defeat that 71 */ 72 struct fd_autotune { 73 74 /** 75 * Cache to map batch->key (also used for batch-cache) to historical 76 * information about rendering to that particular render target. 77 */ 78 struct hash_table *ht; 79 80 /** 81 * List of recently used historical results (to age out old results) 82 */ 83 struct list_head lru; 84 85 /** 86 * GPU buffer used to communicate back results to the CPU 87 */ 88 struct fd_bo *results_mem; 89 struct fd_autotune_results *results; 90 91 /** 92 * List of per-batch results that we are waiting for the GPU to finish 93 * with before reading back the results. 94 */ 95 struct list_head pending_results; 96 97 uint32_t fence_counter; 98 uint32_t idx_counter; 99 }; 100 101 /** 102 * The layout of the memory used to read back per-batch results from the 103 * GPU 104 * 105 * Note this struct is intentionally aligned to 4k. And hw requires the 106 * sample start/stop locations to be 128b aligned. 107 */ 108 struct fd_autotune_results { 109 110 /** 111 * The GPU writes back a "fence" seqno value from the cmdstream after 112 * it finishes writing it's result slot, so that the CPU knows when 113 * results are valid 114 */ 115 uint32_t fence; 116 117 uint32_t __pad0; 118 uint64_t __pad1; 119 120 /** 121 * From the cmdstream, the captured samples-passed values are recorded 122 * at the start and end of the batch. 123 * 124 * Note that we do the math on the CPU to avoid a WFI. But pre-emption 125 * may force us to revisit that. 126 */ 127 struct { 128 uint64_t samples_start; 129 uint64_t __pad0; 130 uint64_t samples_end; 131 uint64_t __pad1; 132 } result[127]; 133 }; 134 135 #define __offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) 136 #define results_ptr(at, member) \ 137 (at)->results_mem, __offset((at)->results, &(at)->results->member), 0, 0 138 139 struct fd_batch_history; 140 141 /** 142 * Tracks the results from an individual batch. Initially created per batch, 143 * and appended to the tail of at->pending_results. At a later time, when 144 * the GPU has finished writing the results, 145 * 146 * ralloc parent is the associated fd_batch_history 147 */ 148 struct fd_batch_result { 149 150 /** 151 * The index/slot in fd_autotune_results::result[] to write start/end 152 * counter to 153 */ 154 unsigned idx; 155 156 /** 157 * Fence value to write back to fd_autotune_results::fence after both 158 * start/end values written 159 */ 160 uint32_t fence; 161 162 /* 163 * Below here, only used internally within autotune 164 */ 165 struct fd_batch_history *history; 166 struct list_head node; 167 uint32_t cost; 168 uint64_t samples_passed; 169 }; 170 171 void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev); 172 void fd_autotune_fini(struct fd_autotune *at); 173 174 struct fd_batch; 175 bool fd_autotune_use_bypass(struct fd_autotune *at, 176 struct fd_batch *batch) assert_dt; 177 178 #endif /* FREEDRENO_AUTOTUNE_H */ 179