/*
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "tu_util.h"

#include <errno.h>
#include <stdarg.h>

#include "common/freedreno_rd_output.h"
#include "util/u_math.h"
#include "util/timespec.h"
#include "vk_enum_to_str.h"

#include "tu_device.h"
#include "tu_pass.h"

static const struct debug_control tu_debug_options[] = {
   { "startup", TU_DEBUG_STARTUP },
   { "nir", TU_DEBUG_NIR },
   { "nobin", TU_DEBUG_NOBIN },
   { "sysmem", TU_DEBUG_SYSMEM },
   { "gmem", TU_DEBUG_GMEM },
   { "forcebin", TU_DEBUG_FORCEBIN },
   { "layout", TU_DEBUG_LAYOUT },
   { "noubwc", TU_DEBUG_NOUBWC },
   { "nomultipos", TU_DEBUG_NOMULTIPOS },
   { "nolrz", TU_DEBUG_NOLRZ },
   { "nolrzfc", TU_DEBUG_NOLRZFC },
   { "perf", TU_DEBUG_PERF },
   { "perfc", TU_DEBUG_PERFC },
   { "flushall", TU_DEBUG_FLUSHALL },
   { "syncdraw", TU_DEBUG_SYNCDRAW },
   { "push_consts_per_stage", TU_DEBUG_PUSH_CONSTS_PER_STAGE },
   { "rast_order", TU_DEBUG_RAST_ORDER },
   { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
   { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
   { "dynamic", TU_DEBUG_DYNAMIC },
   { "bos", TU_DEBUG_BOS },
   { "3d_load", TU_DEBUG_3D_LOAD },
   { "fdm", TU_DEBUG_FDM },
   { "noconform", TU_DEBUG_NOCONFORM },
   { "rd", TU_DEBUG_RD },
   { "hiprio", TU_DEBUG_HIPRIO },
   { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
   { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
   { NULL, 0 }
};
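
/* parse_debug_string() matches these names against a comma-separated list,
 * so several options can be combined; e.g. a hypothetical invocation:
 *
 *    TU_DEBUG=startup,forcebin ./app
 */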

struct tu_env tu_env;

static void
tu_env_init_once(void)
{
   tu_env.debug = parse_debug_string(os_get_option("TU_DEBUG"),
                                     tu_debug_options);

   if (TU_DEBUG(STARTUP))
      mesa_logi("TU_DEBUG=0x%x", tu_env.debug);

   /* TU_DEBUG=rd functionality was moved to fd_rd_output. This debug option
    * should translate to the basic-level FD_RD_DUMP_ENABLE option.
    */
   if (TU_DEBUG(RD))
      fd_rd_dump_env.flags |= FD_RD_DUMP_ENABLE;
}

void
tu_env_init(void)
{
   fd_rd_dump_env_init();

   static once_flag once = ONCE_FLAG_INIT;
   call_once(&once, tu_env_init_once);
}

void PRINTFLIKE(3, 4)
__tu_finishme(const char *file, int line, const char *format, ...)
{
   va_list ap;
   char buffer[256];

   va_start(ap, format);
   vsnprintf(buffer, sizeof(buffer), format, ap);
   va_end(ap);

   mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
}

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    const char *file,
                    int line,
                    const char *format,
                    ...)
{
   va_list ap;
   char buffer[256];

   const char *error_str = vk_Result_to_str(error);

   if (format) {
      va_start(ap, format);
      vsnprintf(buffer, sizeof(buffer), format, ap);
      va_end(ap);

      mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
   } else {
      mesa_loge("%s:%d: %s\n", file, line, error_str);
   }

   return error;
}

static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
                                    const struct tu_device *dev,
                                    const struct tu_render_pass *pass,
                                    enum tu_gmem_layout gmem_layout)
{
   const uint32_t tile_align_w = pass->tile_align_w;
   uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
   struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];

   *tiling = (struct tu_tiling_config) {
      /* Put in dummy values that will cause assertion failures in the
       * register setup that uses them, since you shouldn't be doing gmem
       * work if gmem is not possible.
       */
      .tile0 = (VkExtent2D) { ~0, ~0 },
      .tile_count = (VkExtent2D) { .width = 1, .height = 1 },
      .possible = false,
   };

   /* From the Vulkan 1.3.232 spec, under VkFramebufferCreateInfo:
    *
    *    If the render pass uses multiview, then layers must be one and each
    *    attachment requires a number of layers that is greater than the
    *    maximum bit index set in the view mask in the subpasses in which it
    *    is used.
    */
   uint32_t layers = MAX2(fb->layers, pass->num_views);

   /* If there is more than one layer, we need to make sure that the layer
    * stride is expressible as an offset in RB_BLIT_BASE_GMEM, which ignores
    * the low 12 bits. The layer stride seems to be implicitly calculated
    * from the tile width and height, so we need to adjust one of them.
    */
   const uint32_t gmem_align_log2 = 12;
   const uint32_t gmem_align = 1 << gmem_align_log2;
   uint32_t min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
   if (layers > 1 && align(min_layer_stride, gmem_align) != min_layer_stride) {
      /* Make sure that min_layer_stride is a multiple of gmem_align. Because
       * gmem_align is a power of two and min_layer_stride isn't already a
       * multiple of gmem_align, this is equivalent to shifting tile_align_h
       * until the number of 0 bits at the bottom of min_layer_stride is at
       * least gmem_align_log2.
       */
      tile_align_h <<= gmem_align_log2 - (ffs(min_layer_stride) - 1);

      /* Check that we did the math right. */
      min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
      assert(align(min_layer_stride, gmem_align) == min_layer_stride);
   }
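
   /* Worked example with hypothetical values: tile_align_w = 32,
    * tile_align_h = 16 and min_cpp = 4 give min_layer_stride =
    * 16 * 32 * 4 = 2048, which is not a multiple of gmem_align = 4096.
    * ffs(2048) - 1 = 11, so tile_align_h is shifted left by 12 - 11 = 1,
    * after which min_layer_stride = 32 * 32 * 4 = 4096 and the assert holds.
    */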

   /* This will force sysmem; don't bother trying to have a valid tile
    * config. TODO: just skip all GMEM stuff when sysmem is forced?
    */
   if (!pass->gmem_pixels[gmem_layout])
      return;

   uint32_t best_tile_count = ~0;
   VkExtent2D tile_count;
   VkExtent2D tile_size;
   /* There aren't that many different tile widths possible, so just walk
    * all of them to find which produces the lowest number of bins.
    */
   const uint32_t max_tile_width =
      MIN2(dev->physical_device->info->tile_max_w,
           util_align_npot(fb->width, tile_align_w));
   const uint32_t max_tile_height =
      MIN2(dev->physical_device->info->tile_max_h,
           align(fb->height, tile_align_h));
   for (tile_size.width = tile_align_w; tile_size.width <= max_tile_width;
        tile_size.width += tile_align_w) {
      tile_size.height =
         pass->gmem_pixels[gmem_layout] / (tile_size.width * layers);
      tile_size.height = MIN2(tile_size.height, max_tile_height);
      tile_size.height = ROUND_DOWN_TO(tile_size.height, tile_align_h);
      if (!tile_size.height)
         continue;

      tile_count.width = DIV_ROUND_UP(fb->width, tile_size.width);
      tile_count.height = DIV_ROUND_UP(fb->height, tile_size.height);

      /* Drop the height of the tile down to split tiles more evenly across
       * the screen for a given tile count.
       */
      tile_size.height =
         align(DIV_ROUND_UP(fb->height, tile_count.height), tile_align_h);

      /* Pick the layout with the minimum number of bins (lowest CP overhead
       * and amount of cache flushing), but the most square tiles in the case
       * of a tie (likely highest cache locality).
       */
      if (tile_count.width * tile_count.height < best_tile_count ||
          (tile_count.width * tile_count.height == best_tile_count &&
           abs((int)(tile_size.width - tile_size.height)) <
              abs((int)(tiling->tile0.width - tiling->tile0.height)))) {
         tiling->possible = true;
         tiling->tile0 = tile_size;
         tiling->tile_count = tile_count;
         best_tile_count = tile_count.width * tile_count.height;
      }
   }
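
   /* Worked example of the even-split step, with hypothetical values: for
    * fb = 1920x1080, tile_align_h = 16 and a candidate tile height of 512,
    * tile_count.height = DIV_ROUND_UP(1080, 512) = 3. The height is then
    * rebalanced to align(DIV_ROUND_UP(1080, 3), 16) = 368, so the three
    * rows of tiles are 368 + 368 + 344 pixels tall rather than
    * 512 + 512 + 56.
    */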

   /* If forcing binning, try to get at least 2 tiles in each direction. */
   if (TU_DEBUG(FORCEBIN) && tiling->possible) {
      if (tiling->tile_count.width == 1 && tiling->tile0.width != tile_align_w) {
         tiling->tile0.width =
            util_align_npot(DIV_ROUND_UP(tiling->tile0.width, 2), tile_align_w);
         tiling->tile_count.width = 2;
      }
      if (tiling->tile_count.height == 1 && tiling->tile0.height != tile_align_h) {
         tiling->tile0.height =
            align(DIV_ROUND_UP(tiling->tile0.height, 2), tile_align_h);
         tiling->tile_count.height = 2;
      }
   }
}

static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
                                    const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;

   /* start from 1 tile per pipe */
   tiling->pipe0 = (VkExtent2D) {
      .width = 1,
      .height = 1,
   };
   tiling->pipe_count = tiling->tile_count;

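   /* Grow the pipe size, preferring the smaller dimension so pipes stay
    * roughly square, until the available VSC pipes cover all tiles. E.g.
    * (hypothetical values), tile_count = 10x6 with num_vsc_pipes = 32
    * starts at pipe_count = 60; pipe0.width == pipe0.height, so the height
    * grows first, giving pipe0 = 1x2 and pipe_count = 10x3 = 30 <= 32.
    */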
   while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
      if (tiling->pipe0.width < tiling->pipe0.height) {
         tiling->pipe0.width += 1;
         tiling->pipe_count.width =
            DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
      } else {
         tiling->pipe0.height += 1;
         tiling->pipe_count.height =
            DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
      }
   }
}

static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
                              const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;
   const uint32_t used_pipe_count =
      tiling->pipe_count.width * tiling->pipe_count.height;
   const VkExtent2D last_pipe = {
      .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
      .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
   };
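   /* last_pipe is the size, in tiles, of the rightmost/bottommost pipes,
    * which may be partial. E.g. (hypothetical values), tile_count = 10x6
    * with pipe0 = 1x2 gives last_pipe = { (10 - 1) % 1 + 1,
    * (6 - 1) % 2 + 1 } = 1x2, i.e. fully covered in this case.
    */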

   assert(used_pipe_count <= max_pipe_count);
   assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));

   for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
      for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
         const uint32_t pipe_x = tiling->pipe0.width * x;
         const uint32_t pipe_y = tiling->pipe0.height * y;
         const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
                                    ? last_pipe.width
                                    : tiling->pipe0.width;
         const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
                                    ? last_pipe.height
                                    : tiling->pipe0.height;
         const uint32_t n = tiling->pipe_count.width * y + x;

         tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
                                  A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
                                  A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
                                  A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
         tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
      }
   }

   memset(tiling->pipe_config + used_pipe_count, 0,
          sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}

static bool
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
   /* Similar to older gens, # of tiles per pipe cannot be more than 32.
    * But there are no hangs with 16 or more tiles per pipe in either
    * X or Y direction, so that limit does not seem to apply.
    */
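   /* E.g. (hypothetical values): pipe0 = 4x8 gives 32 tiles per pipe and
    * still allows binning, while pipe0 = 4x9 = 36 would not.
    */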
   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
   return tiles_per_pipe <= 32;
}

static void
tu_tiling_config_update_binning(struct tu_tiling_config *tiling,
                                const struct tu_device *device)
{
   tiling->binning_possible = is_hw_binning_possible(tiling);

   if (tiling->binning_possible) {
      tiling->binning =
         (tiling->tile_count.width * tiling->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         tiling->binning = true;
      if (TU_DEBUG(NOBIN))
         tiling->binning = false;
   } else {
      tiling->binning = false;
   }
}

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
      tu_tiling_config_update_tile_layout(fb, device, pass,
                                          (enum tu_gmem_layout) gmem_layout);
      if (!tiling->possible)
         continue;

      tu_tiling_config_update_pipe_layout(tiling, device);
      tu_tiling_config_update_pipes(tiling, device);
      tu_tiling_config_update_binning(tiling, device);
   }
}

void
tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
{
   static uint32_t last_skipped_loads = 0;
   static uint32_t last_skipped_stores = 0;
   static uint32_t last_total_loads = 0;
   static uint32_t last_total_stores = 0;
   static struct timespec last_time = {};

   pthread_mutex_lock(&device->submit_mutex);

   struct timespec current_time;
   clock_gettime(CLOCK_MONOTONIC, &current_time);

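   /* Rate-limit the logging below to at most once per second. */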
   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
      last_time = current_time;
   } else {
      pthread_mutex_unlock(&device->submit_mutex);
      return;
   }

   struct tu6_global *global = device->global_bo_map;

   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
   uint32_t current_total_loads = global->dbg_gmem_total_loads;
   uint32_t current_total_stores = global->dbg_gmem_total_stores;

   uint32_t skipped_loads = current_total_loads - current_taken_loads;
   uint32_t skipped_stores = current_total_stores - current_taken_stores;

   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;

   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;

   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
             current_time_frame_total_loads,
             current_time_frame_skipped_loads /
                (float) current_time_frame_total_loads * 100.f);
   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
             current_time_frame_total_stores,
             current_time_frame_skipped_stores /
                (float) current_time_frame_total_stores * 100.f);

   last_skipped_loads = skipped_loads;
   last_skipped_stores = skipped_stores;
   last_total_loads = current_total_loads;
   last_total_stores = current_total_stores;

   pthread_mutex_unlock(&device->submit_mutex);
}