/*
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "tu_util.h"

#include <errno.h>
#include <stdarg.h>

#include "common/freedreno_rd_output.h"
#include "util/u_math.h"
#include "util/timespec.h"
#include "util/os_file_notify.h"
#include "vk_enum_to_str.h"

#include "tu_device.h"
#include "tu_pass.h"

static const struct debug_control tu_debug_options[] = {
   { "startup", TU_DEBUG_STARTUP },
   { "nir", TU_DEBUG_NIR },
   { "nobin", TU_DEBUG_NOBIN },
   { "sysmem", TU_DEBUG_SYSMEM },
   { "gmem", TU_DEBUG_GMEM },
   { "forcebin", TU_DEBUG_FORCEBIN },
   { "layout", TU_DEBUG_LAYOUT },
   { "noubwc", TU_DEBUG_NOUBWC },
   { "nomultipos", TU_DEBUG_NOMULTIPOS },
   { "nolrz", TU_DEBUG_NOLRZ },
   { "nolrzfc", TU_DEBUG_NOLRZFC },
   { "perf", TU_DEBUG_PERF },
   { "perfc", TU_DEBUG_PERFC },
   { "flushall", TU_DEBUG_FLUSHALL },
   { "syncdraw", TU_DEBUG_SYNCDRAW },
   { "push_consts_per_stage", TU_DEBUG_PUSH_CONSTS_PER_STAGE },
   { "rast_order", TU_DEBUG_RAST_ORDER },
   { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
   { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
   { "dynamic", TU_DEBUG_DYNAMIC },
   { "bos", TU_DEBUG_BOS },
   { "3d_load", TU_DEBUG_3D_LOAD },
   { "fdm", TU_DEBUG_FDM },
   { "noconform", TU_DEBUG_NOCONFORM },
   { "rd", TU_DEBUG_RD },
   { "hiprio", TU_DEBUG_HIPRIO },
   { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
   { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
   { "dumpas", TU_DEBUG_DUMPAS },
   { NULL, 0 }
};
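
/* The options above are parsed from the comma-separated TU_DEBUG environment
 * variable, e.g. (hypothetical invocation):
 *
 *    TU_DEBUG=startup,nobin,perf ./my_vulkan_app
 *
 * using the token names listed in the table.
 */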

/*
 * The runtime debug flags are the subset of debug flags that can safely be
 * changed at runtime. Flags that depend on the running state of the driver,
 * the application or the hardware, and that would break when toggled, should
 * not be included here.
 * Note: Keep in sync with the list of flags in 'docs/drivers/freedreno.rst'.
 */
const uint32_t tu_runtime_debug_flags =
   TU_DEBUG_NIR | TU_DEBUG_NOBIN | TU_DEBUG_SYSMEM | TU_DEBUG_GMEM |
   TU_DEBUG_FORCEBIN | TU_DEBUG_LAYOUT | TU_DEBUG_NOLRZ | TU_DEBUG_NOLRZFC |
   TU_DEBUG_PERF | TU_DEBUG_FLUSHALL | TU_DEBUG_SYNCDRAW |
   TU_DEBUG_RAST_ORDER | TU_DEBUG_UNALIGNED_STORE |
   TU_DEBUG_LOG_SKIP_GMEM_OPS | TU_DEBUG_3D_LOAD | TU_DEBUG_FDM |
   TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES;

os_file_notifier_t tu_debug_notifier;
struct tu_env tu_env;
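
/* File-notifier callback for TU_DEBUG_FILE: whenever the watched file
 * changes, re-read it, re-parse the debug tokens, mask them to the flags
 * that support runtime toggling, and publish the result (combined with the
 * immutable TU_DEBUG flags) with a release store so other threads pick it
 * up.
 */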
static void
tu_env_notify(
   void *data, const char *path, bool created, bool deleted, bool dir_deleted)
{
   int file_flags = 0;
   if (!deleted) {
      FILE *file = fopen(path, "r");
      if (file) {
         char buf[512];
         size_t len = fread(buf, 1, sizeof(buf) - 1, file);
         fclose(file);
         buf[len] = '\0';

         file_flags = parse_debug_string(buf, tu_debug_options);
      }
   }

   int runtime_flags = file_flags & tu_runtime_debug_flags;
   if (unlikely(runtime_flags != file_flags)) {
      mesa_logw(
         "Certain options in TU_DEBUG_FILE don't support runtime changes: 0x%x, ignoring",
         file_flags & ~tu_runtime_debug_flags);
   }

   tu_env.debug.store(runtime_flags | tu_env.env_debug, std::memory_order_release);

   if (unlikely(dir_deleted))
      mesa_logw(
         "Directory containing TU_DEBUG_FILE (%s) was deleted, stopping watching",
         path);
}

static void
tu_env_deinit(void)
{
   if (tu_debug_notifier)
      os_file_notifier_destroy(tu_debug_notifier);
}

static void
tu_env_init_once(void)
{
   tu_env.debug = parse_debug_string(os_get_option("TU_DEBUG"), tu_debug_options);
   tu_env.env_debug = tu_env.debug & ~tu_runtime_debug_flags;

   if (TU_DEBUG(STARTUP))
      mesa_logi("TU_DEBUG=0x%x", tu_env.env_debug);

   /* TU_DEBUG=rd functionality was moved to fd_rd_output. This debug option
    * should translate to the basic-level FD_RD_DUMP_ENABLE option.
    */
   if (TU_DEBUG(RD))
      fd_rd_dump_env.flags |= FD_RD_DUMP_ENABLE;
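
   /* TU_DEBUG_FILE names a file whose contents are watched and parsed like
    * TU_DEBUG, so the runtime-safe flags can be flipped while the
    * application is running. Hypothetical example:
    *
    *    TU_DEBUG_FILE=/tmp/tu_debug ./my_vulkan_app &
    *    echo "sysmem,nolrz" > /tmp/tu_debug
    */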
   const char *debug_file = os_get_option("TU_DEBUG_FILE");
   if (debug_file) {
      if (tu_env.debug != tu_env.env_debug) {
         mesa_logw("TU_DEBUG_FILE is set (%s), but TU_DEBUG is also set. "
                   "Any runtime options (0x%x) in TU_DEBUG will be ignored.",
                   debug_file, tu_env.debug & ~tu_runtime_debug_flags);
      }

      if (TU_DEBUG(STARTUP))
         mesa_logi("Watching TU_DEBUG_FILE: %s", debug_file);

      const char *error_str = "Unknown error";
      tu_debug_notifier =
         os_file_notifier_create(debug_file, tu_env_notify, NULL, &error_str);
      if (!tu_debug_notifier)
         mesa_logw("Failed to watch TU_DEBUG_FILE (%s): %s", debug_file, error_str);
   } else {
      tu_debug_notifier = NULL;
   }

   atexit(tu_env_deinit);
}

void
tu_env_init(void)
{
   fd_rd_dump_env_init();

   static once_flag once = ONCE_FLAG_INIT;
   call_once(&once, tu_env_init_once);
}

void PRINTFLIKE(3, 4)
__tu_finishme(const char *file, int line, const char *format, ...)
{
   va_list ap;
   char buffer[256];

   va_start(ap, format);
   vsnprintf(buffer, sizeof(buffer), format, ap);
   va_end(ap);

   mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
}

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    const char *file,
                    int line,
                    const char *format,
                    ...)
{
   va_list ap;
   char buffer[256];

   const char *error_str = vk_Result_to_str(error);

   if (format) {
      va_start(ap, format);
      vsnprintf(buffer, sizeof(buffer), format, ap);
      va_end(ap);

      mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
   } else {
      mesa_loge("%s:%d: %s\n", file, line, error_str);
   }

   return error;
}

static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
                                    const struct tu_device *dev,
                                    const struct tu_render_pass *pass,
                                    enum tu_gmem_layout gmem_layout)
{
   const uint32_t tile_align_w = pass->tile_align_w;
   uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
   struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];

   *tiling = (struct tu_tiling_config) {
      /* Put in dummy values that will trigger assertion failures in the
       * register setup that uses them, since no GMEM work should be done
       * when GMEM rendering is not possible.
       */
      .tile0 = (VkExtent2D) { ~0, ~0 },
      .tile_count = (VkExtent2D) { .width = 1, .height = 1 },
      .possible = false,
   };

   /* From the Vulkan 1.3.232 spec, under VkFramebufferCreateInfo:
    *
    *    If the render pass uses multiview, then layers must be one and each
    *    attachment requires a number of layers that is greater than the
    *    maximum bit index set in the view mask in the subpasses in which it
    *    is used.
    */

   uint32_t layers = MAX2(fb->layers, pass->num_views);

   /* If there is more than one layer, we need to make sure that the layer
    * stride is expressible as an offset in RB_BLIT_BASE_GMEM which ignores
    * the low 12 bits. The layer stride seems to be implicitly calculated from
    * the tile width and height so we need to adjust one of them.
    */
   const uint32_t gmem_align_log2 = 12;
   const uint32_t gmem_align = 1 << gmem_align_log2;
   uint32_t min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
   if (layers > 1 && align(min_layer_stride, gmem_align) != min_layer_stride) {
      /* Make sure that min_layer_stride is a multiple of gmem_align. Because
       * gmem_align is a power of two and min_layer_stride isn't already a
       * multiple of gmem_align, this is equivalent to shifting tile_align_h
       * until the number of 0 bits at the bottom of min_layer_stride is at
       * least gmem_align_log2.
       */
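      /* Worked example with assumed values: tile_align_w = 32,
       * tile_align_h = 16 and min_cpp = 4 give min_layer_stride = 2048,
       * whose lowest set bit is 11, so tile_align_h is shifted left by
       * 12 - 11 = 1 (to 32) and the stride becomes 4096, a multiple of
       * gmem_align.
       */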
      tile_align_h <<= gmem_align_log2 - (ffs(min_layer_stride) - 1);

      /* Check that we did the math right. */
      min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
      assert(align(min_layer_stride, gmem_align) == min_layer_stride);
   }

   /* Rendering will be forced to sysmem, so don't bother trying to compute a
    * valid tile config.
    * TODO: just skip all GMEM stuff when sysmem is forced?
    */
   if (!pass->gmem_pixels[gmem_layout])
      return;

   uint32_t best_tile_count = ~0;
   VkExtent2D tile_count;
   VkExtent2D tile_size;
   /* There aren't that many different tile widths possible, so just walk all
    * of them finding which produces the lowest number of bins.
    */
   const uint32_t max_tile_width = MIN2(
      dev->physical_device->info->tile_max_w, util_align_npot(fb->width, tile_align_w));
   const uint32_t max_tile_height =
      MIN2(dev->physical_device->info->tile_max_h,
           align(fb->height, tile_align_h));
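   /* For each candidate width, pick the tallest tile height that still fits
    * the per-layer GMEM budget, clamped to the hardware maximum and rounded
    * down to the tile-height alignment, then count the bins that produces.
    */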
   for (tile_size.width = tile_align_w; tile_size.width <= max_tile_width;
        tile_size.width += tile_align_w) {
      tile_size.height = pass->gmem_pixels[gmem_layout] / (tile_size.width * layers);
      tile_size.height = MIN2(tile_size.height, max_tile_height);
      tile_size.height = ROUND_DOWN_TO(tile_size.height, tile_align_h);
      if (!tile_size.height)
         continue;

      tile_count.width = DIV_ROUND_UP(fb->width, tile_size.width);
      tile_count.height = DIV_ROUND_UP(fb->height, tile_size.height);

      /* Drop the height of the tile down to split tiles more evenly across
       * the screen for a given tile count.
       */
      tile_size.height =
         align(DIV_ROUND_UP(fb->height, tile_count.height), tile_align_h);

      /* Pick the layout with the minimum number of bins (lowest CP overhead
       * and amount of cache flushing), but the most square tiles in the case
       * of a tie (likely highest cache locality).
       */
      if (tile_count.width * tile_count.height < best_tile_count ||
          (tile_count.width * tile_count.height == best_tile_count &&
           abs((int)(tile_size.width - tile_size.height)) <
              abs((int)(tiling->tile0.width - tiling->tile0.height)))) {
         tiling->possible = true;
         tiling->tile0 = tile_size;
         tiling->tile_count = tile_count;
         best_tile_count = tile_count.width * tile_count.height;
      }
   }

   /* If forcing binning, try to get at least 2 tiles in each direction. */
   if (TU_DEBUG(FORCEBIN) && tiling->possible) {
      if (tiling->tile_count.width == 1 && tiling->tile0.width != tile_align_w) {
         tiling->tile0.width = util_align_npot(DIV_ROUND_UP(tiling->tile0.width, 2), tile_align_w);
         tiling->tile_count.width = 2;
      }
      if (tiling->tile_count.height == 1 && tiling->tile0.height != tile_align_h) {
         tiling->tile0.height = align(DIV_ROUND_UP(tiling->tile0.height, 2), tile_align_h);
         tiling->tile_count.height = 2;
      }
   }
}

static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
                                    const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;

   /* start from 1 tile per pipe */
   tiling->pipe0 = (VkExtent2D) {
      .width = 1,
      .height = 1,
   };
   tiling->pipe_count = tiling->tile_count;
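
   /* Grow the per-pipe tile footprint one tile at a time along whichever
    * dimension is currently smaller (keeping pipes roughly square) until the
    * pipe grid fits within the hardware's VSC pipe count.
    */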
   while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
      if (tiling->pipe0.width < tiling->pipe0.height) {
         tiling->pipe0.width += 1;
         tiling->pipe_count.width =
            DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
      } else {
         tiling->pipe0.height += 1;
         tiling->pipe_count.height =
            DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
      }
   }
}

static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
                              const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;
   const uint32_t used_pipe_count =
      tiling->pipe_count.width * tiling->pipe_count.height;
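   /* Pipes in the last column/row only cover the leftover tiles. */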
   const VkExtent2D last_pipe = {
      .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
      .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
   };

   assert(used_pipe_count <= max_pipe_count);
   assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));

   for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
      for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
         const uint32_t pipe_x = tiling->pipe0.width * x;
         const uint32_t pipe_y = tiling->pipe0.height * y;
         const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
                                    ? last_pipe.width
                                    : tiling->pipe0.width;
         const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
                                    ? last_pipe.height
                                    : tiling->pipe0.height;
         const uint32_t n = tiling->pipe_count.width * y + x;

         tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
                                  A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
                                  A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
                                  A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
         tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
      }
   }

   memset(tiling->pipe_config + used_pipe_count, 0,
          sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}

static bool
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
   /* Similar to older gens, # of tiles per pipe cannot be more than 32.
    * But there are no hangs with 16 or more tiles per pipe in either
    * X or Y direction, so that limit does not seem to apply.
    */
   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
   return tiles_per_pipe <= 32;
}

static void
tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
{
   tiling->binning_possible = is_hw_binning_possible(tiling);

   if (tiling->binning_possible) {
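      /* Only enable binning when there are enough tiles for the extra
       * binning pass to be worthwhile.
       */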
      tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         tiling->binning = true;
      if (TU_DEBUG(NOBIN))
         tiling->binning = false;
   } else {
      tiling->binning = false;
   }
}

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
      tu_tiling_config_update_tile_layout(fb, device, pass,
                                          (enum tu_gmem_layout) gmem_layout);
      if (!tiling->possible)
         continue;

      tu_tiling_config_update_pipe_layout(tiling, device);
      tu_tiling_config_update_pipes(tiling, device);
      tu_tiling_config_update_binning(tiling, device);
   }
}

void
tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
{
   static uint32_t last_skipped_loads = 0;
   static uint32_t last_skipped_stores = 0;
   static uint32_t last_total_loads = 0;
   static uint32_t last_total_stores = 0;
   static struct timespec last_time = {};
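
   /* The counters live in the global BO, so read them under submit_mutex and
    * rate-limit the log output to roughly once per second.
    */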
   pthread_mutex_lock(&device->submit_mutex);

   struct timespec current_time;
   clock_gettime(CLOCK_MONOTONIC, &current_time);

   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
      last_time = current_time;
   } else {
      pthread_mutex_unlock(&device->submit_mutex);
      return;
   }

   struct tu6_global *global = device->global_bo_map;

   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
   uint32_t current_total_loads = global->dbg_gmem_total_loads;
   uint32_t current_total_stores = global->dbg_gmem_total_stores;

   uint32_t skipped_loads = current_total_loads - current_taken_loads;
   uint32_t skipped_stores = current_total_stores - current_taken_stores;

   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;

   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;

   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
             current_time_frame_total_loads,
             current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
             current_time_frame_total_stores,
             current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);

   last_skipped_loads = skipped_loads;
   last_skipped_stores = skipped_stores;
   last_total_loads = current_total_loads;
   last_total_stores = current_total_stores;

   pthread_mutex_unlock(&device->submit_mutex);
}