/*
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "tu_util.h"

#include <errno.h>
#include <stdarg.h>

#include "common/freedreno_rd_output.h"
#include "util/u_math.h"
#include "util/timespec.h"
#include "util/os_file_notify.h"
#include "vk_enum_to_str.h"

#include "tu_device.h"
#include "tu_pass.h"

static const struct debug_control tu_debug_options[] = {
   { "startup", TU_DEBUG_STARTUP },
   { "nir", TU_DEBUG_NIR },
   { "nobin", TU_DEBUG_NOBIN },
   { "sysmem", TU_DEBUG_SYSMEM },
   { "gmem", TU_DEBUG_GMEM },
   { "forcebin", TU_DEBUG_FORCEBIN },
   { "layout", TU_DEBUG_LAYOUT },
   { "noubwc", TU_DEBUG_NOUBWC },
   { "nomultipos", TU_DEBUG_NOMULTIPOS },
   { "nolrz", TU_DEBUG_NOLRZ },
   { "nolrzfc", TU_DEBUG_NOLRZFC },
   { "perf", TU_DEBUG_PERF },
   { "perfc", TU_DEBUG_PERFC },
   { "flushall", TU_DEBUG_FLUSHALL },
   { "syncdraw", TU_DEBUG_SYNCDRAW },
   { "push_consts_per_stage", TU_DEBUG_PUSH_CONSTS_PER_STAGE },
   { "rast_order", TU_DEBUG_RAST_ORDER },
   { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
   { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
   { "dynamic", TU_DEBUG_DYNAMIC },
   { "bos", TU_DEBUG_BOS },
   { "3d_load", TU_DEBUG_3D_LOAD },
   { "fdm", TU_DEBUG_FDM },
   { "noconform", TU_DEBUG_NOCONFORM },
   { "rd", TU_DEBUG_RD },
   { "hiprio", TU_DEBUG_HIPRIO },
   { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
   { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
   { "dumpas", TU_DEBUG_DUMPAS },
   { NULL, 0 }
};

/*
 * The runtime debug flags are the subset of debug flags that can safely be
 * toggled at runtime. Flags whose effect depends on the running state of the
 * driver, the application, or the hardware, and which would therefore break
 * when toggled, must not be included here.
 * Note: Keep in sync with the list of flags in 'docs/drivers/freedreno.rst'.
 */
const uint32_t tu_runtime_debug_flags =
   TU_DEBUG_NIR | TU_DEBUG_NOBIN | TU_DEBUG_SYSMEM | TU_DEBUG_GMEM |
   TU_DEBUG_FORCEBIN | TU_DEBUG_LAYOUT | TU_DEBUG_NOLRZ | TU_DEBUG_NOLRZFC |
   TU_DEBUG_PERF | TU_DEBUG_FLUSHALL | TU_DEBUG_SYNCDRAW |
   TU_DEBUG_RAST_ORDER | TU_DEBUG_UNALIGNED_STORE |
   TU_DEBUG_LOG_SKIP_GMEM_OPS | TU_DEBUG_3D_LOAD | TU_DEBUG_FDM |
   TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES;
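
/* Illustrative usage sketch (editor's example, not from the source): both
 * TU_DEBUG and the file watched via TU_DEBUG_FILE use the comma-separated
 * debug_control syntax handled by parse_debug_string(), e.g.
 *
 *    TU_DEBUG=startup,forcebin ./app
 *    TU_DEBUG_FILE=/tmp/tu_debug ./app &
 *    echo "nolrz,flushall" > /tmp/tu_debug
 *
 * Only the flags listed in tu_runtime_debug_flags above take effect when
 * written to the file; anything else is reported and ignored.
 */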

os_file_notifier_t tu_debug_notifier;
struct tu_env tu_env;

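/* File-notify callback: whenever TU_DEBUG_FILE changes, re-parse it and
 * publish the new runtime flags. If the file is deleted, file_flags stays 0
 * and the runtime flags revert to the TU_DEBUG environment baseline.
 */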
static void
tu_env_notify(
   void *data, const char *path, bool created, bool deleted, bool dir_deleted)
{
   int file_flags = 0;
   if (!deleted) {
      FILE *file = fopen(path, "r");
      if (file) {
         char buf[512];
         size_t len = fread(buf, 1, sizeof(buf) - 1, file);
         fclose(file);
         buf[len] = '\0';

         file_flags = parse_debug_string(buf, tu_debug_options);
      }
   }

   int runtime_flags = file_flags & tu_runtime_debug_flags;
   if (unlikely(runtime_flags != file_flags)) {
      mesa_logw(
         "Certain options in TU_DEBUG_FILE don't support runtime changes: 0x%x, ignoring",
         file_flags & ~tu_runtime_debug_flags);
   }

   tu_env.debug.store(runtime_flags | tu_env.env_debug, std::memory_order_release);

   if (unlikely(dir_deleted))
      mesa_logw(
         "Directory containing TU_DEBUG_FILE (%s) was deleted, stopping watching",
         path);
}

static void
tu_env_deinit(void)
{
   if (tu_debug_notifier)
      os_file_notifier_destroy(tu_debug_notifier);
}

static void
tu_env_init_once(void)
{
   tu_env.debug = parse_debug_string(os_get_option("TU_DEBUG"), tu_debug_options);
   tu_env.env_debug = tu_env.debug & ~tu_runtime_debug_flags;

   if (TU_DEBUG(STARTUP))
      mesa_logi("TU_DEBUG=0x%x", tu_env.env_debug);

   /* TU_DEBUG=rd functionality was moved to fd_rd_output. This debug option
    * should translate to the basic-level FD_RD_DUMP_ENABLE option.
    */
   if (TU_DEBUG(RD))
      fd_rd_dump_env.flags |= FD_RD_DUMP_ENABLE;

   const char *debug_file = os_get_option("TU_DEBUG_FILE");
   if (debug_file) {
      if (tu_env.debug != tu_env.env_debug) {
         mesa_logw("TU_DEBUG_FILE is set (%s), but TU_DEBUG is also set. "
                   "Any runtime options (0x%x) in TU_DEBUG will be ignored.",
                   debug_file, tu_env.debug & ~tu_runtime_debug_flags);
      }

      if (TU_DEBUG(STARTUP))
         mesa_logi("Watching TU_DEBUG_FILE: %s", debug_file);

      const char *error_str = "Unknown error";
      tu_debug_notifier =
         os_file_notifier_create(debug_file, tu_env_notify, NULL, &error_str);
      if (!tu_debug_notifier)
         mesa_logw("Failed to watch TU_DEBUG_FILE (%s): %s", debug_file, error_str);
   } else {
      tu_debug_notifier = NULL;
   }

   atexit(tu_env_deinit);
}

void
tu_env_init(void)
{
   fd_rd_dump_env_init();

   static once_flag once = ONCE_FLAG_INIT;
   call_once(&once, tu_env_init_once);
}

void PRINTFLIKE(3, 4)
   __tu_finishme(const char *file, int line, const char *format, ...)
{
   va_list ap;
   char buffer[256];

   va_start(ap, format);
   vsnprintf(buffer, sizeof(buffer), format, ap);
   va_end(ap);

   mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
}
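
/* This is normally reached through a tu_finishme() wrapper macro that
 * supplies __FILE__ and __LINE__ (an assumption here, following the usual
 * Mesa pattern; see tu_util.h), e.g.:
 *
 *    tu_finishme("unsupported query type %d", type);
 */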

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    const char *file,
                    int line,
                    const char *format,
                    ...)
{
   va_list ap;
   char buffer[256];

   const char *error_str = vk_Result_to_str(error);

   if (format) {
      va_start(ap, format);
      vsnprintf(buffer, sizeof(buffer), format, ap);
      va_end(ap);

      mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
   } else {
      mesa_loge("%s:%d: %s\n", file, line, error_str);
   }

   return error;
}

static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
                                    const struct tu_device *dev,
                                    const struct tu_render_pass *pass,
                                    enum tu_gmem_layout gmem_layout)
{
   const uint32_t tile_align_w = pass->tile_align_w;
   uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
   struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];

   *tiling = (struct tu_tiling_config) {
      /* Put in dummy values that will trigger assertion failures in the
       * register setup that uses them, since you shouldn't be doing gmem
       * work if gmem is not possible.
       */
      .tile0 = (VkExtent2D) { ~0, ~0 },
      .tile_count = (VkExtent2D) { .width = 1, .height = 1 },
      .possible = false,
   };

   /* From the Vulkan 1.3.232 spec, under VkFramebufferCreateInfo:
    *
    *   If the render pass uses multiview, then layers must be one and each
    *   attachment requires a number of layers that is greater than the
    *   maximum bit index set in the view mask in the subpasses in which it is
    *   used.
    */

   uint32_t layers = MAX2(fb->layers, pass->num_views);

   /* If there is more than one layer, we need to make sure that the layer
    * stride is expressible as an offset in RB_BLIT_BASE_GMEM which ignores
    * the low 12 bits. The layer stride seems to be implicitly calculated from
    * the tile width and height so we need to adjust one of them.
    */
   const uint32_t gmem_align_log2 = 12;
   const uint32_t gmem_align = 1 << gmem_align_log2;
   uint32_t min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
   if (layers > 1 && align(min_layer_stride, gmem_align) != min_layer_stride) {
      /* Make sure that min_layer_stride is a multiple of gmem_align. Because
       * gmem_align is a power of two and min_layer_stride isn't already a
       * multiple of gmem_align, this is equivalent to shifting tile_align_h
       * until the number of 0 bits at the bottom of min_layer_stride is at
       * least gmem_align_log2.
       */
      tile_align_h <<= gmem_align_log2 - (ffs(min_layer_stride) - 1);

      /* Check that we did the math right. */
      min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
      assert(align(min_layer_stride, gmem_align) == min_layer_stride);
   }
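
   /* Worked example with illustrative numbers (not from the source): for
    * tile_align_w = 32, tile_align_h = 16 and min_cpp = 4,
    * min_layer_stride = 16 * 32 * 4 = 2048 = 1 << 11, so ffs() - 1 = 11 and
    * tile_align_h is shifted left by 12 - 11 = 1, giving a layer stride of
    * 32 * 32 * 4 = 4096, a multiple of gmem_align.
    */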

   /* GMEM is impossible for this layout, so rendering will be forced to
    * sysmem; don't bother trying to compute a valid tile config.
    * TODO: just skip all GMEM stuff when sysmem is forced?
    */
   if (!pass->gmem_pixels[gmem_layout])
      return;

   uint32_t best_tile_count = ~0;
   VkExtent2D tile_count;
   VkExtent2D tile_size;
   /* There aren't that many different tile widths possible, so just walk all
    * of them finding which produces the lowest number of bins.
    */
   const uint32_t max_tile_width = MIN2(
      dev->physical_device->info->tile_max_w, util_align_npot(fb->width, tile_align_w));
   const uint32_t max_tile_height =
      MIN2(dev->physical_device->info->tile_max_h,
           align(fb->height, tile_align_h));
   for (tile_size.width = tile_align_w; tile_size.width <= max_tile_width;
        tile_size.width += tile_align_w) {
      tile_size.height = pass->gmem_pixels[gmem_layout] / (tile_size.width * layers);
      tile_size.height = MIN2(tile_size.height, max_tile_height);
      tile_size.height = ROUND_DOWN_TO(tile_size.height, tile_align_h);
      if (!tile_size.height)
         continue;

      tile_count.width = DIV_ROUND_UP(fb->width, tile_size.width);
      tile_count.height = DIV_ROUND_UP(fb->height, tile_size.height);

      /* Drop the height of the tile down to split tiles more evenly across the
       * screen for a given tile count.
       */
      tile_size.height =
         align(DIV_ROUND_UP(fb->height, tile_count.height), tile_align_h);

      /* Pick the layout with the minimum number of bins (lowest CP overhead
       * and amount of cache flushing), but the most square tiles in the case
       * of a tie (likely highest cache locality).
       */
      if (tile_count.width * tile_count.height < best_tile_count ||
          (tile_count.width * tile_count.height == best_tile_count &&
           abs((int)(tile_size.width - tile_size.height)) <
              abs((int)(tiling->tile0.width - tiling->tile0.height)))) {
         tiling->possible = true;
         tiling->tile0 = tile_size;
         tiling->tile_count = tile_count;
         best_tile_count = tile_count.width * tile_count.height;
      }
   }
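
   /* Illustrative example (hypothetical numbers): for a 1920x1088 fb with
    * 32x16 tile alignment, a 320x272 tile layout (6x4 = 24 bins) would win
    * over a 256x368 layout (8x3, also 24 bins), because at equal bin counts
    * the squarer tile is preferred.
    */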

   /* If forcing binning, try to get at least 2 tiles in each direction. */
   if (TU_DEBUG(FORCEBIN) && tiling->possible) {
      if (tiling->tile_count.width == 1 && tiling->tile0.width != tile_align_w) {
         tiling->tile0.width = util_align_npot(DIV_ROUND_UP(tiling->tile0.width, 2), tile_align_w);
         tiling->tile_count.width = 2;
      }
      if (tiling->tile_count.height == 1 && tiling->tile0.height != tile_align_h) {
         tiling->tile0.height = align(DIV_ROUND_UP(tiling->tile0.height, 2), tile_align_h);
         tiling->tile_count.height = 2;
      }
   }
}

static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
                                    const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;

   /* start from 1 tile per pipe */
   tiling->pipe0 = (VkExtent2D) {
      .width = 1,
      .height = 1,
   };
   tiling->pipe_count = tiling->tile_count;

   while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
      if (tiling->pipe0.width < tiling->pipe0.height) {
         tiling->pipe0.width += 1;
         tiling->pipe_count.width =
            DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
      } else {
         tiling->pipe0.height += 1;
         tiling->pipe_count.height =
            DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
      }
   }
}
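
/* Sketch of the loop above with hypothetical numbers: for a 10x8 tile grid
 * and 32 VSC pipes, pipe_count starts at 10x8 = 80. The loop grows pipe0 to
 * 1x2 (pipe_count 10x4 = 40), then 2x2 (pipe_count 5x4 = 20 <= 32), so each
 * pipe ends up covering a 2x2 block of tiles.
 */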

static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
                              const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;
   const uint32_t used_pipe_count =
      tiling->pipe_count.width * tiling->pipe_count.height;
   const VkExtent2D last_pipe = {
      .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
      .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
   };
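
   /* last_pipe holds however many tiles remain in the final row/column of
    * pipes. E.g. (hypothetical numbers): 10 tiles across with pipe0.width = 2
    * gives (10 - 1) % 2 + 1 = 2 (the last pipe is full), while 9 tiles give
    * (9 - 1) % 2 + 1 = 1.
    */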

   assert(used_pipe_count <= max_pipe_count);
   assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));

   for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
      for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
         const uint32_t pipe_x = tiling->pipe0.width * x;
         const uint32_t pipe_y = tiling->pipe0.height * y;
         const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
                                    ? last_pipe.width
                                    : tiling->pipe0.width;
         const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
                                    ? last_pipe.height
                                    : tiling->pipe0.height;
         const uint32_t n = tiling->pipe_count.width * y + x;

         tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
                                  A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
                                  A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
                                  A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
         tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
      }
   }

   memset(tiling->pipe_config + used_pipe_count, 0,
          sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}

static bool
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
   /* Similar to older gens, # of tiles per pipe cannot be more than 32.
    * But there are no hangs with 16 or more tiles per pipe in either
    * X or Y direction, so that limit does not seem to apply.
    */
   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
   return tiles_per_pipe <= 32;
}

static void
tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
{
   tiling->binning_possible = is_hw_binning_possible(tiling);

   if (tiling->binning_possible) {
      tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         tiling->binning = true;
      if (TU_DEBUG(NOBIN))
         tiling->binning = false;
   } else {
      tiling->binning = false;
   }
}

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
      tu_tiling_config_update_tile_layout(fb, device, pass,
                                          (enum tu_gmem_layout) gmem_layout);
      if (!tiling->possible)
         continue;

      tu_tiling_config_update_pipe_layout(tiling, device);
      tu_tiling_config_update_pipes(tiling, device);
      tu_tiling_config_update_binning(tiling, device);
   }
}

void
tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
{
   static uint32_t last_skipped_loads = 0;
   static uint32_t last_skipped_stores = 0;
   static uint32_t last_total_loads = 0;
   static uint32_t last_total_stores = 0;
   static struct timespec last_time = {};
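
   /* The counters below are sampled at most once per second (see the
    * timespec check that follows), so the logged percentages are
    * per-interval deltas rather than lifetime totals.
    */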

   pthread_mutex_lock(&device->submit_mutex);

   struct timespec current_time;
   clock_gettime(CLOCK_MONOTONIC, &current_time);

   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
      last_time = current_time;
   } else {
      pthread_mutex_unlock(&device->submit_mutex);
      return;
   }

   struct tu6_global *global = device->global_bo_map;

   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
   uint32_t current_total_loads = global->dbg_gmem_total_loads;
   uint32_t current_total_stores = global->dbg_gmem_total_stores;

   uint32_t skipped_loads = current_total_loads - current_taken_loads;
   uint32_t skipped_stores = current_total_stores - current_taken_stores;

   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;

   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;

   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
         current_time_frame_total_loads,
         current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
         current_time_frame_total_stores,
         current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);

   last_skipped_loads = skipped_loads;
   last_skipped_stores = skipped_stores;
   last_total_loads = current_total_loads;
   last_total_stores = current_total_stores;

   pthread_mutex_unlock(&device->submit_mutex);
}
472