/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without some additional information:
 *
 *  (1) If you know you are touching every pixel (i.e. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1].
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      if there is a sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings.
 *
 *  (4) But a high draw count without much overdraw can actually be faster in
 *      bypass mode if it is pushing a lot of state change, due to not having
 *      to go through the state changes per-tile[1].
 *
 * The approach taken is to measure the samples-passed for the batch to
 * estimate the amount of overdraw, and thereby detect cases where the number
 * of pixels touched is low.
 *
 * [1] Ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that.
 */
struct tu_autotune {

   /* We may have to disable the autotuner if there are too many
    * renderpasses in flight.
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache mapping a renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * have processed the submission results. This can happen after the
    * command buffers that were in the submission are destroyed.
    */
   struct list_head pending_submission_data;

   uint32_t fence_counter;
   uint32_t idx_counter;
};

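/*
 * Illustrative sketch, not part of the driver: one way the tradeoffs
 * described above could translate into a decision. It assumes the caller
 * has already averaged the historical samples-passed values; the helper
 * name and the 2x threshold are made up for illustration (the real
 * heuristic lives in tu_autotune.c), and uint64_t/bool are assumed to
 * come in via tu_common.h.
 */
static inline bool
tu_autotune_example_use_bypass(uint64_t avg_samples_passed,
                               uint32_t fb_width, uint32_t fb_height)
{
   /* Few samples passed relative to the render target size means low
    * overdraw, so GMEM's bandwidth savings are unlikely to pay for the
    * per-tile overhead and bypass (sysmem) is preferred.
    */
   const uint64_t fb_samples = (uint64_t)fb_width * fb_height;
   return avg_samples_passed < 2 * fb_samples;
}
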
/**
 * The samples-passed values captured from the cmdstream are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
 * may force us to revisit that.
 */
struct tu_renderpass_samples {
   uint64_t samples_start;
   /* HW requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

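/*
 * Illustrative compile-time check, not from the driver: given the 128b
 * alignment requirement noted above, each counter should start on a
 * 16-byte boundary within the struct (offsetof comes from <stddef.h>,
 * assumed to be available via tu_common.h).
 */
_Static_assert(offsetof(struct tu_renderpass_samples, samples_start) % 16 == 0,
               "samples_start must be 128-bit aligned");
_Static_assert(offsetof(struct tu_renderpass_samples, samples_end) % 16 == 0,
               "samples_end must be 128-bit aligned");
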
/**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a
 * later time, when the GPU has finished writing the results, we fill
 * samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};

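/*
 * Illustrative sketch, not part of the driver: once the fence shows the
 * GPU has written the counters, the CPU-side math mentioned above (done
 * on the CPU to avoid a WFI) reduces to a simple difference. The helper
 * name is hypothetical.
 */
static inline void
tu_renderpass_result_example_resolve(struct tu_renderpass_result *result)
{
   /* The GPU wrote samples_start/samples_end at the start/end of the
    * batch; the delta is the samples passed during the renderpass.
    */
   result->samples_passed =
      result->samples->samples_end - result->samples->samples_start;
}
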
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

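/*
 * Illustrative submit-time flow, pieced together from the declarations
 * above rather than copied from the driver (dev->autotune as the
 * device's embedded autotuner state is an assumption):
 *
 *    if (tu_autotune_submit_requires_fence(cmd_buffers, count)) {
 *       // attach a fence so the sample counters can be read back later
 *    }
 *    struct tu_cs *autotune_cs =
 *       tu_autotune_on_submit(dev, &dev->autotune, cmd_buffers, count);
 *    // append autotune_cs, if any, to the submission
 */
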
struct tu_autotune_results_buffer;

void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

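/*
 * Illustrative per-renderpass flow, based only on the declarations
 * above (not verbatim driver code): the command buffer asks the
 * autotuner for the bypass-vs-GMEM decision, then brackets the pass so
 * the start/end samples-passed counters land in the result:
 *
 *    struct tu_renderpass_result *result = NULL;
 *    bool use_bypass = tu_autotune_use_bypass(at, cmd, &result);
 *    tu_autotune_begin_renderpass(cmd, cs, result);
 *    // ... record draws ...
 *    tu_autotune_end_renderpass(cmd, cs, result);
 */
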
#endif /* TU_AUTOTUNE_H */