• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #ifndef IR3_COMPILER_H_
28 #define IR3_COMPILER_H_
29 
30 #include "compiler/nir/nir.h"
31 #include "util/disk_cache.h"
32 #include "util/log.h"
33 #include "util/perf/cpu_trace.h"
34 
35 #include "freedreno_dev_info.h"
36 
37 #include "ir3.h"
38 
39 BEGINC;
40 
41 struct ir3_ra_reg_set;
42 struct ir3_shader;
43 
/* Configuration options for things handled differently by turnip vs
 * gallium; filled in by the driver at compiler-creation time.
 */
struct ir3_compiler_options {
   /* If true, UBO/SSBO accesses are assumed to be bounds-checked as defined by
    * VK_EXT_robustness2 and optimizations may have to be more conservative.
    */
   bool robust_buffer_access2;

   /* If true, promote UBOs (except for constant data) to constants using ldc.k
    * in the preamble. The driver should ignore everything in ubo_state except
    * for the constant data UBO, which is excluded because the command pushing
    * constants for it can be pre-baked when compiling the shader.
    */
   bool push_ubo_with_preamble;

   /* If true, disable the shader cache. The driver is then responsible for
    * caching.
    */
   bool disable_cache;

   /* If >= 0, this specifies the bindless descriptor set + descriptor to use
    * for txf_ms_fb
    */
   int bindless_fb_read_descriptor;
   int bindless_fb_read_slot;

   /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
   bool storage_16bit;

   /* If base_vertex should be lowered in nir */
   bool lower_base_vertex;

   /* NOTE(review): presumably selects the shared (cross-stage) push-consts
    * layout — consumers not visible in this header; confirm in driver.
    */
   bool shared_push_consts;
};
76 
/* Per-device compiler context: device identity, generation-specific
 * hardware parameters, and driver-supplied configuration.  Created via
 * ir3_compiler_create() and freed with ir3_compiler_destroy().
 */
struct ir3_compiler {
   /* Device this compiler was created for (owned by the driver). */
   struct fd_device *dev;

   /* GPU identity used to derive the generation-specific fields below. */
   const struct fd_dev_id *dev_id;

   /* Major GPU generation (e.g. 6 for a6xx). */
   uint8_t gen;

   /* Running count of shaders created with this compiler — NOTE(review):
    * presumably used to assign unique shader ids; confirm in ir3_shader.
    */
   uint32_t shader_count;

   /* On-disk shader cache handle (see ir3_disk_cache_init()); may be
    * disabled via options.disable_cache.
    */
   struct disk_cache *disk_cache;

   /* NIR options handed to the frontend; see ir3_get_compiler_options(). */
   struct nir_shader_compiler_options nir_options;

   /*
    * Configuration options for things handled differently by turnip vs
    * gallium
    */
   struct ir3_compiler_options options;

   /*
    * Configuration options for things that are handled differently on
    * different generations:
    */

   /* True if GPU pointers are 64-bit (two 32-bit register slots); see
    * ir3_pointer_size().
    */
   bool is_64bit;

   /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
    * so we need to use ldlv.u32 to load the varying directly:
    */
   bool flat_bypass;

   /* on a3xx, we need to add one to # of array levels:
    */
   bool levels_add_one;

   /* on a3xx, we need to scale up integer coords for isaml based
    * on LoD:
    */
   bool unminify_coords;

   /* on a3xx do txf_ms w/ isaml and scaled coords: */
   bool txf_ms_with_isaml;

   /* on a4xx, for array textures we need to add 0.5 to the array
    * index coordinate:
    */
   bool array_index_add_half;

   /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
    */
   bool samgq_workaround;

   /* on a650, vertex shader <-> tess control io uses LDL/STL */
   bool tess_use_shared;

   /* The maximum number of constants, in vec4's, across the entire graphics
    * pipeline.
    */
   uint16_t max_const_pipeline;

   /* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
   uint16_t max_const_geom;

   /* The maximum number of constants, in vec4's, for FS. */
   uint16_t max_const_frag;

   /* A "safe" max constlen that can be applied to each shader in the
    * pipeline which we guarantee will never exceed any combined limits.
    */
   uint16_t max_const_safe;

   /* The maximum number of constants, in vec4's, for compute shaders. */
   uint16_t max_const_compute;

   /* Number of instructions that the shader's base address and length
    * (instrlen divides instruction count by this) must be aligned to.
    */
   uint32_t instr_align;

   /* on a3xx, the unit of indirect const load is higher than later gens (in
    * vec4 units):
    */
   uint32_t const_upload_unit;

   /* The base number of threads per wave. Some stages may be able to double
    * this.
    */
   uint32_t threadsize_base;

   /* On at least a6xx, waves are always launched in pairs. In calculations
    * about occupancy, we pretend that each wave pair is actually one wave,
    * which simplifies many of the calculations, but means we have to
    * multiply threadsize_base by this number.
    */
   uint32_t wave_granularity;

   /* The maximum number of simultaneous waves per core. */
   uint32_t max_waves;

   /* This is theoretical maximum number of vec4 registers that one wave of
    * the base threadsize could use. To get the actual size of the register
    * file in bytes one would need to compute:
    *
    * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
    *
    * However this number is more often what we actually need. For example, a
    * max_reg more than half of this will result in a doubled threadsize
    * being impossible (because double-sized waves take up twice as many
    * registers). Also, the formula for the occupancy given a particular
    * register footprint is simpler.
    *
    * It is in vec4 units because the register file is allocated
    * with vec4 granularity, so it's in the same units as max_reg.
    */
   uint32_t reg_size_vec4;

   /* The size of local memory in bytes */
   uint32_t local_mem_size;

   /* The number of total branch stack entries, divided by wave_granularity. */
   uint32_t branchstack_size;

   /* The byte increment of MEMSIZEPERITEM, the private memory per-fiber allocation. */
   uint32_t pvtmem_per_fiber_align;

   /* Whether clip+cull distances are supported */
   bool has_clip_cull;

   /* Whether private memory is supported */
   bool has_pvtmem;

   /* Whether SSBOs have descriptors for sampling with ISAM */
   bool has_isam_ssbo;

   /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
   bool storage_16bit;

   /* True if getfiberid, getlast.w8, brcst.active, and quad_shuffle
    * instructions are supported which are necessary to support
    * subgroup quad and arithmetic operations.
    */
   bool has_getfiberid;

   /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
   uint32_t max_variable_workgroup_size;

   /* Whether the dp2acc / dp4acc instructions are supported. */
   bool has_dp2acc;
   bool has_dp4acc;

   /* Type to use for 1b nir bools: */
   type_t bool_type;

   /* Whether compute invocation params are passed in via shared regfile or
    * constbuf. a5xx+ has the shared regfile.
    */
   bool has_shared_regfile;

   /* True if preamble instructions (shps, shpe, etc.) are supported */
   bool has_preamble;

   /* Where the shared consts start in constants file, in vec4's. */
   uint16_t shared_consts_base_offset;

   /* The size of shared consts for CS and FS(in vec4's).
    * Also the size that is actually used on geometry stages (on a6xx).
    */
   uint64_t shared_consts_size;

   /* Found on a6xx for geometry stages, that is different from
    * actually used shared consts.
    *
    * TODO: Keep an eye on this for next gens.
    */
   uint64_t geom_shared_consts_size_quirk;

   /* True if fragment-shader texture prefetch is supported — NOTE(review):
    * name-derived; prefetch logic not visible in this header.
    */
   bool has_fs_tex_prefetch;

   /* NOTE(review): hardware quirk flag around stsc — semantics not visible
    * in this header; confirm in the backend.
    */
   bool stsc_duplication_quirk;

   /* Whether shader consts / inline uniforms are loaded in the preamble
    * (via ldgk for inline uniforms) rather than pushed by the driver —
    * NOTE(review): inferred from names; confirm against preamble lowering.
    */
   bool load_shader_consts_via_preamble;
   bool load_inline_uniforms_via_preamble_ldgk;
};
256 
/* Destroys a compiler created with ir3_compiler_create(). */
void ir3_compiler_destroy(struct ir3_compiler *compiler);

/* Creates a compiler instance for the given device/generation with the
 * driver-supplied configuration options.
 */
struct ir3_compiler *ir3_compiler_create(struct fd_device *dev,
                                         const struct fd_dev_id *dev_id,
                                         const struct fd_dev_info *dev_info,
                                         const struct ir3_compiler_options *options);

/* Initializes compiler->disk_cache (see options.disable_cache). */
void ir3_disk_cache_init(struct ir3_compiler *compiler);

/* Computes the disk-cache key for the given shader. */
void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
                                    struct ir3_shader *shader);

/* Deserializes a shader variant from a blob; inverse of
 * ir3_store_variant().  The variant is allocated under mem_ctx.
 */
struct ir3_shader_variant *ir3_retrieve_variant(struct blob_reader *blob,
                                                struct ir3_compiler *compiler,
                                                void *mem_ctx);

/* Serializes a shader variant into a blob. */
void ir3_store_variant(struct blob *blob, const struct ir3_shader_variant *v);

/* Tries to fetch a previously-compiled variant from the disk cache;
 * NOTE(review): presumably returns true on a cache hit — confirm in
 * implementation.
 */
bool ir3_disk_cache_retrieve(struct ir3_shader *shader,
                             struct ir3_shader_variant *v);

/* Stores a compiled variant into the disk cache. */
void ir3_disk_cache_store(struct ir3_shader *shader,
                          struct ir3_shader_variant *v);

/* Returns the NIR options to use when producing NIR for this compiler. */
const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler);

/* Compiles the NIR of `shader` into the variant `so`; NOTE(review):
 * presumably returns 0 on success (inferred from int return type —
 * confirm in implementation).
 */
int ir3_compile_shader_nir(struct ir3_compiler *compiler,
                           struct ir3_shader *shader,
                           struct ir3_shader_variant *so);
281 
282 /* gpu pointer size in units of 32bit registers/slots */
283 static inline unsigned
ir3_pointer_size(struct ir3_compiler * compiler)284 ir3_pointer_size(struct ir3_compiler *compiler)
285 {
286    return compiler->is_64bit ? 2 : 1;
287 }
288 
/* Debug flags stored in the global ir3_shader_debug bitmask —
 * NOTE(review): presumably parsed from a debug environment variable by
 * the driver; parsing code not visible in this header.
 */
enum ir3_shader_debug {
   /* Per-stage debug enables (checked by shader_debug_enabled()): */
   IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
   IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
   IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
   IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
   IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
   IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
   /* Enables debug for all stages (see shader_debug_enabled()): */
   IR3_DBG_DISASM = BITFIELD_BIT(6),
   /* Log and dump IR between optimization passes (see ir3_debug_print()): */
   IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
   IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
   IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
   IR3_DBG_NOFP16 = BITFIELD_BIT(10),
   IR3_DBG_NOCACHE = BITFIELD_BIT(11),
   IR3_DBG_SPILLALL = BITFIELD_BIT(12),
   IR3_DBG_NOPREAMBLE = BITFIELD_BIT(13),
   /* Enables debug for internal (driver-blob) shaders: */
   IR3_DBG_SHADER_INTERNAL = BITFIELD_BIT(14),
   IR3_DBG_FULLSYNC = BITFIELD_BIT(15),
   IR3_DBG_FULLNOP = BITFIELD_BIT(16),

   /* DEBUG-only options: */
   IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
   IR3_DBG_RAMSGS = BITFIELD_BIT(21),

   /* Only used for the disk-caching logic: */
   IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
};
315 
/* Global debug flag bitmask consulted by the inline helpers below. */
extern enum ir3_shader_debug ir3_shader_debug;
/* Optional override path — NOTE(review): presumably a directory to load
 * replacement shader binaries from; consumers not visible in this header.
 */
extern const char *ir3_shader_override_path;
318 
319 static inline bool
shader_debug_enabled(gl_shader_stage type,bool internal)320 shader_debug_enabled(gl_shader_stage type, bool internal)
321 {
322    if (internal)
323       return !!(ir3_shader_debug & IR3_DBG_SHADER_INTERNAL);
324 
325    if (ir3_shader_debug & IR3_DBG_DISASM)
326       return true;
327 
328    switch (type) {
329    case MESA_SHADER_VERTEX:
330       return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
331    case MESA_SHADER_TESS_CTRL:
332       return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
333    case MESA_SHADER_TESS_EVAL:
334       return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
335    case MESA_SHADER_GEOMETRY:
336       return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
337    case MESA_SHADER_FRAGMENT:
338       return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
339    case MESA_SHADER_COMPUTE:
340    case MESA_SHADER_KERNEL:
341       return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
342    default:
343       assert(0);
344       return false;
345    }
346 }
347 
348 static inline void
ir3_debug_print(struct ir3 * ir,const char * when)349 ir3_debug_print(struct ir3 *ir, const char *when)
350 {
351    if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
352       mesa_logi("%s:", when);
353       ir3_print(ir);
354    }
355 }
356 
357 ENDC;
358 
359 #endif /* IR3_COMPILER_H_ */
360