1 /*
2 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #ifndef IR3_COMPILER_H_
28 #define IR3_COMPILER_H_
29
30 #include "compiler/nir/nir.h"
31 #include "util/disk_cache.h"
32 #include "util/log.h"
33 #include "util/perf/cpu_trace.h"
34
35 #include "freedreno_dev_info.h"
36
37 #include "ir3.h"
38
39 BEGINC;
40
41 struct ir3_ra_reg_set;
42 struct ir3_shader;
43
/* Compiler configuration supplied by the driver at ir3_compiler_create()
 * time, for behavior that differs between turnip and gallium rather than
 * between GPU generations (see struct ir3_compiler::options).
 */
struct ir3_compiler_options {
   /* If true, UBO/SSBO accesses are assumed to be bounds-checked as defined by
    * VK_EXT_robustness2 and optimizations may have to be more conservative.
    */
   bool robust_buffer_access2;

   /* If true, promote UBOs (except for constant data) to constants using ldc.k
    * in the preamble. The driver should ignore everything in ubo_state except
    * for the constant data UBO, which is excluded because the command pushing
    * constants for it can be pre-baked when compiling the shader.
    */
   bool push_ubo_with_preamble;

   /* If true, disable the shader cache. The driver is then responsible for
    * caching.
    */
   bool disable_cache;

   /* If >= 0, this specifies the bindless descriptor set + descriptor to use
    * for txf_ms_fb
    */
   int bindless_fb_read_descriptor;
   int bindless_fb_read_slot;

   /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
   bool storage_16bit;

   /* If base_vertex should be lowered in nir */
   bool lower_base_vertex;

   /* NOTE(review): presumably opts into the shared push-constant layout
    * described by shared_consts_base_offset/shared_consts_size in
    * struct ir3_compiler — confirm against the drivers that set this.
    */
   bool shared_push_consts;
};
76
/* Per-device compiler context: device identity, driver-selected options, and
 * generation-specific hardware parameters/workarounds queried throughout the
 * ir3 backend.
 */
struct ir3_compiler {
   /* Device handle and identification used to create this compiler. */
   struct fd_device *dev;
   const struct fd_dev_id *dev_id;
   /* Major GPU generation (e.g. 3 for a3xx, 6 for a6xx). */
   uint8_t gen;
   /* NOTE(review): appears to be a running count of shaders created with
    * this compiler (likely used for unique shader ids) — confirm at the
    * site that increments it.
    */
   uint32_t shader_count;

   /* On-disk shader cache; see ir3_disk_cache_init()/retrieve()/store(). */
   struct disk_cache *disk_cache;

   /* NIR options handed back to state trackers via
    * ir3_get_compiler_options().
    */
   struct nir_shader_compiler_options nir_options;

   /*
    * Configuration options for things handled differently by turnip vs
    * gallium
    */
   struct ir3_compiler_options options;

   /*
    * Configuration options for things that are handled differently on
    * different generations:
    */

   /* True if GPU pointers are 64-bit (see ir3_pointer_size()). */
   bool is_64bit;

   /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
    * so we need to use ldlv.u32 to load the varying directly:
    */
   bool flat_bypass;

   /* on a3xx, we need to add one to # of array levels:
    */
   bool levels_add_one;

   /* on a3xx, we need to scale up integer coords for isaml based
    * on LoD:
    */
   bool unminify_coords;

   /* on a3xx do txf_ms w/ isaml and scaled coords: */
   bool txf_ms_with_isaml;

   /* on a4xx, for array textures we need to add 0.5 to the array
    * index coordinate:
    */
   bool array_index_add_half;

   /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
    */
   bool samgq_workaround;

   /* on a650, vertex shader <-> tess control io uses LDL/STL */
   bool tess_use_shared;

   /* The maximum number of constants, in vec4's, across the entire graphics
    * pipeline.
    */
   uint16_t max_const_pipeline;

   /* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
   uint16_t max_const_geom;

   /* The maximum number of constants, in vec4's, for FS. */
   uint16_t max_const_frag;

   /* A "safe" max constlen that can be applied to each shader in the
    * pipeline which we guarantee will never exceed any combined limits.
    */
   uint16_t max_const_safe;

   /* The maximum number of constants, in vec4's, for compute shaders. */
   uint16_t max_const_compute;

   /* Number of instructions that the shader's base address and length
    * (instrlen divides instruction count by this) must be aligned to.
    */
   uint32_t instr_align;

   /* on a3xx, the unit of indirect const load is higher than later gens (in
    * vec4 units):
    */
   uint32_t const_upload_unit;

   /* The base number of threads per wave. Some stages may be able to double
    * this.
    */
   uint32_t threadsize_base;

   /* On at least a6xx, waves are always launched in pairs. In calculations
    * about occupancy, we pretend that each wave pair is actually one wave,
    * which simplifies many of the calculations, but means we have to
    * multiply threadsize_base by this number.
    */
   uint32_t wave_granularity;

   /* The maximum number of simultaneous waves per core. */
   uint32_t max_waves;

   /* This is theoretical maximum number of vec4 registers that one wave of
    * the base threadsize could use. To get the actual size of the register
    * file in bytes one would need to compute:
    *
    * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
    *
    * However this number is more often what we actually need. For example, a
    * max_reg more than half of this will result in a doubled threadsize
    * being impossible (because double-sized waves take up twice as many
    * registers). Also, the formula for the occupancy given a particular
    * register footprint is simpler.
    *
    * It is in vec4 units because the register file is allocated
    * with vec4 granularity, so it's in the same units as max_reg.
    */
   uint32_t reg_size_vec4;

   /* The size of local memory in bytes */
   uint32_t local_mem_size;

   /* The number of total branch stack entries, divided by wave_granularity. */
   uint32_t branchstack_size;

   /* The byte increment of MEMSIZEPERITEM, the private memory per-fiber allocation. */
   uint32_t pvtmem_per_fiber_align;

   /* Whether clip+cull distances are supported */
   bool has_clip_cull;

   /* Whether private memory is supported */
   bool has_pvtmem;

   /* Whether SSBOs have descriptors for sampling with ISAM */
   bool has_isam_ssbo;

   /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
   bool storage_16bit;

   /* True if getfiberid, getlast.w8, brcst.active, and quad_shuffle
    * instructions are supported which are necessary to support
    * subgroup quad and arithmetic operations.
    */
   bool has_getfiberid;

   /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
   uint32_t max_variable_workgroup_size;

   /* Whether the dp2acc / dp4acc dot-product-accumulate instructions are
    * available.
    */
   bool has_dp2acc;
   bool has_dp4acc;

   /* Type to use for 1b nir bools: */
   type_t bool_type;

   /* Whether compute invocation params are passed in via shared regfile or
    * constbuf. a5xx+ has the shared regfile.
    */
   bool has_shared_regfile;

   /* True if preamble instructions (shps, shpe, etc.) are supported */
   bool has_preamble;

   /* Where the shared consts start in constants file, in vec4's. */
   uint16_t shared_consts_base_offset;

   /* The size of shared consts for CS and FS(in vec4's).
    * Also the size that is actually used on geometry stages (on a6xx).
    */
   uint64_t shared_consts_size;

   /* Found on a6xx for geometry stages, that is different from
    * actually used shared consts.
    *
    * TODO: Keep an eye on this for next gens.
    */
   uint64_t geom_shared_consts_size_quirk;

   /* Whether the fragment-shader texture prefetch mechanism is supported. */
   bool has_fs_tex_prefetch;

   /* NOTE(review): hardware quirk flag; presumably requires duplicating
    * stsc instructions — confirm where it is consumed.
    */
   bool stsc_duplication_quirk;

   /* Whether driver-internal shader consts / inline uniforms are loaded in
    * the preamble (the latter via ldg.k) instead of being pushed directly.
    */
   bool load_shader_consts_via_preamble;
   bool load_inline_uniforms_via_preamble_ldgk;
};
256
/* Destroy a compiler created with ir3_compiler_create(), releasing all
 * resources it owns.
 */
void ir3_compiler_destroy(struct ir3_compiler *compiler);

/* Create a per-device compiler context. 'options' selects driver-specific
 * (turnip vs gallium) behavior; generation-specific parameters are derived
 * from 'dev_info'.
 */
struct ir3_compiler *ir3_compiler_create(struct fd_device *dev,
                                         const struct fd_dev_id *dev_id,
                                         const struct fd_dev_info *dev_info,
                                         const struct ir3_compiler_options *options);

/* Disk-cache plumbing (no-ops when options.disable_cache is set by the
 * driver; see struct ir3_compiler_options).
 */
void ir3_disk_cache_init(struct ir3_compiler *compiler);
void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
                                    struct ir3_shader *shader);
/* Deserialize a shader variant from 'blob', allocated out of 'mem_ctx'. */
struct ir3_shader_variant *ir3_retrieve_variant(struct blob_reader *blob,
                                                struct ir3_compiler *compiler,
                                                void *mem_ctx);
/* Serialize a shader variant into 'blob'. */
void ir3_store_variant(struct blob *blob, const struct ir3_shader_variant *v);
bool ir3_disk_cache_retrieve(struct ir3_shader *shader,
                             struct ir3_shader_variant *v);
void ir3_disk_cache_store(struct ir3_shader *shader,
                          struct ir3_shader_variant *v);

/* Return the NIR options state trackers should compile shaders with. */
const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler);

/* Compile the NIR in 'shader' into the variant 'so'. Returns 0 on success. */
int ir3_compile_shader_nir(struct ir3_compiler *compiler,
                           struct ir3_shader *shader,
                           struct ir3_shader_variant *so);
281
282 /* gpu pointer size in units of 32bit registers/slots */
283 static inline unsigned
ir3_pointer_size(struct ir3_compiler * compiler)284 ir3_pointer_size(struct ir3_compiler *compiler)
285 {
286 return compiler->is_64bit ? 2 : 1;
287 }
288
/* Bitmask of debug flags, normally parsed from an environment debug option
 * into the global ir3_shader_debug (see shader_debug_enabled() and
 * ir3_debug_print() below).
 */
enum ir3_shader_debug {
   /* Per-stage flags enabling debug output for that shader stage: */
   IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
   IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
   IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
   IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
   IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
   IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
   /* Enables debug output for all (non-internal) stages: */
   IR3_DBG_DISASM = BITFIELD_BIT(6),
   /* Print the IR after optimization passes (see ir3_debug_print()): */
   IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
   IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
   IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
   IR3_DBG_NOFP16 = BITFIELD_BIT(10),
   IR3_DBG_NOCACHE = BITFIELD_BIT(11),
   IR3_DBG_SPILLALL = BITFIELD_BIT(12),
   IR3_DBG_NOPREAMBLE = BITFIELD_BIT(13),
   /* Enables debug output for driver-internal shaders: */
   IR3_DBG_SHADER_INTERNAL = BITFIELD_BIT(14),
   IR3_DBG_FULLSYNC = BITFIELD_BIT(15),
   IR3_DBG_FULLNOP = BITFIELD_BIT(16),

   /* DEBUG-only options: */
   IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
   IR3_DBG_RAMSGS = BITFIELD_BIT(21),

   /* Only used for the disk-caching logic: */
   IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
};
315
316 extern enum ir3_shader_debug ir3_shader_debug;
317 extern const char *ir3_shader_override_path;
318
319 static inline bool
shader_debug_enabled(gl_shader_stage type,bool internal)320 shader_debug_enabled(gl_shader_stage type, bool internal)
321 {
322 if (internal)
323 return !!(ir3_shader_debug & IR3_DBG_SHADER_INTERNAL);
324
325 if (ir3_shader_debug & IR3_DBG_DISASM)
326 return true;
327
328 switch (type) {
329 case MESA_SHADER_VERTEX:
330 return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
331 case MESA_SHADER_TESS_CTRL:
332 return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
333 case MESA_SHADER_TESS_EVAL:
334 return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
335 case MESA_SHADER_GEOMETRY:
336 return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
337 case MESA_SHADER_FRAGMENT:
338 return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
339 case MESA_SHADER_COMPUTE:
340 case MESA_SHADER_KERNEL:
341 return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
342 default:
343 assert(0);
344 return false;
345 }
346 }
347
348 static inline void
ir3_debug_print(struct ir3 * ir,const char * when)349 ir3_debug_print(struct ir3 *ir, const char *when)
350 {
351 if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
352 mesa_logi("%s:", when);
353 ir3_print(ir);
354 }
355 }
356
357 ENDC;
358
359 #endif /* IR3_COMPILER_H_ */
360