/*
 * Copyright © 2015 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
   {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"},
   {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
   {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
   {"internal", IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
   {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
   {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"},
   {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"},
   {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"},
   {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
   {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
   {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
   {"fullsync", IR3_DBG_FULLSYNC, "Add (sy) + (ss) after each cat5/cat6"},
   {"fullnop", IR3_DBG_FULLNOP, "Add nops before each instruction"},
   {"noearlypreamble", IR3_DBG_NOEARLYPREAMBLE, "Disable early preambles"},
   {"nodescprefetch", IR3_DBG_NODESCPREFETCH, "Disable descriptor prefetch optimization"},
   {"expandrpt", IR3_DBG_EXPANDRPT, "Expand rptN instructions"},
#if MESA_DEBUG
   /* MESA_DEBUG-only options: */
   {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
   {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)
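
/* Illustrative example (the application name is just a stand-in): the flags
 * above are parsed from the environment as a comma-separated list, so dumping
 * fragment shader disassembly with the shader cache disabled looks like:
 *
 *    IR3_SHADER_DEBUG=fs,disasm,nocache ./my_app
 */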

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

static const nir_shader_compiler_options ir3_base_options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert = true,
   .lower_bitfield_extract = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_icsel_eqz32 = true,
   .has_icsel_eqz16 = true,
   .has_fsub = true,
   .has_isub = true,
   .force_indirect_unrolling_sampler = true,
   .lower_uniforms_to_ubo = true,
   .max_unroll_iterations = 32,

   .lower_cs_local_index_to_id = true,
   .lower_wpos_pntc = true,

   .lower_hadd = true,
   .lower_hadd64 = true,
   .lower_fisnormal = true,

   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_doubles_options = (nir_lower_doubles_options)~0,

   .divergence_analysis_options = nir_divergence_uniform_load_tears,
   .scalarize_ddx = true,

   .per_view_unique_driver_locations = true,
   .compact_view_index = true,
};
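
/* These are the generation-independent defaults: ir3_compiler_create() below
 * starts from this baseline and then applies device-specific overrides.
 */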

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct fd_dev_info *dev_info,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      __normal_user() ? debug_get_option_ir3_shader_override_path() : NULL;

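   /* Overridden shaders are loaded from disk, so force-disable the shader
    * cache to keep previously cached binaries from shadowing the overrides:
    */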
   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->is_64bit = fd_dev_64b(dev_id);
   compiler->options = *options;

   /* TODO see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = dev_info->wave_granularity;
   compiler->max_waves = dev_info->max_waves;

   compiler->max_variable_workgroup_size = 1024;

   compiler->local_mem_size = dev_info->cs_shared_mem_size;

   compiler->num_predicates = 1;
   compiler->bitops_can_write_predicates = false;
   compiler->has_branch_and_or = false;
   compiler->has_rpt_bary_f = false;

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything else,
       * with separate limits. There also seems to be a shared limit, which
       * is higher than the vert or frag limits.
       *
       * Also, as observed on a630/a650/a660, max_const_pipeline has to be
       * 512 when all geometry stages are present, otherwise the GPU hangs.
       * Accordingly, the maximum safe size for each stage should stay under
       * max_const_pipeline / 5 (stages), with the 4-vec4 alignment of const
       * files taken into account.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to be different on different models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;
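
      /* A worked instance of the rule above (a sketch of the reasoning, not
       * a documented hardware limit): 512 / 5 stages = 102.4 vec4s per
       * stage, which rounds down to 100 under the 4-vec4 const file
       * alignment, matching the max_const_safe value used here.
       */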

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own, smaller file. On a7xx the size was doubled, although
       * this doesn't work on X1-85.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute =
         (compiler->gen >= 7 && !dev_info->a7xx.compute_constlen_quirk) ? 512 : 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
      compiler->has_compliant_dp4acc = dev_info->a7xx.has_compliant_dp4acc;

      if (compiler->gen == 6 && options->shared_push_consts) {
         compiler->shared_consts_base_offset = 504;
         compiler->shared_consts_size = 8;
         compiler->geom_shared_consts_size_quirk = 16;
      } else {
         compiler->shared_consts_base_offset = -1;
         compiler->shared_consts_size = 0;
         compiler->geom_shared_consts_size_quirk = 0;
      }

      compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
      compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
      compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
      compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
      compiler->num_predicates = 4;
      compiler->bitops_can_write_predicates = true;
      compiler->has_branch_and_or = true;
      compiler->has_predication = true;
      compiler->predtf_nop_quirk = dev_info->a6xx.predtf_nop_quirk;
      compiler->prede_nop_quirk = dev_info->a6xx.prede_nop_quirk;
      compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
      compiler->has_ssbo_imm_offsets = dev_info->a6xx.has_ssbo_imm_offsets;
      compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
      compiler->has_early_preamble = dev_info->a6xx.has_early_preamble;
      compiler->has_rpt_bary_f = true;
      compiler->has_shfl = true;
      compiler->reading_shading_rate_requires_smask_quirk =
         dev_info->a7xx.reading_shading_rate_requires_smask_quirk;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;

      compiler->has_scalar_alu = false;
      compiler->has_isam_v = false;
      compiler->has_ssbo_imm_offsets = false;
      compiler->has_early_preamble = false;
   }

   /* This is just a guess for a4xx. */
   compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
   /* TODO: implement private memory on earlier gens */
   compiler->has_pvtmem = compiler->gen >= 5;

   compiler->has_isam_ssbo = compiler->gen >= 6;

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   compiler->threadsize_base = dev_info->threadsize_base;

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;
   compiler->has_bitwise_triops = compiler->gen >= 5;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   /* Set up the NIR shader compiler options, applying device-specific
    * overrides to our base settings.
    */
   compiler->nir_options = ir3_base_options;
   compiler->nir_options.has_iadd3 = dev_info->a6xx.has_sad;

   if (compiler->gen >= 6) {
      compiler->nir_options.vectorize_io = true;
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
      compiler->nir_options.lower_device_index_to_zero = true;

      if (dev_info->a6xx.has_dp2acc || dev_info->a6xx.has_dp4acc) {
         compiler->nir_options.has_udot_4x8 =
            compiler->nir_options.has_udot_4x8_sat = true;
         compiler->nir_options.has_sudot_4x8 =
            compiler->nir_options.has_sudot_4x8_sat = true;
      }

      if (dev_info->a6xx.has_dp4acc && dev_info->a7xx.has_compliant_dp4acc) {
         compiler->nir_options.has_sdot_4x8 =
            compiler->nir_options.has_sdot_4x8_sat = true;
      }
   } else if (compiler->gen >= 3 && compiler->gen <= 5) {
      compiler->nir_options.vertex_id_zero_based = true;
   } else if (compiler->gen <= 2) {
      /* the a2xx backend doesn't handle indirect addressing: */
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   if (options->lower_base_vertex) {
      compiler->nir_options.lower_base_vertex = true;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of 16-bit
    * operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   compiler->nir_options.support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);
   compiler->nir_options.support_indirect_outputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}
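
/* Usage sketch (illustrative only; the real callers live in the gallium
 * driver and in turnip): a driver creates one compiler per device, hands the
 * NIR options to its frontend, and destroys the compiler on teardown:
 *
 *    struct ir3_compiler_options opts = {
 *       .disable_cache = false,
 *    };
 *    struct ir3_compiler *c =
 *       ir3_compiler_create(dev, dev_id, dev_info, &opts);
 *    const nir_shader_compiler_options *nir_opts =
 *       ir3_get_compiler_options(c);
 *    ...
 *    ir3_compiler_destroy(c);
 */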