/*
 * Copyright © 2015 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

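/* Flag names accepted in the IR3_SHADER_DEBUG environment variable, parsed
 * by the debug_get_option helper generated below:
 */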
static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
   {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"},
   {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
   {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
   {"internal", IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
   {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
   {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"},
   {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"},
   {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"},
   {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
   {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
   {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
   {"fullsync", IR3_DBG_FULLSYNC, "Add (sy) + (ss) after each cat5/cat6"},
   {"fullnop", IR3_DBG_FULLNOP, "Add nops before each instruction"},
   {"noearlypreamble", IR3_DBG_NOEARLYPREAMBLE, "Disable early preambles"},
   {"nodescprefetch", IR3_DBG_NODESCPREFETCH, "Disable descriptor prefetch optimization"},
   {"expandrpt", IR3_DBG_EXPANDRPT, "Expand rptN instructions"},
   {"noaliastex", IR3_DBG_NOALIASTEX, "Don't use alias.tex"},
   {"noaliasrt", IR3_DBG_NOALIASRT, "Don't use alias.rt"},
#if MESA_DEBUG
   /* MESA_DEBUG-only options: */
   {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
   {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

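/* The environment is read once on first use; flags are given as a
 * comma-separated list of the names above, e.g. IR3_SHADER_DEBUG=fs,disasm.
 */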
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

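/* Baseline NIR compiler options shared by all generations; these are copied
 * into the compiler in ir3_compiler_create() and then adjusted with
 * device-specific overrides.
 */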
static const nir_shader_compiler_options ir3_base_options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert = true,
   .lower_bitfield_extract = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_icsel_eqz32 = true,
   .has_icsel_eqz16 = true,
   .has_fsub = true,
   .has_isub = true,
   .force_indirect_unrolling_sampler = true,
   .lower_uniforms_to_ubo = true,
   .max_unroll_iterations = 32,

   .lower_cs_local_index_to_id = true,
   .lower_wpos_pntc = true,

   .lower_hadd = true,
   .lower_hadd64 = true,
   .lower_fisnormal = true,

   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_doubles_options = (nir_lower_doubles_options)~0,

   .divergence_analysis_options = nir_divergence_uniform_load_tears,
   .scalarize_ddx = true,

   .per_view_unique_driver_locations = true,
   .compact_view_index = true,
};

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct fd_dev_info *dev_info,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      __normal_user() ? debug_get_option_ir3_shader_override_path() : NULL;

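   /* When shaders are being overridden from disk, force the shader cache
    * off so that previously-cached binaries don't mask the overrides:
    */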
   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->is_64bit = fd_dev_64b(dev_id);
   compiler->options = *options;

   /* TODO: see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = dev_info->wave_granularity;
   compiler->max_waves = dev_info->max_waves;

   compiler->max_variable_workgroup_size = 1024;

   compiler->local_mem_size = dev_info->cs_shared_mem_size;

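   /* Conservative defaults; the gen-specific blocks below override these
    * where the hardware is more capable (e.g. four predicate registers on
    * a6xx+, shorter delay slots on a7xx+):
    */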
   compiler->num_predicates = 1;
   compiler->bitops_can_write_predicates = false;
   compiler->has_branch_and_or = false;
   compiler->has_rpt_bary_f = false;
   compiler->has_alias_tex = false;
   compiler->delay_slots.alu_to_alu = 3;
   compiler->delay_slots.non_alu = 6;
   compiler->delay_slots.cat3_src2_read = 2;

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything else,
       * and separate limits. There seems to be a shared limit, but it's
       * higher than the vert or frag limits.
       *
       * Also, based on observations on a630/a650/a660, max_const_pipeline
       * has to be 512 when all geometry stages are present, otherwise the
       * GPU hangs. Accordingly, the maximum safe size for each stage should
       * stay under (max_const_pipeline / 5 stages), taking the 4-vec4
       * alignment of const files into account.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to be different on different models.
       */
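      /* For example: 512 / 5 = 102.4 vec4s per stage, and rounding down to
       * the 4-vec4 granularity gives the max_const_safe value of 100 below.
       */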
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one. On a7xx the
       * size was doubled, although this doesn't work on X1-85.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute =
         (compiler->gen >= 7 && !dev_info->a7xx.compute_constlen_quirk) ? 512 : 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
      compiler->has_compliant_dp4acc = dev_info->a7xx.has_compliant_dp4acc;

      if (compiler->gen == 6 && options->shared_push_consts) {
         compiler->shared_consts_base_offset = 504;
         compiler->shared_consts_size = 8;
         compiler->geom_shared_consts_size_quirk = 16;
      } else {
         compiler->shared_consts_base_offset = -1;
         compiler->shared_consts_size = 0;
         compiler->geom_shared_consts_size_quirk = 0;
      }

      compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
      compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
      compiler->load_shader_consts_via_preamble =
         dev_info->a7xx.load_shader_consts_via_preamble;
      compiler->load_inline_uniforms_via_preamble_ldgk =
         dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
      compiler->num_predicates = 4;
      compiler->bitops_can_write_predicates = true;
      compiler->has_branch_and_or = true;
      compiler->has_predication = true;
      compiler->predtf_nop_quirk = dev_info->a6xx.predtf_nop_quirk;
      compiler->prede_nop_quirk = dev_info->a6xx.prede_nop_quirk;
      compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
      compiler->has_ssbo_imm_offsets = dev_info->a6xx.has_ssbo_imm_offsets;
      compiler->fs_must_have_non_zero_constlen_quirk =
         dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
      compiler->has_early_preamble = dev_info->a6xx.has_early_preamble;
      compiler->has_rpt_bary_f = true;
      compiler->has_shfl = true;
      compiler->reading_shading_rate_requires_smask_quirk =
         dev_info->a7xx.reading_shading_rate_requires_smask_quirk;
      compiler->has_alias_rt = dev_info->a7xx.has_alias_rt;

      if (compiler->gen >= 7) {
         compiler->has_alias_tex = true;
         compiler->delay_slots.alu_to_alu = 2;
         compiler->delay_slots.non_alu = 5;
         compiler->delay_slots.cat3_src2_read = 1;
      }
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;

      compiler->has_scalar_alu = false;
      compiler->has_isam_v = false;
      compiler->has_ssbo_imm_offsets = false;
      compiler->has_early_preamble = false;
   }

   /* This is just a guess for a4xx. */
   compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
   /* TODO: implement private memory on earlier gens */
   compiler->has_pvtmem = compiler->gen >= 5;

   compiler->has_isam_ssbo = compiler->gen >= 6;

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   compiler->threadsize_base = dev_info->threadsize_base;

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;
   compiler->has_bitwise_triops = compiler->gen >= 5;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   /* Set up nir shader compiler options, using device-specific overrides of
    * our base settings.
    */
   compiler->nir_options = ir3_base_options;
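   /* Note: has_sad gates the hw sad instruction, which is presumably what
    * iadd3 is implemented in terms of, hence the coupling here.
    */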
   compiler->nir_options.has_iadd3 = dev_info->a6xx.has_sad;

   if (compiler->gen >= 6) {
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
      compiler->nir_options.lower_device_index_to_zero = true;

      if (dev_info->a6xx.has_dp2acc || dev_info->a6xx.has_dp4acc) {
         compiler->nir_options.has_udot_4x8 =
            compiler->nir_options.has_udot_4x8_sat = true;
         compiler->nir_options.has_sudot_4x8 =
            compiler->nir_options.has_sudot_4x8_sat = true;
      }

      if (dev_info->a6xx.has_dp4acc && dev_info->a7xx.has_compliant_dp4acc) {
         compiler->nir_options.has_sdot_4x8 =
            compiler->nir_options.has_sdot_4x8_sat = true;
      }
   } else if (compiler->gen >= 3 && compiler->gen <= 5) {
      compiler->nir_options.vertex_id_zero_based = true;
   } else if (compiler->gen <= 2) {
      /* the a2xx compiler doesn't handle indirect addressing: */
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   if (options->lower_base_vertex) {
      compiler->nir_options.lower_base_vertex = true;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of 16-bit
    * operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   compiler->nir_options.support_indirect_inputs =
      (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);
   compiler->nir_options.support_indirect_outputs =
      (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}
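
/* Typical lifecycle, as a sketch (fd_device/fd_dev_id/fd_dev_info setup
 * elided):
 *
 *    struct ir3_compiler_options opts = { .disable_cache = false };
 *    struct ir3_compiler *c =
 *       ir3_compiler_create(dev, dev_id, dev_info, &opts);
 *    const nir_shader_compiler_options *nir_opts =
 *       ir3_get_compiler_options(c);
 *    ...compile shaders...
 *    ir3_compiler_destroy(c);
 */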