/*
 * Copyright © 2015 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
   {"internal",   IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
   {"fullsync",   IR3_DBG_FULLSYNC,   "Add (sy) + (ss) after each cat5/cat6"},
   {"fullnop",    IR3_DBG_FULLNOP,    "Add nops before each instruction"},
   {"noearlypreamble", IR3_DBG_NOEARLYPREAMBLE, "Disable early preambles"},
   {"nodescprefetch", IR3_DBG_NODESCPREFETCH, "Disable descriptor prefetch optimization"},
   {"expandrpt",  IR3_DBG_EXPANDRPT,  "Expand rptN instructions"},
   {"noaliastex", IR3_DBG_NOALIASTEX, "Don't use alias.tex"},
   {"noaliasrt",  IR3_DBG_NOALIASRT,  "Don't use alias.rt"},
#if MESA_DEBUG
   /* MESA_DEBUG-only options: */
   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

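/* One-time getters for the env vars; flags are comma-separated names from
 * the table above, e.g. IR3_SHADER_DEBUG=fs,disasm.
 */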
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

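/* Baseline NIR compiler options shared across all gens;
 * ir3_compiler_create() applies device-specific overrides on top of these.
 */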
static const nir_shader_compiler_options ir3_base_options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert = true,
   .lower_bitfield_extract = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_icsel_eqz32 = true,
   .has_icsel_eqz16 = true,
   .has_fsub = true,
   .has_isub = true,
   .force_indirect_unrolling_sampler = true,
   .lower_uniforms_to_ubo = true,
   .max_unroll_iterations = 32,

   .lower_cs_local_index_to_id = true,
   .lower_wpos_pntc = true,

   .lower_hadd = true,
   .lower_hadd64 = true,
   .lower_fisnormal = true,

   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_doubles_options = (nir_lower_doubles_options)~0,

   .divergence_analysis_options = nir_divergence_uniform_load_tears,
   .scalarize_ddx = true,

   .per_view_unique_driver_locations = true,
   .compact_view_index = true,
};

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct fd_dev_info *dev_info,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
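
   /* Don't honor the override path for privileged (setuid-style) processes,
    * where trusting user env vars would be unsafe.
    */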
   ir3_shader_override_path =
      __normal_user() ? debug_get_option_ir3_shader_override_path() : NULL;

   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->is_64bit = fd_dev_64b(dev_id);
   compiler->options = *options;

   /* TODO see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = dev_info->wave_granularity;
   compiler->max_waves = dev_info->max_waves;

   compiler->max_variable_workgroup_size = 1024;

   compiler->local_mem_size = dev_info->cs_shared_mem_size;

   compiler->num_predicates = 1;
   compiler->bitops_can_write_predicates = false;
   compiler->has_branch_and_or = false;
   compiler->has_rpt_bary_f = false;
   compiler->has_alias_tex = false;
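   /* Delay slots required between dependent instructions: */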
   compiler->delay_slots.alu_to_alu = 3;
   compiler->delay_slots.non_alu = 6;
   compiler->delay_slots.cat3_src2_read = 2;

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything else,
       * with separate limits. There seems to be a shared limit, but it's
       * higher than the vert or frag limits.
       *
       * Also, as observed on a630/a650/a660, max_const_pipeline has to be
       * 512 when all geometry stages are present, otherwise the GPU hangs.
       * Accordingly, the maximum safe size for each stage should stay under
       * max_const_pipeline / 5 (stages), with const files aligned to 4
       * vec4s. When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to be different on different models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
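      /* 512 / 5 stages = 102.4 vec4s, aligned down to a multiple of 4: */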
      compiler->max_const_safe = 100;

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one. On a7xx the
       * size was doubled, although this doesn't work on X1-85.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute =
         (compiler->gen >= 7 && !dev_info->a7xx.compute_constlen_quirk) ? 512 : 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
      compiler->has_compliant_dp4acc = dev_info->a7xx.has_compliant_dp4acc;

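      /* Shared push consts occupy the very top of the const file:
       * base 504 + size 8 == 512 vec4s, the max_const_pipeline limit above.
       */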
      if (compiler->gen == 6 && options->shared_push_consts) {
         compiler->shared_consts_base_offset = 504;
         compiler->shared_consts_size = 8;
         compiler->geom_shared_consts_size_quirk = 16;
      } else {
         compiler->shared_consts_base_offset = -1;
         compiler->shared_consts_size = 0;
         compiler->geom_shared_consts_size_quirk = 0;
      }

      compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
      compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
      compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
      compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
      compiler->num_predicates = 4;
      compiler->bitops_can_write_predicates = true;
      compiler->has_branch_and_or = true;
      compiler->has_predication = true;
      compiler->predtf_nop_quirk = dev_info->a6xx.predtf_nop_quirk;
      compiler->prede_nop_quirk = dev_info->a6xx.prede_nop_quirk;
      compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
      compiler->has_ssbo_imm_offsets = dev_info->a6xx.has_ssbo_imm_offsets;
      compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
      compiler->has_early_preamble = dev_info->a6xx.has_early_preamble;
      compiler->has_rpt_bary_f = true;
      compiler->has_shfl = true;
      compiler->reading_shading_rate_requires_smask_quirk =
         dev_info->a7xx.reading_shading_rate_requires_smask_quirk;
      compiler->has_alias_rt = dev_info->a7xx.has_alias_rt;

      if (compiler->gen >= 7) {
         compiler->has_alias_tex = true;
         compiler->delay_slots.alu_to_alu = 2;
         compiler->delay_slots.non_alu = 5;
         compiler->delay_slots.cat3_src2_read = 1;
      }
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;

      compiler->has_scalar_alu = false;
      compiler->has_isam_v = false;
      compiler->has_ssbo_imm_offsets = false;
      compiler->has_early_preamble = false;
   }

   /* This is just a guess for a4xx. */
   compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
   /* TODO: implement private memory on earlier gens */
   compiler->has_pvtmem = compiler->gen >= 5;

   compiler->has_isam_ssbo = compiler->gen >= 6;

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   compiler->threadsize_base = dev_info->threadsize_base;

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

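   /* a5xx and later have native 16-bit (half) registers, hence u16 bools: */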
   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;
   compiler->has_bitwise_triops = compiler->gen >= 5;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   /* Set up nir shader compiler options, using device-specific overrides
    * of our base settings.
    */
   compiler->nir_options = ir3_base_options;
   compiler->nir_options.has_iadd3 = dev_info->a6xx.has_sad;

   if (compiler->gen >= 6) {
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
      compiler->nir_options.lower_device_index_to_zero = true;

      if (dev_info->a6xx.has_dp2acc || dev_info->a6xx.has_dp4acc) {
         compiler->nir_options.has_udot_4x8 =
            compiler->nir_options.has_udot_4x8_sat = true;
         compiler->nir_options.has_sudot_4x8 =
            compiler->nir_options.has_sudot_4x8_sat = true;
      }

      if (dev_info->a6xx.has_dp4acc && dev_info->a7xx.has_compliant_dp4acc) {
         compiler->nir_options.has_sdot_4x8 =
            compiler->nir_options.has_sdot_4x8_sat = true;
      }
   } else if (compiler->gen >= 3 && compiler->gen <= 5) {
      compiler->nir_options.vertex_id_zero_based = true;
   } else if (compiler->gen <= 2) {
      /* a2xx compiler doesn't handle indirect: */
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   if (options->lower_base_vertex) {
      compiler->nir_options.lower_base_vertex = true;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of
    * 16-bit operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;
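
   /* Indirect indexing of shader inputs/outputs is allowed in every stage
    * (one bit per PIPE_SHADER_* stage):
    */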
   compiler->nir_options.support_indirect_inputs =
      (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);
   compiler->nir_options.support_indirect_outputs =
      (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES);

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}