/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/debug.h"
#include "util/u_math.h"

#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "ir3_shader.h"

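/* NIR compiler options used for everything before a6xx; a6xx and later use
 * options_a6xx below (see ir3_get_compiler_options()).
 */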
static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_rotate = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_from_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_uniforms_to_ubo = true,
   .use_scoped_barrier = true,
};

/* we don't want to lower vertex_id to _zero_based on newer gpus: */
static const nir_shader_compiler_options options_a6xx = {
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_rotate = true,
   .vectorize_io = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .max_unroll_iterations = 32,
   .force_indirect_unrolling = nir_var_all,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_from_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_uniforms_to_ubo = true,
   .lower_device_index_to_zero = true,
   .use_scoped_barrier = true,
};

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   if (compiler->gen >= 6)
      return &options_a6xx;
   return &options;
}

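/* Callback for nir_opt_load_store_vectorize() (used on UBO loads in
 * ir3_optimize_loop() below): only allow vectorizing 32-bit components, and
 * only when the combined access stays within a 16-byte (vec4) aligned window.
 */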
static bool
ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size, unsigned num_components,
                             nir_intrinsic_instr *low,
                             nir_intrinsic_instr *high, void *data)
{
   assert(bit_size >= 8);
   if (bit_size != 32)
      return false;
   unsigned byte_size = bit_size / 8;

   int size = num_components * byte_size;

   /* Don't care about alignment past vec4. */
   assert(util_is_power_of_two_nonzero(align_mul));
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;

   /* Our offset alignment should always be at least 4 bytes */
   if (align_mul < 4)
      return false;

   unsigned worst_start_offset = 16 - align_mul + align_offset;
   if (worst_start_offset + size > 16)
      return false;

   return true;
}
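/* OPT() runs a pass and evaluates to true if the pass reported progress;
 * OPT_V() just runs the pass, for lowerings that must happen unconditionally.
 */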
#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })

#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

void
ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
{
   bool progress;
   unsigned lower_flrp = (s->options->lower_flrp16 ? 16 : 0) |
                         (s->options->lower_flrp32 ? 32 : 0) |
                         (s->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_opt_dead_write_vars);
      progress |= OPT(s, nir_lower_alu_to_scalar, NULL, NULL);
      progress |= OPT(s, nir_lower_phis_to_scalar, false);

      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      static int gcm = -1;
      if (gcm == -1)
         gcm = env_var_as_unsigned("GCM", 0);
      if (gcm == 1)
         progress |= OPT(s, nir_opt_gcm, true);
      else if (gcm == 2)
         progress |= OPT(s, nir_opt_gcm, false);
      progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      /* NOTE: GS lowering inserts an output var with a varying slot that
       * is larger than VARYING_SLOT_MAX (i.e. GS_VERTEX_FLAGS_IR3),
       * which triggers asserts in nir_shader_gather_info().  To work
       * around that, skip lowering phi precision for GS.
       *
       * Calling nir_shader_gather_info() late also seems to cause
       * problems for tess lowering, so for now, since we only enable
       * fp16/int16 for frag and compute, skip phi precision lowering
       * for the other stages as well.
       */
      if ((s->info.stage == MESA_SHADER_FRAGMENT) ||
          (s->info.stage == MESA_SHADER_COMPUTE)) {
         progress |= OPT(s, nir_opt_phi_precision);
      }
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_lower_alu);
      progress |= OPT(s, nir_lower_pack);
      progress |= OPT(s, nir_opt_constant_folding);

      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ir3_nir_should_vectorize_mem,
         .robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
      };
      progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);

      if (lower_flrp != 0) {
         if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) {
            OPT(s, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         lower_flrp = 0;
      }

      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);
   } while (progress);
}
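/* Predicate for nir_lower_wrmasks(): split partial writemasks only for SSBO,
 * shared, global and scratch stores.
 */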
static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   switch (intr->intrinsic) {
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_global:
   case nir_intrinsic_store_scratch:
      return true;
   default:
      return false;
   }
}

static bool
ir3_nir_lower_ssbo_size_filter(const nir_instr *instr, const void *data)
{
   return instr->type == nir_instr_type_intrinsic &&
          nir_instr_as_intrinsic(instr)->intrinsic ==
             nir_intrinsic_get_ssbo_size;
}

static nir_ssa_def *
ir3_nir_lower_ssbo_size_instr(nir_builder *b, nir_instr *instr, void *data)
{
   uint8_t ssbo_size_to_bytes_shift = *(uint8_t *) data;
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   return nir_ishl(b, &intr->dest.ssa, nir_imm_int(b, ssbo_size_to_bytes_shift));
}

/**
 * The resinfo opcode we have for getting the SSBO size on a6xx returns a byte
 * length divided by IBO_0_FMT, while the NIR intrinsic coming in is a number of
 * bytes. Switch things so the NIR intrinsic in our backend means dwords.
 */
static bool
ir3_nir_lower_ssbo_size(nir_shader *s, bool storage_16bit)
{
   uint8_t ssbo_size_to_bytes_shift = storage_16bit ? 1 : 2;
   return nir_shader_lower_instructions(s, ir3_nir_lower_ssbo_size_filter,
                                        ir3_nir_lower_ssbo_size_instr,
                                        &ssbo_size_to_bytes_shift);
}

void
ir3_nir_lower_io_to_temporaries(nir_shader *s)
{
   /* Outputs consumed by the VPC, VS inputs, and FS outputs are all handled
    * by the hardware pre-loading registers at the beginning and then reading
    * them at the end, so we can't access them indirectly except through
    * normal register-indirect accesses, and therefore ir3 doesn't support
    * indirect accesses on those. Other i/o is lowered in ir3_nir_lower_tess,
    * and indirects work just fine for those. GS outputs may be consumed by
    * VPC, but have their own lowering in ir3_nir_lower_gs() which does
    * something similar to nir_lower_io_to_temporaries so we shouldn't need
    * to lower them.
    *
    * Note: this might be a little inefficient for VS or TES outputs when the
    * next stage isn't an FS, but it probably doesn't make sense to depend on
    * the next stage before variant creation.
    *
    * TODO: for gallium, mesa/st also does some redundant lowering, including
    * running this pass for GS inputs/outputs which we don't want, but not
    * including TES outputs or FS inputs which we do need. We should probably
    * stop doing that once we're sure all drivers are doing their own
    * indirect i/o lowering.
    */
   bool lower_input = s->info.stage == MESA_SHADER_VERTEX ||
                      s->info.stage == MESA_SHADER_FRAGMENT;
   bool lower_output = s->info.stage != MESA_SHADER_TESS_CTRL &&
                       s->info.stage != MESA_SHADER_GEOMETRY;
   if (lower_input || lower_output) {
      NIR_PASS_V(s, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(s),
                 lower_output, lower_input);

      /* nir_lower_io_to_temporaries() creates global variables and copy
       * instructions which need to be cleaned up.
       */
      NIR_PASS_V(s, nir_split_var_copies);
      NIR_PASS_V(s, nir_lower_var_copies);
      NIR_PASS_V(s, nir_lower_global_vars_to_local);
   }

   /* Regardless of the above, we need to lower indirect references to
    * compact variables such as clip/cull distances because due to how
    * TCS<->TES IO works we cannot handle indirect accesses that "straddle"
    * vec4 components. nir_lower_indirect_derefs has a special case for
    * compact variables, so it will actually lower them even though we pass
    * in 0 modes.
    *
    * Using temporaries would be slightly better but
    * nir_lower_io_to_temporaries currently doesn't support TCS i/o.
    */
   NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
}
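/* Variant-independent lowering and the first optimization loop.  Passes that
 * must run after pscreen->finalize_nir() are handled in ir3_nir_post_finalize()
 * below.
 */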
void
ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
{
   struct nir_lower_tex_options tex_options = {
      .lower_rect = 0,
      .lower_tg4_offsets = true,
   };

   if (compiler->gen >= 4) {
      /* a4xx seems to have *no* sam.p */
      tex_options.lower_txp = ~0; /* lower all txp */
   } else {
      /* a3xx just needs to avoid sam.p for 3d tex */
      tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
   }

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, ir3_nir_lower_gs);

   NIR_PASS_V(s, nir_lower_amul, ir3_glsl_type_size);

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s);

   OPT_V(s, nir_lower_tex, &tex_options);
   OPT_V(s, nir_lower_load_const_to_scalar);
   if (compiler->gen < 5)
      OPT_V(s, ir3_nir_lower_tg4_to_tex);

   ir3_optimize_loop(compiler, s);

   /* do idiv lowering after first opt loop to get a chance to propagate
    * constants for divide by immed power-of-two:
    */
   nir_lower_idiv_options idiv_options = {
      .imprecise_32bit_lowering = true,
      .allow_fp16 = true,
   };
   const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);

   if (idiv_progress)
      ir3_optimize_loop(compiler, s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   /* st_program.c's parameter list optimization requires that future nir
    * variants don't reallocate the uniform storage, so we have to remove
    * uniforms that occupy storage.  But we don't want to remove samplers,
    * because they're needed for YUV variant lowering.
    */
   nir_foreach_uniform_variable_safe (var, s) {
      if (var->data.mode == nir_var_uniform &&
          (glsl_type_get_image_count(var->type) ||
           glsl_type_get_sampler_count(var->type)))
         continue;

      exec_node_remove(&var->node);
   }
   nir_validate_shader(s, "after uniform var removal");

   nir_sweep(s);
}
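/* Lower subgroup-related system values for compute in terms of the local
 * invocation index, the subgroup size and the ir3-specific subgroup_id_shift:
 *
 *   subgroup_invocation = local_invocation_index & (subgroup_size - 1)
 *   subgroup_id         = local_invocation_index >> subgroup_id_shift
 *   num_subgroups       = 1 + ((workgroup_size - 1) >> subgroup_id_shift)
 *                         (i.e. DIV_ROUND_UP(workgroup_size, subgroup_size))
 */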
static bool
lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
{
   (void)unused;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
          intr->intrinsic == nir_intrinsic_load_num_subgroups;
}

static nir_ssa_def *
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
{
   (void)unused;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
      return nir_iand(
         b, nir_load_local_invocation_index(b),
         nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
      return nir_ishr(b, nir_load_local_invocation_index(b),
                      nir_load_subgroup_id_shift_ir3(b));
   } else {
      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
      /* If the workgroup size is constant,
       * nir_lower_compute_system_values() will replace local_size with a
       * constant so this can mostly be constant folded away.
       */
      nir_ssa_def *local_size = nir_load_workgroup_size(b);
      nir_ssa_def *size =
         nir_imul24(b, nir_channel(b, local_size, 0),
                    nir_imul24(b, nir_channel(b, local_size, 1),
                               nir_channel(b, local_size, 2)));
      nir_ssa_def *one = nir_imm_int(b, 1);
      return nir_iadd(b, one,
                      nir_ishr(b, nir_isub(b, size, one),
                               nir_load_subgroup_id_shift_ir3(b)));
   }
}

static bool
ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
                                        lower_subgroup_id, NULL);
}

static const nir_lower_idiv_options idiv_options = {
   .imprecise_32bit_lowering = true,
   .allow_fp16 = true,
};

/**
 * Late passes that need to be done after pscreen->finalize_nir()
 */
void
ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
{
   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              ir3_glsl_type_size, (nir_lower_io_options)0);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* NOTE: lower load_barycentric_at_sample first, since it
       * produces load_barycentric_at_offset:
       */
      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_sample);
      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_offset);
      NIR_PASS_V(s, ir3_nir_move_varying_inputs);
      NIR_PASS_V(s, nir_lower_fb_read);
   }

   if (compiler->gen >= 6 && s->info.stage == MESA_SHADER_FRAGMENT &&
       !(ir3_shader_debug & IR3_DBG_NOFP16)) {
      NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
   }

   if (s->info.stage == MESA_SHADER_COMPUTE) {
      bool progress = false;
      NIR_PASS(progress, s, nir_lower_subgroups,
               &(nir_lower_subgroups_options){
                  .subgroup_size = 128,
                  .ballot_bit_size = 32,
                  .ballot_components = 4,
                  .lower_to_scalar = true,
                  .lower_vote_eq = true,
                  .lower_subgroup_masks = true,
                  .lower_read_invocation_to_cond = true,
               });

      progress = false;
      NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);

      /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
       * we need to lower again.
       */
      if (progress)
         NIR_PASS_V(s, nir_lower_compute_system_values, NULL);
   }

   /* we cannot ensure that ir3_finalize_nir() is only called once, so
    * we also need to do any run-once workarounds here:
    */
   OPT_V(s, ir3_nir_apply_trig_workarounds);

   nir_lower_image_options lower_image_opts = {
      .lower_cube_size = true,
   };
   NIR_PASS_V(s, nir_lower_image, &lower_image_opts);
   NIR_PASS_V(s, nir_lower_idiv, &idiv_options); /* idiv generated by cube lowering */

   if (compiler->gen >= 6)
      OPT_V(s, ir3_nir_lower_ssbo_size, compiler->storage_16bit);

   ir3_optimize_loop(compiler, s);
}
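/* Replace load_input of VARYING_SLOT_LAYER and/or VARYING_SLOT_VIEWPORT with
 * constant zero when the variant key guarantees they are zero (layer_zero /
 * view_zero).
 */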
static bool
ir3_nir_lower_view_layer_id(nir_shader *nir, bool layer_zero, bool view_zero)
{
   unsigned layer_id_loc = ~0, view_id_loc = ~0;
   nir_foreach_shader_in_variable (var, nir) {
      if (var->data.location == VARYING_SLOT_LAYER)
         layer_id_loc = var->data.driver_location;
      if (var->data.location == VARYING_SLOT_VIEWPORT)
         view_id_loc = var->data.driver_location;
   }

   assert(!layer_zero || layer_id_loc != ~0);
   assert(!view_zero || view_id_loc != ~0);

   bool progress = false;
   nir_builder b;

   nir_foreach_function (func, nir) {
      nir_builder_init(&b, func->impl);

      nir_foreach_block (block, func->impl) {
         nir_foreach_instr_safe (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

            if (intrin->intrinsic != nir_intrinsic_load_input)
               continue;

            unsigned base = nir_intrinsic_base(intrin);
            if (base != layer_id_loc && base != view_id_loc)
               continue;

            b.cursor = nir_before_instr(&intrin->instr);
            nir_ssa_def *zero = nir_imm_int(&b, 0);
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, zero);
            nir_instr_remove(&intrin->instr);
            progress = true;
         }
      }

      if (progress) {
         nir_metadata_preserve(
            func->impl, nir_metadata_block_index | nir_metadata_dominance);
      } else {
         nir_metadata_preserve(func->impl, nir_metadata_all);
      }
   }

   return progress;
}
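/* Variant-specific lowering driven by the shader key: tess/GS explicit I/O,
 * clip plane and layer/viewport lowering, UBO/const handling and scratch
 * lowering, followed by late algebraic cleanup and const-state setup.
 */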
void
ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
{
   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   bool progress = false;

   if (so->key.has_gs || so->key.tessellation) {
      switch (so->shader->type) {
      case MESA_SHADER_VERTEX:
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
                    so->key.tessellation);
         progress = true;
         break;
      case MESA_SHADER_TESS_CTRL:
         NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, so, so->key.tessellation);
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
         progress = true;
         break;
      case MESA_SHADER_TESS_EVAL:
         NIR_PASS_V(s, ir3_nir_lower_tess_eval, so, so->key.tessellation);
         if (so->key.has_gs)
            NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
                       so->key.tessellation);
         progress = true;
         break;
      case MESA_SHADER_GEOMETRY:
         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
         progress = true;
         break;
      default:
         break;
      }
   }

   if (s->info.stage == MESA_SHADER_VERTEX) {
      if (so->key.ucp_enables)
         progress |=
            OPT(s, nir_lower_clip_vs, so->key.ucp_enables, false, false, NULL);
   } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
      bool layer_zero =
         so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
      bool view_zero =
         so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);

      if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
         progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
      if (layer_zero || view_zero)
         progress |= OPT(s, ir3_nir_lower_view_layer_id, layer_zero, view_zero);
   }

   /* Move large constant variables to the constants attached to the NIR
    * shader, which we will upload in the immediates range.  This generates
    * amuls, so we need to clean those up after.
    *
    * Passing no size_align, we would get packed values, which if we end up
    * having to load with LDC would result in extra reads to unpack from
    * straddling loads.  Align everything to vec4 to avoid that, though we
    * could theoretically do better.
    */
   OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes,
         32 /* bytes */);
   OPT_V(s, ir3_nir_lower_load_constant, so);

   if (!so->binning_pass)
      OPT_V(s, ir3_nir_analyze_ubo_ranges, so);

   progress |= OPT(s, ir3_nir_lower_ubo_loads, so);

   /* Lower large temporaries to scratch, which in Qualcomm terms is private
    * memory, to avoid excess register pressure. This should happen after
    * nir_opt_large_constants, because loading from a UBO is much, much less
    * expensive.
    */
   if (so->shader->compiler->has_pvtmem) {
      progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
                      16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
   }

   /* Lower scratch writemasks */
   progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s);

   OPT_V(s, nir_lower_amul, ir3_glsl_type_size);

   /* UBO offset lowering has to come after we've decided what will
    * be left as load_ubo
    */
   if (so->shader->compiler->gen >= 6)
      progress |= OPT(s, nir_lower_ubo_vec4);

   OPT_V(s, ir3_nir_lower_io_offsets);

   if (progress)
      ir3_optimize_loop(so->shader->compiler, s);

   /* Fixup indirect load_uniform's which end up with a const base offset
    * which is too large to encode.  Do this late(ish) so we actually
    * can differentiate indirect vs non-indirect.
    */
   if (OPT(s, ir3_nir_fixup_load_uniform))
      ir3_optimize_loop(so->shader->compiler, s);

   /* Do late algebraic optimization to turn add(a, neg(b)) back into
    * subs, then the mandatory cleanup after algebraic.  Note that it may
    * produce fnegs, and if so then we need to keep running to squash
    * fneg(fneg(a)).
    */
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = OPT(s, nir_opt_algebraic_late);
      OPT_V(s, nir_opt_constant_folding);
      OPT_V(s, nir_copy_prop);
      OPT_V(s, nir_opt_dce);
      OPT_V(s, nir_opt_cse);
   }

   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   if (ir3_shader_debug & IR3_DBG_DISASM) {
      mesa_logi("----------------------");
      nir_log_shaderi(s);
      mesa_logi("----------------------");
   }

   nir_sweep(s);

   /* Binning pass variants re-use the const_state of the corresponding
    * draw pass shader, so that the same const emit can be re-used for both
    * passes:
    */
   if (!so->binning_pass)
      ir3_setup_const_state(s, so, ir3_const_state(so));
}
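/* Scan the shader for intrinsics that consume driver params (vertex/instance
 * base, user clip planes, workgroup/subgroup info) or, on pre-a6xx, image
 * dimension constants, and grow the const state layout accordingly.
 */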
static void
ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader,
                           struct ir3_const_state *layout)
{
   nir_foreach_function (function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block (block, function->impl) {
         nir_foreach_instr (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            unsigned idx;

            switch (intr->intrinsic) {
            case nir_intrinsic_image_atomic_add:
            case nir_intrinsic_image_atomic_imin:
            case nir_intrinsic_image_atomic_umin:
            case nir_intrinsic_image_atomic_imax:
            case nir_intrinsic_image_atomic_umax:
            case nir_intrinsic_image_atomic_and:
            case nir_intrinsic_image_atomic_or:
            case nir_intrinsic_image_atomic_xor:
            case nir_intrinsic_image_atomic_exchange:
            case nir_intrinsic_image_atomic_comp_swap:
            case nir_intrinsic_image_load:
            case nir_intrinsic_image_store:
            case nir_intrinsic_image_size:
               if (compiler->gen < 6 &&
                   !(intr->intrinsic == nir_intrinsic_image_load &&
                     !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) {
                  idx = nir_src_as_uint(intr->src[0]);
                  if (layout->image_dims.mask & (1 << idx))
                     break;
                  layout->image_dims.mask |= (1 << idx);
                  layout->image_dims.off[idx] = layout->image_dims.count;
                  layout->image_dims.count += 3; /* three const per */
               }
               break;
            case nir_intrinsic_load_base_vertex:
            case nir_intrinsic_load_first_vertex:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
               break;
            case nir_intrinsic_load_base_instance:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
               break;
            case nir_intrinsic_load_user_clip_plane:
               idx = nir_intrinsic_ucp_id(intr);
               layout->num_driver_params = MAX2(layout->num_driver_params,
                                                IR3_DP_UCP0_X + (idx + 1) * 4);
               break;
            case nir_intrinsic_load_num_workgroups:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
               break;
            case nir_intrinsic_load_workgroup_size:
               layout->num_driver_params = MAX2(layout->num_driver_params,
                                                IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
               break;
            case nir_intrinsic_load_base_workgroup_id:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
               break;
            case nir_intrinsic_load_subgroup_size:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
               break;
            case nir_intrinsic_load_subgroup_id_shift_ir3:
               layout->num_driver_params =
                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
               break;
            default:
               break;
            }
         }
      }
   }
}

/* Sets up the variant-dependent constant state for the ir3_shader.  Note
 * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the
 * maximum number of driver params that would eventually be used, to leave
 * space for this function to allocate the driver params.
 */
void
ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                      struct ir3_const_state *const_state)
{
   struct ir3_compiler *compiler = v->shader->compiler;

   memset(&const_state->offsets, ~0, sizeof(const_state->offsets));

   ir3_nir_scan_driver_consts(compiler, nir, const_state);

   if ((compiler->gen < 5) && (v->shader->stream_output.num_outputs > 0)) {
      const_state->num_driver_params =
         MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
   }

   const_state->num_ubos = nir->info.num_ubos;

   debug_assert((const_state->ubo_state.size % 16) == 0);
   unsigned constoff = const_state->ubo_state.size / 16;
   unsigned ptrsz = ir3_pointer_size(compiler);

   if (const_state->num_ubos > 0) {
      const_state->offsets.ubo = constoff;
      constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
   }

   if (const_state->image_dims.count > 0) {
      unsigned cnt = const_state->image_dims.count;
      const_state->offsets.image_dims = constoff;
      constoff += align(cnt, 4) / 4;
   }

   if (const_state->num_driver_params > 0) {
      /* num_driver_params in dwords.  we only need to align to vec4s for the
       * common case of immediate constant uploads, but for indirect dispatch
       * the constants may also be indirect and so we have to align the area in
       * const space to that requirement.
       */
      const_state->num_driver_params = align(const_state->num_driver_params, 4);
      unsigned upload_unit = 1;
      if (v->type == MESA_SHADER_COMPUTE ||
          (const_state->num_driver_params >= IR3_DP_VTXID_BASE)) {
         upload_unit = compiler->const_upload_unit;
      }

      /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
      if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6)
         constoff = MAX2(constoff, 1);
      constoff = align(constoff, upload_unit);
      const_state->offsets.driver_param = constoff;

      constoff += align(const_state->num_driver_params / 4, upload_unit);
   }

   if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) &&
       v->shader->stream_output.num_outputs > 0) {
      const_state->offsets.tfbo = constoff;
      constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
   }

   switch (v->type) {
   case MESA_SHADER_VERTEX:
      const_state->offsets.primitive_param = constoff;
      constoff += 1;
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_TESS_EVAL:
      constoff = align(constoff - 1, 4) + 3;
      const_state->offsets.primitive_param = constoff;
      const_state->offsets.primitive_map = constoff + 5;
      constoff += 5 + DIV_ROUND_UP(v->input_size, 4);
      break;
   case MESA_SHADER_GEOMETRY:
      const_state->offsets.primitive_param = constoff;
      const_state->offsets.primitive_map = constoff + 1;
      constoff += 1 + DIV_ROUND_UP(v->input_size, 4);
      break;
   default:
      break;
   }

   const_state->offsets.immediate = constoff;

   assert(constoff <= ir3_max_const(v));
}