• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
/* This helps separate shaders because the next shader doesn't have to be known.
 *
 * It optimizes VS and TES outputs before FS as follows:
 * - Eliminate and merge equal outputs, treating undef as equal to everything, e.g.
 *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef), regardless of the interpolation
 *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
 * - Remove constant outputs that match one of the AMD DEFAULT_VAL options, e.g. (0,0,0,1),
 *   treating undef as matching any value.
 *
 * It requires that there is no indirect indexing and that all output stores are scalar.
 */
35 
36 #include "ac_nir.h"
37 #include "nir_builder.h"
38 
39 struct ac_chan_info {
40    nir_instr *value;
41    nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
42 };
43 
44 struct ac_out_info {
45    unsigned base; /* nir_intrinsic_base */
46    nir_alu_type types;
47    bool duplicated;
48    bool constant;
49 
50    /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
51     * Channels 4-7 are high bits of 16-bit channels.
52     */
53    struct ac_chan_info chan[8];
54 };
55 
ac_remove_varying(struct ac_out_info * out)56 static void ac_remove_varying(struct ac_out_info *out)
57 {
58    /* Remove the output. (all channels) */
59    for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
60       if (out->chan[i].store_intr) {
61          nir_remove_varying(out->chan[i].store_intr);
62          out->chan[i].store_intr = NULL;
63          out->chan[i].value = NULL;
64       }
65    }
66 }
67 
68 /* Return true if the output matches DEFAULT_VAL and has been eliminated. */
ac_eliminate_const_output(struct ac_out_info * out,gl_varying_slot semantic,uint8_t * param_export_index)69 static bool ac_eliminate_const_output(struct ac_out_info *out,
70                                       gl_varying_slot semantic,
71                                       uint8_t *param_export_index)
72 {
73    if (!(out->types & 32))
74       return false;
75 
76    bool is_zero[4] = {0}, is_one[4] = {0};
77 
78    for (unsigned i = 0; i < 4; i++) {
79       /* NULL means undef. */
80       if (!out->chan[i].value) {
81          is_zero[i] = true;
82          is_one[i] = true;
83       } else if (out->chan[i].value->type == nir_instr_type_load_const) {
84          if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
85             is_zero[i] = true;
86          else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
87             is_one[i] = true;
88          else
89             return false; /* other constant */
90       } else
91          return false;
92    }
93 
94    /* Only certain combinations of 0 and 1 are supported. */
95    unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
96 
97    if (is_zero[0] && is_zero[1] && is_zero[2]) {
98       if (is_zero[3])
99          default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
100       else if (is_one[3])
101          default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
102       else
103          return false;
104    } else if (is_one[0] && is_one[1] && is_one[2]) {
105       if (is_zero[3])
106          default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
107       else if (is_one[3])
108          default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
109       else
110          return false;
111    } else {
112       return false;
113    }
114 
115    /* Change OFFSET to DEFAULT_VAL. */
116    param_export_index[semantic] = default_val;
117    out->constant = true;
118    ac_remove_varying(out);
119    return true;
120 }
121 
/* Try to fold the output in slot "current" into an equal output in a lower slot.
 *
 * \param outputs            Gathered info for all slots, indexed by gl_varying_slot.
 * \param outputs_optimized  Bitset of the slots that take part in the optimization.
 * \param current            The slot being tested.
 * \param b                  Builder used to emit stores that fill undef channels
 *                           of the surviving output.
 * \param slot_remap         On success, slot_remap[current] receives the slot of
 *                           the surviving output.
 *
 * Return true if an equal earlier output was found and "current" was eliminated.
 */
static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
                                           gl_varying_slot current, struct nir_builder *b,
                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct ac_out_info *cur = &outputs[current];
   unsigned p;
   /* Channels that are undef in the matching output but defined in "cur". */
   unsigned copy_back_channels = 0;

   /* Look for a match among the slots below "current". */
   BITSET_FOREACH_SET(p, outputs_optimized, current) {
      struct ac_out_info *candidate = &outputs[p];

      /* Outputs that have been eliminated themselves can't be targets. */
      if (candidate->constant || candidate->duplicated)
         continue;

      /* 16-bit outputs only match 16-bit outputs (only 16-bit and 32-bit
       * types are allowed).
       */
      if ((candidate->types & 16) != (cur->types & 16))
         continue;

      unsigned fill_mask = 0;
      bool matches = true;

      /* Walk all 8 channels, i.e. including the high 16-bit halves. */
      for (unsigned ch = 0; ch < 8; ch++) {
         nir_instr *cur_val = cur->chan[ch].value;
         nir_instr *cand_val = candidate->chan[ch].value;

         /* Undef in the current output matches anything. */
         if (!cur_val)
            continue;

         /* Undef in the candidate also matches, but the current value has to
          * be moved there if the candidate ends up being chosen.
          */
         if (!cand_val) {
            fill_mask |= 1u << ch;
            continue;
         }

         if (cand_val == cur_val)
            continue;

         /* Distinct instructions are still equal if both are constants with
          * the same bits.
          */
         bool same_const = cand_val->type == nir_instr_type_load_const &&
                           cur_val->type == nir_instr_type_load_const &&
                           nir_instr_as_load_const(cand_val)->value[0].u32 ==
                           nir_instr_as_load_const(cur_val)->value[0].u32;
         if (!same_const) {
            matches = false;
            break;
         }
      }

      if (matches) {
         copy_back_channels = fill_mask;
         break;
      }
   }

   /* The loop ran to completion: nothing equal precedes this output. */
   if (p == current)
      return false;

   /* An equal output already exists. Make FS use the existing one instead.
    * This effectively disables the current output and the param export shouldn't
    * be generated.
    *
    * p is gl_varying_slot in addition to being an index into outputs.
    */
   struct ac_out_info *target = &outputs[p];

   cur->duplicated = true;
   slot_remap[current] = p;

   /* Zero-initialized (static storage); used to clear xfb info below. */
   static struct nir_io_xfb no_xfb;

   /* Move every value that the target lacks (undef there, defined here). */
   for (unsigned i = 0; i < 8; i++) {
      if (!(copy_back_channels & (1u << i)))
         continue;

      struct ac_chan_info *src = &cur->chan[i];
      struct ac_chan_info *dst = &target->chan[i];
      nir_intrinsic_instr *src_store = src->store_intr;

      /* The target has no store for this channel, so build one right after
       * the current store, reusing its type, semantics and xfb info.
       */
      b->cursor = nir_after_instr(&src_store->instr);

      struct nir_io_semantics sem = nir_intrinsic_io_semantics(src_store);
      nir_alu_type src_type = nir_intrinsic_src_type(src_store);
      struct nir_io_xfb xfb = nir_intrinsic_io_xfb(src_store);
      struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(src_store);

      assert(sem.high_16bits == i / 4);
      sem.location = p;

      /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
       * the system value output. This is just the varying portion.
       */
      sem.no_sysval_output = 1;

      /* Write just one component. */
      dst->store_intr = nir_store_output(b, nir_instr_ssa_def(src->value),
                                         nir_imm_int(b, 0),
                                         .base = target->base,
                                         .component = i % 4,
                                         .io_semantics = sem,
                                         .src_type = src_type,
                                         .write_mask = 0x1,
                                         .io_xfb = xfb,
                                         .io_xfb2 = xfb2);

      /* The channel is no longer undef in the target's info. */
      assert(!dst->value);
      dst->value = src->value;

      /* Transform feedback info went along with the new store, so drop it
       * from the old one. The old instruction might not be removed if it's a
       * system value output.
       */
      nir_intrinsic_set_io_xfb(src_store, no_xfb);
      nir_intrinsic_set_io_xfb2(src_store, no_xfb);
   }

   ac_remove_varying(cur);
   return true;
}
242 
/* Eliminate constant and duplicated param outputs of a VS or TES.
 *
 * \param nir                    Shader to optimize; other stages are left untouched.
 * \param sprite_tex_disallowed  If false, VARYING_SLOT_TEX0..7 are not optimized.
 * \param slot_remap             For every eliminated duplicate, receives the slot
 *                               of the equal output that remains.
 * \param param_export_index     For every eliminated constant, receives the
 *                               AC_EXP_PARAM_DEFAULT_VAL_* code.
 *
 * Return true if at least one output has been eliminated.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   const bool stage_supported = nir->info.stage == MESA_SHADER_VERTEX ||
                                nir->info.stage == MESA_SHADER_TESS_EVAL;
   if (!stage_supported) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };

   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
   BITSET_ZERO(outputs_optimized);

   /* Pass 1: record every output store per slot and channel. */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Only process varyings that appear as param exports. */
         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
            continue;

         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
         const bool is_tex_slot = sem.location >= VARYING_SLOT_TEX0 &&
                                  sem.location <= VARYING_SLOT_TEX7;
         if (is_tex_slot && !sprite_tex_disallowed)
            continue;

         BITSET_SET(outputs_optimized, sem.location);

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         /* nir_lower_io_to_scalar is required before this */
         assert(intr->src[0].ssa->num_components == 1);
         /* No intrinsic should store undef. */
         assert(intr->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef);

         struct ac_out_info *info = &outputs[sem.location];

         /* The first store seen for a slot defines its base; all later ones must agree. */
         if (info->types)
            assert(info->base == nir_intrinsic_base(intr));
         else
            info->base = nir_intrinsic_base(intr);

         info->types |= nir_intrinsic_src_type(intr);

         /* NOTE(review): a later store to the same channel overwrites the
          * entry, so only the last store seen is tracked -- presumably there
          * is one store per channel at this point; confirm with callers.
          */
         struct ac_chan_info *ch = &info->chan[sem.high_16bits * 4 + nir_intrinsic_component(intr)];
         ch->store_intr = intr;
         ch->value = intr->src[0].ssa->parent_instr;
      }
   }

   struct nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   unsigned slot;

   /* Pass 2: for each gathered slot, try constant elimination first and fall
    * back to duplicate elimination only if that didn't apply.
    */
   BITSET_FOREACH_SET(slot, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
      if (ac_eliminate_const_output(&outputs[slot], slot, param_export_index) ||
          ac_eliminate_duplicated_output(outputs, outputs_optimized, slot, &b, slot_remap))
         progress = true;
   }

   nir_metadata_preserve(impl, progress ? (nir_metadata_dominance | nir_metadata_block_index)
                                        : nir_metadata_all);
   return progress;
}
329