1 /*
2 * Copyright © 2021 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /* This helps separate shaders because the next shader doesn't have to be known.
25 *
26 * It optimizes VS and TES outputs before FS as follows:
27 * - Eliminate and merge equal outputs, and treat undef as equal to everything, e.g.
28 * (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
29 * qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
30 * - Remove constant outputs that match AMD DEFAULT_VAL options, e.g. (0,0,0,1),
31 * treat undef as whatever.
32 *
33 * It requires that there is no indirect indexing and all output stores must be scalar.
34 */
35
36 #include "ac_nir.h"
37 #include "nir_builder.h"
38
/* Per-channel tracking of one output slot. */
struct ac_chan_info {
   nir_instr *value;                /* The stored SSA value; NULL means the channel is undef. */
   nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
};
43
/* Gathered information about one output slot (one gl_varying_slot). */
struct ac_out_info {
   unsigned base;      /* nir_intrinsic_base; must be identical for all stores of the slot */
   nir_alu_type types; /* Union of nir_intrinsic_src_type of all stores (16-bit and/or 32-bit). */
   bool duplicated;    /* Set when the output was remapped to an equal earlier output. */
   bool constant;      /* Set when the output was replaced by a DEFAULT_VAL constant. */

   /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
    * Channels 4-7 are high bits of 16-bit channels.
    */
   struct ac_chan_info chan[8];
};
55
ac_remove_varying(struct ac_out_info * out)56 static void ac_remove_varying(struct ac_out_info *out)
57 {
58 /* Remove the output. (all channels) */
59 for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
60 if (out->chan[i].store_intr) {
61 nir_remove_varying(out->chan[i].store_intr);
62 out->chan[i].store_intr = NULL;
63 out->chan[i].value = NULL;
64 }
65 }
66 }
67
68 /* Return true if the output matches DEFAULT_VAL and has been eliminated. */
ac_eliminate_const_output(struct ac_out_info * out,gl_varying_slot semantic,uint8_t * param_export_index)69 static bool ac_eliminate_const_output(struct ac_out_info *out,
70 gl_varying_slot semantic,
71 uint8_t *param_export_index)
72 {
73 if (!(out->types & 32))
74 return false;
75
76 bool is_zero[4] = {0}, is_one[4] = {0};
77
78 for (unsigned i = 0; i < 4; i++) {
79 /* NULL means undef. */
80 if (!out->chan[i].value) {
81 is_zero[i] = true;
82 is_one[i] = true;
83 } else if (out->chan[i].value->type == nir_instr_type_load_const) {
84 if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
85 is_zero[i] = true;
86 else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
87 is_one[i] = true;
88 else
89 return false; /* other constant */
90 } else
91 return false;
92 }
93
94 /* Only certain combinations of 0 and 1 are supported. */
95 unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
96
97 if (is_zero[0] && is_zero[1] && is_zero[2]) {
98 if (is_zero[3])
99 default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
100 else if (is_one[3])
101 default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
102 else
103 return false;
104 } else if (is_one[0] && is_one[1] && is_one[2]) {
105 if (is_zero[3])
106 default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
107 else if (is_one[3])
108 default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
109 else
110 return false;
111 } else {
112 return false;
113 }
114
115 /* Change OFFSET to DEFAULT_VAL. */
116 param_export_index[semantic] = default_val;
117 out->constant = true;
118 ac_remove_varying(out);
119 return true;
120 }
121
ac_eliminate_duplicated_output(struct ac_out_info * outputs,BITSET_DECLARE (outputs_optimized,NUM_TOTAL_VARYING_SLOTS),gl_varying_slot current,struct nir_builder * b,int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])122 static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
123 BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
124 gl_varying_slot current, struct nir_builder *b,
125 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
126 {
127 struct ac_out_info *cur = &outputs[current];
128 unsigned p, copy_back_channels = 0;
129
130 /* Check all outputs before current. */
131 BITSET_FOREACH_SET(p, outputs_optimized, current) {
132 struct ac_out_info *prev = &outputs[p];
133
134 /* Only compare with real outputs. */
135 if (prev->constant || prev->duplicated)
136 continue;
137
138 /* The types must match (only 16-bit and 32-bit types are allowed). */
139 if ((prev->types & 16) != (cur->types & 16))
140 continue;
141
142 bool different = false;
143
144 /* Iterate over all channels, including 16-bit channels in chan_hi. */
145 for (unsigned j = 0; j < 8; j++) {
146 nir_instr *prev_chan = prev->chan[j].value;
147 nir_instr *cur_chan = cur->chan[j].value;
148
149 /* Treat undef as a match. */
150 if (!cur_chan)
151 continue;
152
153 /* If prev is undef but cur isn't, we can merge the outputs
154 * and consider the output duplicated.
155 */
156 if (!prev_chan) {
157 copy_back_channels |= 1 << j;
158 continue;
159 }
160
161 /* Test whether the values are different. */
162 if (prev_chan != cur_chan &&
163 (prev_chan->type != nir_instr_type_load_const ||
164 cur_chan->type != nir_instr_type_load_const ||
165 nir_instr_as_load_const(prev_chan)->value[0].u32 !=
166 nir_instr_as_load_const(cur_chan)->value[0].u32)) {
167 different = true;
168 break;
169 }
170 }
171 if (!different)
172 break;
173
174 copy_back_channels = 0;
175 }
176 if (p == current)
177 return false;
178
179 /* An equal output already exists. Make FS use the existing one instead.
180 * This effectively disables the current output and the param export shouldn't
181 * be generated.
182 */
183 cur->duplicated = true;
184
185 /* p is gl_varying_slot in addition to being an index into outputs. */
186 slot_remap[current] = p;
187
188 /* If the matching preceding output has undef where the current one has a proper value,
189 * move the value to the preceding output.
190 */
191 struct ac_out_info *prev = &outputs[p];
192
193 while (copy_back_channels) {
194 unsigned i = u_bit_scan(©_back_channels);
195 struct ac_chan_info *prev_chan = &prev->chan[i];
196 struct ac_chan_info *cur_chan = &cur->chan[i];
197
198 b->cursor = nir_after_instr(&cur_chan->store_intr->instr);
199
200 /* The store intrinsic doesn't exist for this channel. Create a new one. */
201 nir_alu_type src_type = nir_intrinsic_src_type(cur_chan->store_intr);
202 struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_chan->store_intr);
203 struct nir_io_xfb xfb = nir_intrinsic_io_xfb(cur_chan->store_intr);
204 struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(cur_chan->store_intr);
205
206 /* p is gl_varying_slot in addition to being an index into outputs. */
207 sem.location = p;
208 assert(sem.high_16bits == i / 4);
209
210 /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
211 * the system value output. This is just the varying portion.
212 */
213 sem.no_sysval_output = 1;
214
215 /* Write just one component. */
216 prev_chan->store_intr = nir_store_output(b, nir_instr_ssa_def(cur_chan->value),
217 nir_imm_int(b, 0),
218 .base = prev->base,
219 .component = i % 4,
220 .io_semantics = sem,
221 .src_type = src_type,
222 .write_mask = 0x1,
223 .io_xfb = xfb,
224 .io_xfb2 = xfb2);
225
226 /* Update the undef channels in the output info. */
227 assert(!prev_chan->value);
228 prev_chan->value = cur_chan->value;
229
230 /* Remove transform feedback info from the current instruction because
231 * we moved it too. The instruction might not be removed if it's a system
232 * value output.
233 */
234 static struct nir_io_xfb zero_xfb;
235 nir_intrinsic_set_io_xfb(cur->chan[i].store_intr, zero_xfb);
236 nir_intrinsic_set_io_xfb2(cur->chan[i].store_intr, zero_xfb);
237 }
238
239 ac_remove_varying(cur);
240 return true;
241 }
242
/* Optimize VS/TES param exports before FS: eliminate constant outputs that
 * match DEFAULT_VAL, and merge/eliminate duplicated outputs. Requires direct
 * indexing and scalar output stores. Returns true on progress.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   /* Only VS and TES outputs are handled by this pass. */
   if (nir->info.stage != MESA_SHADER_VERTEX &&
       nir->info.stage != MESA_SHADER_TESS_EVAL) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };

   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
   BITSET_ZERO(outputs_optimized);

   /* Gather outputs. */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
         if (store->intrinsic != nir_intrinsic_store_output)
            continue;

         nir_io_semantics sem = nir_intrinsic_io_semantics(store);
         gl_varying_slot location = sem.location;

         /* Only process varyings that appear as param exports. */
         if (!nir_slot_is_varying(location) || sem.no_varying)
            continue;

         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
         if (!sprite_tex_disallowed &&
             location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
            continue;

         BITSET_SET(outputs_optimized, location);

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(store);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         /* nir_lower_io_to_scalar is required before this */
         assert(store->src[0].ssa->num_components == 1);
         /* No intrinsic should store undef. */
         assert(store->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef);

         /* Gather the output. All stores of a slot must share the same base. */
         struct ac_out_info *out = &outputs[location];
         if (!out->types)
            out->base = nir_intrinsic_base(store);
         else
            assert(out->base == nir_intrinsic_base(store));

         out->types |= nir_intrinsic_src_type(store);

         unsigned chan = sem.high_16bits * 4 + nir_intrinsic_component(store);
         out->chan[chan].store_intr = store;
         out->chan[chan].value = store->src[0].ssa->parent_instr;
      }
   }

   struct nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   unsigned i;

   /* Optimize outputs: try the constant elimination first, then duplicates. */
   BITSET_FOREACH_SET(i, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
      progress |=
         ac_eliminate_const_output(&outputs[i], i, param_export_index) ||
         ac_eliminate_duplicated_output(outputs, outputs_optimized, i, &b, slot_remap);
   }

   nir_metadata_preserve(impl, progress ? nir_metadata_dominance | nir_metadata_block_index
                                        : nir_metadata_all);
   return progress;
}
329