1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "bi_builder.h"
25 #include "compiler.h"
26
27 /* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
28 * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
29 * away swizzles that cannot be represented. In the future, we should try to
30 * recombine swizzles where we can as an optimization.
31 */
32
33 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)34 bi_swizzle_replicates_8(enum bi_swizzle swz)
35 {
36 switch (swz) {
37 case BI_SWIZZLE_B0000:
38 case BI_SWIZZLE_B1111:
39 case BI_SWIZZLE_B2222:
40 case BI_SWIZZLE_B3333:
41 return true;
42 default:
43 return false;
44 }
45 }
46
47 static void
lower_swizzle(bi_context * ctx,bi_instr * ins,unsigned src)48 lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
49 {
50 /* TODO: Use the opcode table and be a lot more methodical about this... */
51 switch (ins->op) {
52 /* Some instructions used with 16-bit data never have swizzles */
53 case BI_OPCODE_CSEL_V2F16:
54 case BI_OPCODE_CSEL_V2I16:
55 case BI_OPCODE_CSEL_V2S16:
56 case BI_OPCODE_CSEL_V2U16:
57 break;
58
59 /* Despite ostensibly being 32-bit instructions, CLPER does not
60 * inherently interpret the data, so it can be used for v2f16
61 * derivatives, which might require swizzle lowering */
62 case BI_OPCODE_CLPER_I32:
63 case BI_OPCODE_CLPER_OLD_I32:
64 if (src == 0)
65 break;
66 else
67 return;
68
69 /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
70 * boolean is implemented as a 16-bit integer, the swizzle is needed
71 * for correct operation if the instruction producing the 16-bit
72 * boolean does not replicate to both halves of the containing 32-bit
73 * register. As such, we may need to lower a swizzle.
74 *
75 * This is a silly hack. Ideally, code gen would be smart enough to
76 * avoid this case (by replicating). In practice, silly hardware design
77 * decisions force our hand here.
78 */
79 case BI_OPCODE_MUX_I32:
80 case BI_OPCODE_CSEL_I32:
81 break;
82
83 case BI_OPCODE_IADD_V2S16:
84 case BI_OPCODE_IADD_V2U16:
85 case BI_OPCODE_ISUB_V2S16:
86 case BI_OPCODE_ISUB_V2U16:
87 if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
88 break;
89 else
90 return;
91 case BI_OPCODE_LSHIFT_AND_V2I16:
92 case BI_OPCODE_LSHIFT_OR_V2I16:
93 case BI_OPCODE_LSHIFT_XOR_V2I16:
94 case BI_OPCODE_RSHIFT_AND_V2I16:
95 case BI_OPCODE_RSHIFT_OR_V2I16:
96 case BI_OPCODE_RSHIFT_XOR_V2I16:
97 if (src == 2)
98 return;
99 else
100 break;
101
102 /* For some reason MUX.v2i16 allows swaps but not replication */
103 case BI_OPCODE_MUX_V2I16:
104 if (ins->src[src].swizzle == BI_SWIZZLE_H10)
105 return;
106 else
107 break;
108
109 /* No swizzles supported */
110 case BI_OPCODE_HADD_V4U8:
111 case BI_OPCODE_HADD_V4S8:
112 case BI_OPCODE_CLZ_V4U8:
113 case BI_OPCODE_IDP_V4I8:
114 case BI_OPCODE_IABS_V4S8:
115 case BI_OPCODE_ICMP_V4I8:
116 case BI_OPCODE_ICMP_V4U8:
117 case BI_OPCODE_MUX_V4I8:
118 case BI_OPCODE_IADD_IMM_V4I8:
119 break;
120
121 case BI_OPCODE_LSHIFT_AND_V4I8:
122 case BI_OPCODE_LSHIFT_OR_V4I8:
123 case BI_OPCODE_LSHIFT_XOR_V4I8:
124 case BI_OPCODE_RSHIFT_AND_V4I8:
125 case BI_OPCODE_RSHIFT_OR_V4I8:
126 case BI_OPCODE_RSHIFT_XOR_V4I8:
127 /* Last source allows identity or replication */
128 if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
129 return;
130
131 /* Others do not allow swizzles */
132 break;
133
134 /* We don't want to deal with reswizzling logic in modifier prop. Move
135 * the swizzle outside, it's easier for clamp propagation. */
136 case BI_OPCODE_FCLAMP_V2F16: {
137 bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
138 bi_index dest = ins->dest[0];
139 bi_index tmp = bi_temp(ctx);
140
141 bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
142 ins->src[0].swizzle = BI_SWIZZLE_H01;
143 ins->dest[0] = tmp;
144 bi_swz_v2i16_to(&b, dest, swizzled_src);
145 return;
146 }
147
148 default:
149 return;
150 }
151
152 /* First, try to apply a given swizzle to a constant to clear the
153 * runtime swizzle. This is less heavy-handed than ignoring the
154 * swizzle for scalar destinations, since it maintains
155 * replication of the destination.
156 */
157 if (ins->src[src].type == BI_INDEX_CONSTANT) {
158 ins->src[src].value =
159 bi_apply_swizzle(ins->src[src].value, ins->src[src].swizzle);
160 ins->src[src].swizzle = BI_SWIZZLE_H01;
161 return;
162 }
163
164 /* Even if the source does not replicate, if the consuming instruction
165 * produces a 16-bit scalar, we can ignore the other component.
166 */
167 if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
168 ins->src[src].swizzle == BI_SWIZZLE_H00) {
169 ins->src[src].swizzle = BI_SWIZZLE_H01;
170 return;
171 }
172
173 /* Lower it away */
174 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
175
176 bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8) ||
177 (bi_opcode_props[ins->op].size == BI_SIZE_32 &&
178 ins->src[src].swizzle >= BI_SWIZZLE_B0000);
179
180 bi_index orig = ins->src[src];
181 bi_index stripped = bi_replace_index(bi_null(), orig);
182 stripped.swizzle = ins->src[src].swizzle;
183
184 bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
185
186 bi_replace_src(ins, src, swz);
187 ins->src[src].swizzle = BI_SWIZZLE_H01;
188 }
189
190 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)191 bi_swizzle_replicates_16(enum bi_swizzle swz)
192 {
193 switch (swz) {
194 case BI_SWIZZLE_H00:
195 case BI_SWIZZLE_H11:
196 return true;
197 default:
198 /* If a swizzle replicates every 8-bits, it also replicates
199 * every 16-bits, so allow 8-bit replicating swizzles.
200 */
201 return bi_swizzle_replicates_8(swz);
202 }
203 }
204
205 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)206 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
207 {
208 switch (I->op) {
209
210 /* Instructions that construct vectors have replicated output if their
211 * sources are identical. Check this case first.
212 */
213 case BI_OPCODE_MKVEC_V2I16:
214 case BI_OPCODE_V2F16_TO_V2S16:
215 case BI_OPCODE_V2F16_TO_V2U16:
216 case BI_OPCODE_V2F32_TO_V2F16:
217 case BI_OPCODE_V2S16_TO_V2F16:
218 case BI_OPCODE_V2S8_TO_V2F16:
219 case BI_OPCODE_V2S8_TO_V2S16:
220 case BI_OPCODE_V2U16_TO_V2F16:
221 case BI_OPCODE_V2U8_TO_V2F16:
222 case BI_OPCODE_V2U8_TO_V2U16:
223 return bi_is_value_equiv(I->src[0], I->src[1]);
224
225 /* 16-bit transcendentals are defined to output zero in their
226 * upper half, so they do not replicate
227 */
228 case BI_OPCODE_FRCP_F16:
229 case BI_OPCODE_FRSQ_F16:
230 return false;
231
232 /* Not sure, be conservative, we don't use these.. */
233 case BI_OPCODE_VN_ASST1_F16:
234 case BI_OPCODE_FPCLASS_F16:
235 case BI_OPCODE_FPOW_SC_DET_F16:
236 return false;
237
238 default:
239 break;
240 }
241
242 /* Replication analysis only makes sense for ALU instructions */
243 if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
244 return false;
245
246 /* We only analyze 16-bit instructions for 16-bit replication. We could
247 * maybe do better.
248 */
249 if (bi_opcode_props[I->op].size != BI_SIZE_16)
250 return false;
251
252 bi_foreach_src(I, s) {
253 if (bi_is_null(I->src[s]))
254 continue;
255
256 /* Replicated swizzles */
257 if (bi_swizzle_replicates_16(I->src[s].swizzle))
258 continue;
259
260 /* Replicated values */
261 if (bi_is_ssa(I->src[s]) && BITSET_TEST(replicates_16, I->src[s].value))
262 continue;
263
264 /* Replicated constants */
265 if (I->src[s].type == BI_INDEX_CONSTANT &&
266 (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
267 continue;
268
269 return false;
270 }
271
272 return true;
273 }
274
275 void
bi_lower_swizzle(bi_context * ctx)276 bi_lower_swizzle(bi_context *ctx)
277 {
278 bi_foreach_instr_global_safe(ctx, ins) {
279 bi_foreach_src(ins, s) {
280 if (bi_is_null(ins->src[s]))
281 continue;
282 if (ins->src[s].swizzle == BI_SWIZZLE_H01)
283 continue;
284
285 lower_swizzle(ctx, ins, s);
286 }
287 }
288
289 /* Now that we've lowered swizzles, clean up the mess */
290 BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
291
292 bi_foreach_instr_global(ctx, ins) {
293 if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
294 BITSET_SET(replicates_16, ins->dest[0].value);
295
296 if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
297 BITSET_TEST(replicates_16, ins->src[0].value)) {
298 ins->op = BI_OPCODE_MOV_I32;
299 ins->src[0].swizzle = BI_SWIZZLE_H01;
300 }
301
302 /* The above passes rely on replicating destinations. For
303 * Valhall, we will want to optimize this. For now, default
304 * to Bifrost compatible behaviour.
305 */
306 if (ins->nr_dests)
307 ins->dest[0].swizzle = BI_SWIZZLE_H01;
308 }
309
310 free(replicates_16);
311 }
312