1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "compiler.h"
25 #include "bi_builder.h"
26
27 /* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
28 * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
29 * away swizzles that cannot be represented. In the future, we should try to
30 * recombine swizzles where we can as an optimization.
31 */
32
33 static void
bi_lower_swizzle_16(bi_context * ctx,bi_instr * ins,unsigned src)34 bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
35 {
36 /* Identity is ok */
37 if (ins->src[src].swizzle == BI_SWIZZLE_H01)
38 return;
39
40 /* TODO: Use the opcode table and be a lot more methodical about this... */
41 switch (ins->op) {
42 /* Some instructions used with 16-bit data never have swizzles */
43 case BI_OPCODE_CSEL_V2F16:
44 case BI_OPCODE_CSEL_V2I16:
45 case BI_OPCODE_CSEL_V2S16:
46 case BI_OPCODE_CSEL_V2U16:
47
48 /* Despite ostensibly being 32-bit instructions, CLPER does not
49 * inherently interpret the data, so it can be used for v2f16
50 * derivatives, which might require swizzle lowering */
51 case BI_OPCODE_CLPER_I32:
52 case BI_OPCODE_CLPER_OLD_I32:
53
54 /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
55 * boolean is implemented as a 16-bit integer, the swizzle is needed
56 * for correct operation if the instruction producing the 16-bit
57 * boolean does not replicate to both halves of the containing 32-bit
58 * register. As such, we may need to lower a swizzle.
59 *
60 * This is a silly hack. Ideally, code gen would be smart enough to
61 * avoid this case (by replicating). In practice, silly hardware design
62 * decisions force our hand here.
63 */
64 case BI_OPCODE_MUX_I32:
65 case BI_OPCODE_CSEL_I32:
66 break;
67
68 case BI_OPCODE_IADD_V2S16:
69 case BI_OPCODE_IADD_V2U16:
70 case BI_OPCODE_ISUB_V2S16:
71 case BI_OPCODE_ISUB_V2U16:
72 if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
73 break;
74 else
75 return;
76 case BI_OPCODE_LSHIFT_AND_V2I16:
77 case BI_OPCODE_LSHIFT_OR_V2I16:
78 case BI_OPCODE_LSHIFT_XOR_V2I16:
79 case BI_OPCODE_RSHIFT_AND_V2I16:
80 case BI_OPCODE_RSHIFT_OR_V2I16:
81 case BI_OPCODE_RSHIFT_XOR_V2I16:
82 if (src == 2)
83 return;
84 else
85 break;
86
87 /* For some reason MUX.v2i16 allows swaps but not replication */
88 case BI_OPCODE_MUX_V2I16:
89 if (ins->src[src].swizzle == BI_SWIZZLE_H10)
90 return;
91 else
92 break;
93
94 /* We don't want to deal with reswizzling logic in modifier prop. Move
95 * the swizzle outside, it's easier for clamp propagation. */
96 case BI_OPCODE_FCLAMP_V2F16:
97 {
98 bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
99 bi_index dest = ins->dest[0];
100 bi_index tmp = bi_temp(ctx);
101
102 ins->dest[0] = tmp;
103 bi_swz_v2i16_to(&b, dest, bi_replace_index(ins->src[0], tmp));
104 return;
105 }
106
107 default:
108 return;
109 }
110
111 /* First, try to apply a given swizzle to a constant to clear the
112 * runtime swizzle. This is less heavy-handed than ignoring the
113 * swizzle for scalar destinations, since it maintains
114 * replication of the destination.
115 */
116 if (ins->src[src].type == BI_INDEX_CONSTANT) {
117 ins->src[src].value = bi_apply_swizzle(ins->src[src].value,
118 ins->src[src].swizzle);
119 ins->src[src].swizzle = BI_SWIZZLE_H01;
120 return;
121 }
122
123 /* Even if the source does not replicate, if the consuming instruction
124 * produces a 16-bit scalar, we can ignore the other component.
125 */
126 if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
127 ins->src[src].swizzle == BI_SWIZZLE_H00)
128 {
129 ins->src[src].swizzle = BI_SWIZZLE_H01;
130 return;
131 }
132
133 /* Lower it away */
134 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
135 ins->src[src] = bi_replace_index(ins->src[src],
136 bi_swz_v2i16(&b, ins->src[src]));
137 ins->src[src].swizzle = BI_SWIZZLE_H01;
138 }
139
140 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)141 bi_swizzle_replicates_8(enum bi_swizzle swz)
142 {
143 switch (swz) {
144 case BI_SWIZZLE_B0000:
145 case BI_SWIZZLE_B1111:
146 case BI_SWIZZLE_B2222:
147 case BI_SWIZZLE_B3333:
148 return true;
149 default:
150 return false;
151 }
152 }
153
154 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)155 bi_swizzle_replicates_16(enum bi_swizzle swz)
156 {
157 switch (swz) {
158 case BI_SWIZZLE_H00:
159 case BI_SWIZZLE_H11:
160 return true;
161 default:
162 /* If a swizzle replicates every 8-bits, it also replicates
163 * every 16-bits, so allow 8-bit replicating swizzles.
164 */
165 return bi_swizzle_replicates_8(swz);
166 }
167 }
168
169 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)170 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
171 {
172 switch (I->op) {
173
174 /* Instructions that construct vectors have replicated output if their
175 * sources are identical. Check this case first.
176 */
177 case BI_OPCODE_MKVEC_V2I16:
178 case BI_OPCODE_V2F16_TO_V2S16:
179 case BI_OPCODE_V2F16_TO_V2U16:
180 case BI_OPCODE_V2F32_TO_V2F16:
181 case BI_OPCODE_V2S16_TO_V2F16:
182 case BI_OPCODE_V2S8_TO_V2F16:
183 case BI_OPCODE_V2S8_TO_V2S16:
184 case BI_OPCODE_V2U16_TO_V2F16:
185 case BI_OPCODE_V2U8_TO_V2F16:
186 case BI_OPCODE_V2U8_TO_V2U16:
187 return bi_is_value_equiv(I->src[0], I->src[1]);
188
189 /* 16-bit transcendentals are defined to output zero in their
190 * upper half, so they do not replicate
191 */
192 case BI_OPCODE_FRCP_F16:
193 case BI_OPCODE_FRSQ_F16:
194 return false;
195
196 /* Not sure, be conservative, we don't use these.. */
197 case BI_OPCODE_VN_ASST1_F16:
198 case BI_OPCODE_FPCLASS_F16:
199 case BI_OPCODE_FPOW_SC_DET_F16:
200 return false;
201
202 default:
203 break;
204 }
205
206 /* Replication analysis only makes sense for ALU instructions */
207 if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
208 return false;
209
210 /* We only analyze 16-bit instructions for 16-bit replication. We could
211 * maybe do better.
212 */
213 if (bi_opcode_props[I->op].size != BI_SIZE_16)
214 return false;
215
216 bi_foreach_src(I, s) {
217 if (bi_is_null(I->src[s]))
218 continue;
219
220 /* Replicated swizzles */
221 if (bi_swizzle_replicates_16(I->src[s].swizzle))
222 continue;
223
224 /* Replicated values */
225 if (bi_is_ssa(I->src[s]) &&
226 BITSET_TEST(replicates_16, I->src[s].value))
227 continue;
228
229 /* Replicated constants */
230 if (I->src[s].type == BI_INDEX_CONSTANT &&
231 (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
232 continue;
233
234 return false;
235 }
236
237 return true;
238 }
239
240 void
bi_lower_swizzle(bi_context * ctx)241 bi_lower_swizzle(bi_context *ctx)
242 {
243 bi_foreach_instr_global_safe(ctx, ins) {
244 bi_foreach_src(ins, s) {
245 if (!bi_is_null(ins->src[s]))
246 bi_lower_swizzle_16(ctx, ins, s);
247 }
248 }
249
250 /* Now that we've lowered swizzles, clean up the mess */
251 BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
252
253 bi_foreach_instr_global(ctx, ins) {
254 if (bi_is_ssa(ins->dest[0]) && bi_instr_replicates(ins, replicates_16))
255 BITSET_SET(replicates_16, ins->dest[0].value);
256
257 if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
258 BITSET_TEST(replicates_16, ins->src[0].value)) {
259 ins->op = BI_OPCODE_MOV_I32;
260 ins->src[0].swizzle = BI_SWIZZLE_H01;
261 }
262
263 /* The above passes rely on replicating destinations. For
264 * Valhall, we will want to optimize this. For now, default
265 * to Bifrost compatible behaviour.
266 */
267 ins->dest[0].swizzle = BI_SWIZZLE_H01;
268 }
269
270 free(replicates_16);
271 }
272