• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
23 
24 #include "bi_builder.h"
25 #include "compiler.h"
26 
/* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
 * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
 * away swizzles that cannot be represented. In the future, we should try to
 * recombine swizzles where we can as an optimization.
 */
32 
33 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)34 bi_swizzle_replicates_8(enum bi_swizzle swz)
35 {
36    switch (swz) {
37    case BI_SWIZZLE_B0000:
38    case BI_SWIZZLE_B1111:
39    case BI_SWIZZLE_B2222:
40    case BI_SWIZZLE_B3333:
41       return true;
42    default:
43       return false;
44    }
45 }
46 
47 static void
lower_swizzle(bi_context * ctx,bi_instr * ins,unsigned src)48 lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
49 {
50    /* TODO: Use the opcode table and be a lot more methodical about this... */
51    switch (ins->op) {
52    /* Some instructions used with 16-bit data never have swizzles */
53    case BI_OPCODE_CSEL_V2F16:
54    case BI_OPCODE_CSEL_V2I16:
55    case BI_OPCODE_CSEL_V2S16:
56    case BI_OPCODE_CSEL_V2U16:
57       break;
58 
59    /* Despite ostensibly being 32-bit instructions, CLPER does not
60     * inherently interpret the data, so it can be used for v2f16
61     * derivatives, which might require swizzle lowering */
62    case BI_OPCODE_CLPER_I32:
63    case BI_OPCODE_CLPER_OLD_I32:
64       if (src == 0)
65          break;
66       else
67          return;
68 
69    /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
70     * boolean is implemented as a 16-bit integer, the swizzle is needed
71     * for correct operation if the instruction producing the 16-bit
72     * boolean does not replicate to both halves of the containing 32-bit
73     * register. As such, we may need to lower a swizzle.
74     *
75     * This is a silly hack. Ideally, code gen would be smart enough to
76     * avoid this case (by replicating). In practice, silly hardware design
77     * decisions force our hand here.
78     */
79    case BI_OPCODE_MUX_I32:
80    case BI_OPCODE_CSEL_I32:
81       break;
82 
83    case BI_OPCODE_IADD_V2S16:
84    case BI_OPCODE_IADD_V2U16:
85    case BI_OPCODE_ISUB_V2S16:
86    case BI_OPCODE_ISUB_V2U16:
87       if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
88          break;
89       else
90          return;
91    case BI_OPCODE_LSHIFT_AND_V2I16:
92    case BI_OPCODE_LSHIFT_OR_V2I16:
93    case BI_OPCODE_LSHIFT_XOR_V2I16:
94    case BI_OPCODE_RSHIFT_AND_V2I16:
95    case BI_OPCODE_RSHIFT_OR_V2I16:
96    case BI_OPCODE_RSHIFT_XOR_V2I16:
97       if (src == 2)
98          return;
99       else
100          break;
101 
102    /* For some reason MUX.v2i16 allows swaps but not replication */
103    case BI_OPCODE_MUX_V2I16:
104       if (ins->src[src].swizzle == BI_SWIZZLE_H10)
105          return;
106       else
107          break;
108 
109    /* No swizzles supported */
110    case BI_OPCODE_HADD_V4U8:
111    case BI_OPCODE_HADD_V4S8:
112    case BI_OPCODE_CLZ_V4U8:
113    case BI_OPCODE_IDP_V4I8:
114    case BI_OPCODE_IABS_V4S8:
115    case BI_OPCODE_ICMP_V4I8:
116    case BI_OPCODE_ICMP_V4U8:
117    case BI_OPCODE_MUX_V4I8:
118    case BI_OPCODE_IADD_IMM_V4I8:
119       break;
120 
121    case BI_OPCODE_LSHIFT_AND_V4I8:
122    case BI_OPCODE_LSHIFT_OR_V4I8:
123    case BI_OPCODE_LSHIFT_XOR_V4I8:
124    case BI_OPCODE_RSHIFT_AND_V4I8:
125    case BI_OPCODE_RSHIFT_OR_V4I8:
126    case BI_OPCODE_RSHIFT_XOR_V4I8:
127       /* Last source allows identity or replication */
128       if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
129          return;
130 
131       /* Others do not allow swizzles */
132       break;
133 
134    /* We don't want to deal with reswizzling logic in modifier prop. Move
135     * the swizzle outside, it's easier for clamp propagation. */
136    case BI_OPCODE_FCLAMP_V2F16: {
137       bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
138       bi_index dest = ins->dest[0];
139       bi_index tmp = bi_temp(ctx);
140 
141       bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
142       ins->src[0].swizzle = BI_SWIZZLE_H01;
143       ins->dest[0] = tmp;
144       bi_swz_v2i16_to(&b, dest, swizzled_src);
145       return;
146    }
147 
148    default:
149       return;
150    }
151 
152    /* First, try to apply a given swizzle to a constant to clear the
153     * runtime swizzle. This is less heavy-handed than ignoring the
154     * swizzle for scalar destinations, since it maintains
155     * replication of the destination.
156     */
157    if (ins->src[src].type == BI_INDEX_CONSTANT) {
158       ins->src[src].value =
159          bi_apply_swizzle(ins->src[src].value, ins->src[src].swizzle);
160       ins->src[src].swizzle = BI_SWIZZLE_H01;
161       return;
162    }
163 
164    /* Even if the source does not replicate, if the consuming instruction
165     * produces a 16-bit scalar, we can ignore the other component.
166     */
167    if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
168        ins->src[src].swizzle == BI_SWIZZLE_H00) {
169       ins->src[src].swizzle = BI_SWIZZLE_H01;
170       return;
171    }
172 
173    /* Lower it away */
174    bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
175 
176    bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8) ||
177                (bi_opcode_props[ins->op].size == BI_SIZE_32 &&
178                 ins->src[src].swizzle >= BI_SWIZZLE_B0000);
179 
180    bi_index orig = ins->src[src];
181    bi_index stripped = bi_replace_index(bi_null(), orig);
182    stripped.swizzle = ins->src[src].swizzle;
183 
184    bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
185 
186    bi_replace_src(ins, src, swz);
187    ins->src[src].swizzle = BI_SWIZZLE_H01;
188 }
189 
190 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)191 bi_swizzle_replicates_16(enum bi_swizzle swz)
192 {
193    switch (swz) {
194    case BI_SWIZZLE_H00:
195    case BI_SWIZZLE_H11:
196       return true;
197    default:
198       /* If a swizzle replicates every 8-bits, it also replicates
199        * every 16-bits, so allow 8-bit replicating swizzles.
200        */
201       return bi_swizzle_replicates_8(swz);
202    }
203 }
204 
205 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)206 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
207 {
208    switch (I->op) {
209 
210    /* Instructions that construct vectors have replicated output if their
211     * sources are identical. Check this case first.
212     */
213    case BI_OPCODE_MKVEC_V2I16:
214    case BI_OPCODE_V2F16_TO_V2S16:
215    case BI_OPCODE_V2F16_TO_V2U16:
216    case BI_OPCODE_V2F32_TO_V2F16:
217    case BI_OPCODE_V2S16_TO_V2F16:
218    case BI_OPCODE_V2S8_TO_V2F16:
219    case BI_OPCODE_V2S8_TO_V2S16:
220    case BI_OPCODE_V2U16_TO_V2F16:
221    case BI_OPCODE_V2U8_TO_V2F16:
222    case BI_OPCODE_V2U8_TO_V2U16:
223       return bi_is_value_equiv(I->src[0], I->src[1]);
224 
225    /* 16-bit transcendentals are defined to output zero in their
226     * upper half, so they do not replicate
227     */
228    case BI_OPCODE_FRCP_F16:
229    case BI_OPCODE_FRSQ_F16:
230       return false;
231 
232    /* Not sure, be conservative, we don't use these.. */
233    case BI_OPCODE_VN_ASST1_F16:
234    case BI_OPCODE_FPCLASS_F16:
235    case BI_OPCODE_FPOW_SC_DET_F16:
236       return false;
237 
238    default:
239       break;
240    }
241 
242    /* Replication analysis only makes sense for ALU instructions */
243    if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
244       return false;
245 
246    /* We only analyze 16-bit instructions for 16-bit replication. We could
247     * maybe do better.
248     */
249    if (bi_opcode_props[I->op].size != BI_SIZE_16)
250       return false;
251 
252    bi_foreach_src(I, s) {
253       if (bi_is_null(I->src[s]))
254          continue;
255 
256       /* Replicated swizzles */
257       if (bi_swizzle_replicates_16(I->src[s].swizzle))
258          continue;
259 
260       /* Replicated values */
261       if (bi_is_ssa(I->src[s]) && BITSET_TEST(replicates_16, I->src[s].value))
262          continue;
263 
264       /* Replicated constants */
265       if (I->src[s].type == BI_INDEX_CONSTANT &&
266           (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
267          continue;
268 
269       return false;
270    }
271 
272    return true;
273 }
274 
275 void
bi_lower_swizzle(bi_context * ctx)276 bi_lower_swizzle(bi_context *ctx)
277 {
278    bi_foreach_instr_global_safe(ctx, ins) {
279       bi_foreach_src(ins, s) {
280          if (bi_is_null(ins->src[s]))
281             continue;
282          if (ins->src[s].swizzle == BI_SWIZZLE_H01)
283             continue;
284 
285          lower_swizzle(ctx, ins, s);
286       }
287    }
288 
289    /* Now that we've lowered swizzles, clean up the mess */
290    BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
291 
292    bi_foreach_instr_global(ctx, ins) {
293       if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
294          BITSET_SET(replicates_16, ins->dest[0].value);
295 
296       if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
297           BITSET_TEST(replicates_16, ins->src[0].value)) {
298          ins->op = BI_OPCODE_MOV_I32;
299          ins->src[0].swizzle = BI_SWIZZLE_H01;
300       }
301 
302       /* The above passes rely on replicating destinations.  For
303        * Valhall, we will want to optimize this. For now, default
304        * to Bifrost compatible behaviour.
305        */
306       if (ins->nr_dests)
307          ins->dest[0].swizzle = BI_SWIZZLE_H01;
308    }
309 
310    free(replicates_16);
311 }
312