• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compiler.h"
25 #include "bi_builder.h"
26 
27 /* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
28  * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
29  * away swizzles that cannot be represented. In the future, we should try to
30  * recombine swizzles where we can as an optimization.
31  */
32 
33 static void
bi_lower_swizzle_16(bi_context * ctx,bi_instr * ins,unsigned src)34 bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
35 {
36         /* Identity is ok */
37         if (ins->src[src].swizzle == BI_SWIZZLE_H01)
38                 return;
39 
40         /* TODO: Use the opcode table and be a lot more methodical about this... */
41         switch (ins->op) {
42         /* Some instructions used with 16-bit data never have swizzles */
43         case BI_OPCODE_CSEL_V2F16:
44         case BI_OPCODE_CSEL_V2I16:
45         case BI_OPCODE_CSEL_V2S16:
46         case BI_OPCODE_CSEL_V2U16:
47 
48         /* Despite ostensibly being 32-bit instructions, CLPER does not
49          * inherently interpret the data, so it can be used for v2f16
50          * derivatives, which might require swizzle lowering */
51         case BI_OPCODE_CLPER_I32:
52         case BI_OPCODE_CLPER_OLD_I32:
53 
54         /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
55          * boolean is implemented as a 16-bit integer, the swizzle is needed
56          * for correct operation if the instruction producing the 16-bit
57          * boolean does not replicate to both halves of the containing 32-bit
58          * register. As such, we may need to lower a swizzle.
59          *
60          * This is a silly hack. Ideally, code gen would be smart enough to
61          * avoid this case (by replicating). In practice, silly hardware design
62          * decisions force our hand here.
63          */
64         case BI_OPCODE_MUX_I32:
65         case BI_OPCODE_CSEL_I32:
66             break;
67 
68         case BI_OPCODE_IADD_V2S16:
69         case BI_OPCODE_IADD_V2U16:
70         case BI_OPCODE_ISUB_V2S16:
71         case BI_OPCODE_ISUB_V2U16:
72             if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
73                     break;
74             else
75                     return;
76         case BI_OPCODE_LSHIFT_AND_V2I16:
77         case BI_OPCODE_LSHIFT_OR_V2I16:
78         case BI_OPCODE_LSHIFT_XOR_V2I16:
79         case BI_OPCODE_RSHIFT_AND_V2I16:
80         case BI_OPCODE_RSHIFT_OR_V2I16:
81         case BI_OPCODE_RSHIFT_XOR_V2I16:
82             if (src == 2)
83                     return;
84             else
85                     break;
86 
87         /* For some reason MUX.v2i16 allows swaps but not replication */
88         case BI_OPCODE_MUX_V2I16:
89                 if (ins->src[src].swizzle == BI_SWIZZLE_H10)
90                         return;
91                 else
92                         break;
93 
94         /* We don't want to deal with reswizzling logic in modifier prop. Move
95          * the swizzle outside, it's easier for clamp propagation. */
96         case BI_OPCODE_FCLAMP_V2F16:
97         {
98                 bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
99                 bi_index dest = ins->dest[0];
100                 bi_index tmp = bi_temp(ctx);
101 
102                 ins->dest[0] = tmp;
103                 bi_swz_v2i16_to(&b, dest, bi_replace_index(ins->src[0], tmp));
104                 return;
105         }
106 
107         default:
108             return;
109         }
110 
111         /* First, try to apply a given swizzle to a constant to clear the
112          * runtime swizzle. This is less heavy-handed than ignoring the
113          * swizzle for scalar destinations, since it maintains
114          * replication of the destination.
115          */
116         if (ins->src[src].type == BI_INDEX_CONSTANT) {
117                 ins->src[src].value = bi_apply_swizzle(ins->src[src].value,
118                                                        ins->src[src].swizzle);
119                 ins->src[src].swizzle = BI_SWIZZLE_H01;
120                 return;
121         }
122 
123         /* Even if the source does not replicate, if the consuming instruction
124          * produces a 16-bit scalar, we can ignore the other component.
125          */
126         if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
127                         ins->src[src].swizzle == BI_SWIZZLE_H00)
128         {
129                 ins->src[src].swizzle = BI_SWIZZLE_H01;
130                 return;
131         }
132 
133         /* Lower it away */
134         bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
135         ins->src[src] = bi_replace_index(ins->src[src],
136                         bi_swz_v2i16(&b, ins->src[src]));
137         ins->src[src].swizzle = BI_SWIZZLE_H01;
138 }
139 
140 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)141 bi_swizzle_replicates_8(enum bi_swizzle swz)
142 {
143         switch (swz) {
144         case BI_SWIZZLE_B0000:
145         case BI_SWIZZLE_B1111:
146         case BI_SWIZZLE_B2222:
147         case BI_SWIZZLE_B3333:
148                 return true;
149         default:
150                 return false;
151         }
152 }
153 
154 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)155 bi_swizzle_replicates_16(enum bi_swizzle swz)
156 {
157         switch (swz) {
158         case BI_SWIZZLE_H00:
159         case BI_SWIZZLE_H11:
160                 return true;
161         default:
162                 /* If a swizzle replicates every 8-bits, it also replicates
163                  * every 16-bits, so allow 8-bit replicating swizzles.
164                  */
165                 return bi_swizzle_replicates_8(swz);
166         }
167 }
168 
169 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)170 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
171 {
172         switch (I->op) {
173 
174         /* Instructions that construct vectors have replicated output if their
175          * sources are identical. Check this case first.
176          */
177         case BI_OPCODE_MKVEC_V2I16:
178         case BI_OPCODE_V2F16_TO_V2S16:
179         case BI_OPCODE_V2F16_TO_V2U16:
180         case BI_OPCODE_V2F32_TO_V2F16:
181         case BI_OPCODE_V2S16_TO_V2F16:
182         case BI_OPCODE_V2S8_TO_V2F16:
183         case BI_OPCODE_V2S8_TO_V2S16:
184         case BI_OPCODE_V2U16_TO_V2F16:
185         case BI_OPCODE_V2U8_TO_V2F16:
186         case BI_OPCODE_V2U8_TO_V2U16:
187                 return bi_is_value_equiv(I->src[0], I->src[1]);
188 
189         /* 16-bit transcendentals are defined to output zero in their
190          * upper half, so they do not replicate
191          */
192         case BI_OPCODE_FRCP_F16:
193         case BI_OPCODE_FRSQ_F16:
194                 return false;
195 
196         /* Not sure, be conservative, we don't use these.. */
197         case BI_OPCODE_VN_ASST1_F16:
198         case BI_OPCODE_FPCLASS_F16:
199         case BI_OPCODE_FPOW_SC_DET_F16:
200                 return false;
201 
202         default:
203                 break;
204         }
205 
206         /* Replication analysis only makes sense for ALU instructions */
207         if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
208                 return false;
209 
210         /* We only analyze 16-bit instructions for 16-bit replication. We could
211          * maybe do better.
212          */
213         if (bi_opcode_props[I->op].size != BI_SIZE_16)
214                 return false;
215 
216         bi_foreach_src(I, s) {
217                 if (bi_is_null(I->src[s]))
218                         continue;
219 
220                 /* Replicated swizzles */
221                 if (bi_swizzle_replicates_16(I->src[s].swizzle))
222                         continue;
223 
224                 /* Replicated values */
225                 if (bi_is_ssa(I->src[s]) &&
226                     BITSET_TEST(replicates_16, I->src[s].value))
227                         continue;
228 
229                 /* Replicated constants */
230                 if (I->src[s].type == BI_INDEX_CONSTANT &&
231                     (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
232                         continue;
233 
234                 return false;
235         }
236 
237         return true;
238 }
239 
240 void
bi_lower_swizzle(bi_context * ctx)241 bi_lower_swizzle(bi_context *ctx)
242 {
243         bi_foreach_instr_global_safe(ctx, ins) {
244                 bi_foreach_src(ins, s) {
245                         if (!bi_is_null(ins->src[s]))
246                                 bi_lower_swizzle_16(ctx, ins, s);
247                 }
248         }
249 
250         /* Now that we've lowered swizzles, clean up the mess */
251         BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
252 
253         bi_foreach_instr_global(ctx, ins) {
254                 if (bi_is_ssa(ins->dest[0]) && bi_instr_replicates(ins, replicates_16))
255                         BITSET_SET(replicates_16, ins->dest[0].value);
256 
257                 if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
258                     BITSET_TEST(replicates_16, ins->src[0].value)) {
259                         ins->op = BI_OPCODE_MOV_I32;
260                         ins->src[0].swizzle = BI_SWIZZLE_H01;
261                 }
262 
263                 /* The above passes rely on replicating destinations.  For
264                  * Valhall, we will want to optimize this. For now, default
265                  * to Bifrost compatible behaviour.
266                  */
267                 ins->dest[0].swizzle = BI_SWIZZLE_H01;
268         }
269 
270         free(replicates_16);
271 }
272