1 /*
2 * Copyright 2008 Nicolai Haehnle.
3 * SPDX-License-Identifier: MIT
4 */
5
6 /**
7 * @file
8 *
9 * Shareable transformations that transform "special" ALU instructions
10 * into ALU instructions that are supported by hardware.
11 *
12 */
13
14 #include "radeon_program_alu.h"
15
16 #include "radeon_compiler.h"
17 #include "radeon_compiler_util.h"
18 #include "radeon_dataflow.h"
19
20 #include "util/log.h"
21
22 static struct rc_instruction *
emit1(struct radeon_compiler * c,struct rc_instruction * after,rc_opcode Opcode,struct rc_sub_instruction * base,struct rc_dst_register DstReg,struct rc_src_register SrcReg)23 emit1(struct radeon_compiler *c, struct rc_instruction *after, rc_opcode Opcode,
24 struct rc_sub_instruction *base, struct rc_dst_register DstReg, struct rc_src_register SrcReg)
25 {
26 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
27
28 if (base) {
29 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
30 }
31
32 fpi->U.I.Opcode = Opcode;
33 fpi->U.I.DstReg = DstReg;
34 fpi->U.I.SrcReg[0] = SrcReg;
35 return fpi;
36 }
37
38 static struct rc_instruction *
emit2(struct radeon_compiler * c,struct rc_instruction * after,rc_opcode Opcode,struct rc_sub_instruction * base,struct rc_dst_register DstReg,struct rc_src_register SrcReg0,struct rc_src_register SrcReg1)39 emit2(struct radeon_compiler *c, struct rc_instruction *after, rc_opcode Opcode,
40 struct rc_sub_instruction *base, struct rc_dst_register DstReg,
41 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
42 {
43 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
44
45 if (base) {
46 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
47 }
48
49 fpi->U.I.Opcode = Opcode;
50 fpi->U.I.DstReg = DstReg;
51 fpi->U.I.SrcReg[0] = SrcReg0;
52 fpi->U.I.SrcReg[1] = SrcReg1;
53 return fpi;
54 }
55
56 static struct rc_instruction *
emit3(struct radeon_compiler * c,struct rc_instruction * after,rc_opcode Opcode,struct rc_sub_instruction * base,struct rc_dst_register DstReg,struct rc_src_register SrcReg0,struct rc_src_register SrcReg1,struct rc_src_register SrcReg2)57 emit3(struct radeon_compiler *c, struct rc_instruction *after, rc_opcode Opcode,
58 struct rc_sub_instruction *base, struct rc_dst_register DstReg,
59 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
60 struct rc_src_register SrcReg2)
61 {
62 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
63
64 if (base) {
65 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
66 }
67
68 fpi->U.I.Opcode = Opcode;
69 fpi->U.I.DstReg = DstReg;
70 fpi->U.I.SrcReg[0] = SrcReg0;
71 fpi->U.I.SrcReg[1] = SrcReg1;
72 fpi->U.I.SrcReg[2] = SrcReg2;
73 return fpi;
74 }
75
76 static struct rc_dst_register
dstregtmpmask(int index,int mask)77 dstregtmpmask(int index, int mask)
78 {
79 struct rc_dst_register dst = {0, 0, 0};
80 dst.File = RC_FILE_TEMPORARY;
81 dst.Index = index;
82 dst.WriteMask = mask;
83 return dst;
84 }
85
86 static const struct rc_src_register builtin_one = {
87 .File = RC_FILE_NONE, .Index = 0, .Swizzle = RC_SWIZZLE_1111};
88
89 static const struct rc_src_register srcreg_undefined = {
90 .File = RC_FILE_NONE, .Index = 0, .Swizzle = RC_SWIZZLE_XYZW};
91
92 static struct rc_src_register
srcreg(int file,int index)93 srcreg(int file, int index)
94 {
95 struct rc_src_register src = srcreg_undefined;
96 src.File = file;
97 src.Index = index;
98 return src;
99 }
100
101 static struct rc_src_register
srcregswz(int file,int index,int swz)102 srcregswz(int file, int index, int swz)
103 {
104 struct rc_src_register src = srcreg_undefined;
105 src.File = file;
106 src.Index = index;
107 src.Swizzle = swz;
108 return src;
109 }
110
111 static struct rc_src_register
absolute(struct rc_src_register reg)112 absolute(struct rc_src_register reg)
113 {
114 struct rc_src_register newreg = reg;
115 newreg.Abs = 1;
116 newreg.Negate = RC_MASK_NONE;
117 return newreg;
118 }
119
120 static struct rc_src_register
negate(struct rc_src_register reg)121 negate(struct rc_src_register reg)
122 {
123 struct rc_src_register newreg = reg;
124 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
125 return newreg;
126 }
127
128 static struct rc_src_register
swizzle(struct rc_src_register reg,rc_swizzle x,rc_swizzle y,rc_swizzle z,rc_swizzle w)129 swizzle(struct rc_src_register reg, rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
130 {
131 struct rc_src_register swizzled = reg;
132 swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
133 return swizzled;
134 }
135
136 static struct rc_src_register
swizzle_smear(struct rc_src_register reg,rc_swizzle x)137 swizzle_smear(struct rc_src_register reg, rc_swizzle x)
138 {
139 return swizzle(reg, x, x, x, x);
140 }
141
142 static struct rc_src_register
swizzle_xxxx(struct rc_src_register reg)143 swizzle_xxxx(struct rc_src_register reg)
144 {
145 return swizzle_smear(reg, RC_SWIZZLE_X);
146 }
147
148 static struct rc_src_register
swizzle_yyyy(struct rc_src_register reg)149 swizzle_yyyy(struct rc_src_register reg)
150 {
151 return swizzle_smear(reg, RC_SWIZZLE_Y);
152 }
153
154 static struct rc_src_register
swizzle_zzzz(struct rc_src_register reg)155 swizzle_zzzz(struct rc_src_register reg)
156 {
157 return swizzle_smear(reg, RC_SWIZZLE_Z);
158 }
159
160 static struct rc_src_register
swizzle_wwww(struct rc_src_register reg)161 swizzle_wwww(struct rc_src_register reg)
162 {
163 return swizzle_smear(reg, RC_SWIZZLE_W);
164 }
165
166 static struct rc_dst_register
new_dst_reg(struct radeon_compiler * c,struct rc_instruction * inst)167 new_dst_reg(struct radeon_compiler *c, struct rc_instruction *inst)
168 {
169 unsigned tmp = rc_find_free_temporary(c);
170 return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
171 }
172
173 static void
transform_DP2(struct radeon_compiler * c,struct rc_instruction * inst)174 transform_DP2(struct radeon_compiler *c, struct rc_instruction *inst)
175 {
176 struct rc_src_register src0 = inst->U.I.SrcReg[0];
177 struct rc_src_register src1 = inst->U.I.SrcReg[1];
178 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
179 src0.Swizzle &= ~(63 << (3 * 2));
180 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
181 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
182 src1.Swizzle &= ~(63 << (3 * 2));
183 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
184 emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
185 rc_remove_instruction(inst);
186 }
187
188 static void
transform_RSQ(struct radeon_compiler * c,struct rc_instruction * inst)189 transform_RSQ(struct radeon_compiler *c, struct rc_instruction *inst)
190 {
191 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
192 }
193
194 static void
transform_KILP(struct radeon_compiler * c,struct rc_instruction * inst)195 transform_KILP(struct radeon_compiler *c, struct rc_instruction *inst)
196 {
197 inst->U.I.SrcReg[0] = negate(builtin_one);
198 inst->U.I.Opcode = RC_OPCODE_KIL;
199 }
200
201 /**
202 * Can be used as a transformation for @ref radeonClauseLocalTransform,
203 * no userData necessary.
204 *
205 * Transforms RSQ to Radeon's native RSQ by explicitly setting
206 * absolute value.
207 *
208 * @note should be applicable to R300 and R500 fragment programs.
209 */
210 int
radeonTransformALU(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)211 radeonTransformALU(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
212 {
213 switch (inst->U.I.Opcode) {
214 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
215 case RC_OPCODE_KILP: transform_KILP(c, inst); return 1;
216 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
217 case RC_OPCODE_SEQ: unreachable();
218 case RC_OPCODE_SGE: unreachable();
219 case RC_OPCODE_SLT: unreachable();
220 case RC_OPCODE_SNE: unreachable();
221 default: return 0;
222 }
223 }
224
225 static void
transform_r300_vertex_CMP(struct radeon_compiler * c,struct rc_instruction * inst)226 transform_r300_vertex_CMP(struct radeon_compiler *c, struct rc_instruction *inst)
227 {
228 /* R5xx has a CMP, but we can use it only if it reads from less than
229 * three different temps. */
230 if (c->is_r500 && !rc_inst_has_three_diff_temp_srcs(inst))
231 return;
232
233 unreachable();
234 }
235
236 static void
transform_r300_vertex_DP2(struct radeon_compiler * c,struct rc_instruction * inst)237 transform_r300_vertex_DP2(struct radeon_compiler *c, struct rc_instruction *inst)
238 {
239 struct rc_instruction *next_inst = inst->Next;
240 transform_DP2(c, inst);
241 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
242 }
243
244 static void
transform_r300_vertex_DP3(struct radeon_compiler * c,struct rc_instruction * inst)245 transform_r300_vertex_DP3(struct radeon_compiler *c, struct rc_instruction *inst)
246 {
247 struct rc_src_register src0 = inst->U.I.SrcReg[0];
248 struct rc_src_register src1 = inst->U.I.SrcReg[1];
249 src0.Negate &= ~RC_MASK_W;
250 src0.Swizzle &= ~(7 << (3 * 3));
251 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
252 src1.Negate &= ~RC_MASK_W;
253 src1.Swizzle &= ~(7 << (3 * 3));
254 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
255 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
256 rc_remove_instruction(inst);
257 }
258
259 static void
transform_r300_vertex_fix_LIT(struct radeon_compiler * c,struct rc_instruction * inst)260 transform_r300_vertex_fix_LIT(struct radeon_compiler *c, struct rc_instruction *inst)
261 {
262 struct rc_dst_register dst = new_dst_reg(c, inst);
263 unsigned constant_swizzle;
264 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants, 0.0000000000000000001,
265 &constant_swizzle);
266
267 /* MOV dst, src */
268 dst.WriteMask = RC_MASK_XYZW;
269 emit1(c, inst->Prev, RC_OPCODE_MOV, NULL, dst, inst->U.I.SrcReg[0]);
270
271 /* MAX dst.y, src, 0.00...001 */
272 emit2(c, inst->Prev, RC_OPCODE_MAX, NULL, dstregtmpmask(dst.Index, RC_MASK_Y),
273 srcreg(RC_FILE_TEMPORARY, dst.Index),
274 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
275
276 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
277 }
278
279 static void
transform_r300_vertex_SEQ(struct radeon_compiler * c,struct rc_instruction * inst)280 transform_r300_vertex_SEQ(struct radeon_compiler *c, struct rc_instruction *inst)
281 {
282 /* x = y <==> x >= y && y >= x */
283 /* x <= y */
284 struct rc_dst_register dst0 = new_dst_reg(c, inst);
285 emit2(c, inst->Prev, RC_OPCODE_SGE, NULL, dst0, inst->U.I.SrcReg[0], inst->U.I.SrcReg[1]);
286
287 /* y <= x */
288 int tmp = rc_find_free_temporary(c);
289 emit2(c, inst->Prev, RC_OPCODE_SGE, NULL, dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
290 inst->U.I.SrcReg[1], inst->U.I.SrcReg[0]);
291
292 /* x && y = x * y */
293 emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, inst->U.I.DstReg, srcreg(dst0.File, dst0.Index),
294 srcreg(RC_FILE_TEMPORARY, tmp));
295
296 rc_remove_instruction(inst);
297 }
298
299 static void
transform_r300_vertex_SNE(struct radeon_compiler * c,struct rc_instruction * inst)300 transform_r300_vertex_SNE(struct radeon_compiler *c, struct rc_instruction *inst)
301 {
302 /* x != y <==> x < y || y < x */
303 /* x < y */
304 struct rc_dst_register dst0 = new_dst_reg(c, inst);
305 emit2(c, inst->Prev, RC_OPCODE_SLT, NULL, dst0, inst->U.I.SrcReg[0], inst->U.I.SrcReg[1]);
306
307 /* y < x */
308 int tmp = rc_find_free_temporary(c);
309 emit2(c, inst->Prev, RC_OPCODE_SLT, NULL, dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
310 inst->U.I.SrcReg[1], inst->U.I.SrcReg[0]);
311
312 /* x || y = max(x, y) */
313 emit2(c, inst->Prev, RC_OPCODE_MAX, NULL, inst->U.I.DstReg, srcreg(dst0.File, dst0.Index),
314 srcreg(RC_FILE_TEMPORARY, tmp));
315
316 rc_remove_instruction(inst);
317 }
318
319 /**
320 * For use with rc_local_transform, this transforms non-native ALU
321 * instructions of the r300 up to r500 vertex engine.
322 */
323 int
r300_transform_vertex_alu(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)324 r300_transform_vertex_alu(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
325 {
326 switch (inst->U.I.Opcode) {
327 case RC_OPCODE_CMP:
328 transform_r300_vertex_CMP(c, inst);
329 return 1;
330 case RC_OPCODE_DP2:
331 transform_r300_vertex_DP2(c, inst);
332 return 1;
333 case RC_OPCODE_DP3:
334 transform_r300_vertex_DP3(c, inst);
335 return 1;
336 case RC_OPCODE_LIT:
337 transform_r300_vertex_fix_LIT(c, inst);
338 return 1;
339 case RC_OPCODE_SEQ:
340 if (!c->is_r500) {
341 transform_r300_vertex_SEQ(c, inst);
342 return 1;
343 }
344 return 0;
345 case RC_OPCODE_SNE:
346 if (!c->is_r500) {
347 transform_r300_vertex_SNE(c, inst);
348 return 1;
349 }
350 return 0;
351 default:
352 return 0;
353 }
354 }
355
356 /**
357 * Replaces DDX/DDY instructions with MOV 0 to avoid using dummy shaders on r300/r400.
358 *
359 * @warning This explicitly changes the form of DDX and DDY!
360 */
361
362 int
radeonStubDeriv(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)363 radeonStubDeriv(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
364 {
365 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
366 return 0;
367
368 inst->U.I.Opcode = RC_OPCODE_MOV;
369 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
370
371 mesa_logw_once("r300: WARNING: Shader is trying to use derivatives, "
372 "but the hardware doesn't support it. "
373 "Expect possible misrendering (it's not a bug, do not report it).");
374
375 return 1;
376 }
377
378 /**
379 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
380 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
381 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
382 *
383 * @warning This explicitly changes the form of DDX and DDY!
384 */
385
386 int
radeonTransformDeriv(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)387 radeonTransformDeriv(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
388 {
389 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
390 return 0;
391
392 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
393 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
394
395 return 1;
396 }
397
398 int
rc_force_output_alpha_to_one(struct radeon_compiler * c,struct rc_instruction * inst,void * data)399 rc_force_output_alpha_to_one(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
400 {
401 struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler *)c;
402 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
403 unsigned tmp;
404
405 if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||
406 inst->U.I.DstReg.Index == fragc->OutputDepth)
407 return 1;
408
409 tmp = rc_find_free_temporary(c);
410
411 /* Insert MOV after inst, set alpha to 1. */
412 emit1(c, inst, RC_OPCODE_MOV, NULL, inst->U.I.DstReg,
413 srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));
414
415 /* Re-route the destination of inst to the source of mov. */
416 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
417 inst->U.I.DstReg.Index = tmp;
418
419 /* Move the saturate output modifier to the MOV instruction
420 * (for better copy propagation). */
421 inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;
422 inst->U.I.SaturateMode = RC_SATURATE_NONE;
423 return 1;
424 }
425