1 /*
2 * Copyright 2009 Nicolai Haehnle.
3 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "util/u_math.h"
8
9 #include "radeon_dataflow.h"
10
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16
/* Context handed to src_clobbered_reads_cb(): describes one register write
 * (file, index, channel mask) and carries the reader bookkeeping that gets
 * flagged when that write touches a source of the instruction being
 * propagated. */
struct src_clobbered_reads_cb_data {
   rc_register_file File;             /* register file of the write */
   unsigned int Index;                /* register index of the write */
   unsigned int Mask;                 /* channels touched by the write */
   struct rc_reader_data *ReaderData; /* reader scan state updated on a hit */
};
23
/* Callback used by presub_helper() to rewrite a single source of a reader.
 * Arguments, in order: the instruction being turned into a presubtract
 * operation, the reader instruction, and the index of the reader's source
 * register that has to be rewritten. */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *, struct rc_instruction *,
                                     unsigned int);
26
27 static struct rc_src_register
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30 struct rc_src_register combine;
31 combine.File = inner.File;
32 combine.Index = inner.Index;
33 combine.RelAddr = inner.RelAddr;
34 if (outer.Abs) {
35 combine.Abs = 1;
36 combine.Negate = outer.Negate;
37 } else {
38 combine.Abs = inner.Abs;
39 combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40 combine.Negate ^= outer.Negate;
41 }
42 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43 return combine;
44 }
45
46 static void
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)47 copy_propagate_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
48 {
49 rc_register_file file = src->File;
50 struct rc_reader_data *reader_data = data;
51
52 if (!rc_inst_can_use_presub(reader_data->C, inst, reader_data->Writer->U.I.PreSub.Opcode,
53 rc_swizzle_to_writemask(src->Swizzle), src,
54 &reader_data->Writer->U.I.PreSub.SrcReg[0],
55 &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
56 reader_data->Abort = 1;
57 return;
58 }
59
60 /* XXX This could probably be handled better. */
61 if (file == RC_FILE_ADDRESS) {
62 reader_data->Abort = 1;
63 return;
64 }
65
66 /* R300/R400 is unhappy about propagating
67 * 0: MOV temp[1], -none.1111;
68 * 1: KIL temp[1];
69 * to
70 * 0: KIL -none.1111;
71 *
72 * R500 is fine with it.
73 */
74 if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
75 reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
76 reader_data->Abort = 1;
77 return;
78 }
79
80 /* These instructions cannot read from the constants file.
81 * see radeonTransformTEX()
82 */
83 if (reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
84 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
85 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
86 (inst->U.I.Opcode == RC_OPCODE_TEX || inst->U.I.Opcode == RC_OPCODE_TXB ||
87 inst->U.I.Opcode == RC_OPCODE_TXP || inst->U.I.Opcode == RC_OPCODE_TXD ||
88 inst->U.I.Opcode == RC_OPCODE_TXL || inst->U.I.Opcode == RC_OPCODE_KIL)) {
89 reader_data->Abort = 1;
90 return;
91 }
92 }
93
94 static void
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)95 src_clobbered_reads_cb(void *data, struct rc_instruction *inst, struct rc_src_register *src)
96 {
97 struct src_clobbered_reads_cb_data *sc_data = data;
98
99 if (src->File == sc_data->File && src->Index == sc_data->Index &&
100 (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
101
102 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
103 }
104
105 if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
106 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
107 }
108 }
109
110 static void
is_src_clobbered_scan_write(void * data,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)111 is_src_clobbered_scan_write(void *data, struct rc_instruction *inst, rc_register_file file,
112 unsigned int index, unsigned int mask)
113 {
114 struct src_clobbered_reads_cb_data sc_data;
115 struct rc_reader_data *reader_data = data;
116 sc_data.File = file;
117 sc_data.Index = index;
118 sc_data.Mask = mask;
119 sc_data.ReaderData = reader_data;
120 rc_for_all_reads_src(reader_data->Writer, src_clobbered_reads_cb, &sc_data);
121 }
122
123 static void
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)124 copy_propagate(struct radeon_compiler *c, struct rc_instruction *inst_mov)
125 {
126 struct rc_reader_data reader_data;
127 unsigned int i;
128
129 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
130 return;
131
132 /* Get a list of all the readers of this MOV instruction. */
133 reader_data.ExitOnAbort = 1;
134 rc_get_readers(c, inst_mov, &reader_data, copy_propagate_scan_read, NULL,
135 is_src_clobbered_scan_write);
136
137 if (reader_data.Abort || reader_data.ReaderCount == 0)
138 return;
139
140 /* We can propagate SaturateMode if all the readers are MOV instructions
141 * without a presubtract operation, source negation and absolute.
142 * In that case, we just move SaturateMode to all readers. */
143 if (inst_mov->U.I.SaturateMode) {
144 for (i = 0; i < reader_data.ReaderCount; i++) {
145 struct rc_instruction *inst = reader_data.Readers[i].Inst;
146
147 if (inst->U.I.Opcode != RC_OPCODE_MOV || inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
148 inst->U.I.SrcReg[0].Abs || inst->U.I.SrcReg[0].Negate) {
149 return;
150 }
151 }
152 }
153
154 /* Propagate the MOV instruction. */
155 for (i = 0; i < reader_data.ReaderCount; i++) {
156 struct rc_instruction *inst = reader_data.Readers[i].Inst;
157 *reader_data.Readers[i].U.I.Src =
158 chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
159
160 if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
161 inst->U.I.PreSub = inst_mov->U.I.PreSub;
162 if (!inst->U.I.SaturateMode)
163 inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
164 }
165
166 /* Finally, remove the original MOV instruction */
167 rc_remove_instruction(inst_mov);
168 }
169
170 /**
171 * Check if a source register is actually always the same
172 * swizzle constant.
173 */
174 static int
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)175 is_src_uniform_constant(struct rc_src_register src, rc_swizzle *pswz, unsigned int *pnegate)
176 {
177 int have_used = 0;
178
179 if (src.File != RC_FILE_NONE) {
180 *pswz = 0;
181 return 0;
182 }
183
184 for (unsigned int chan = 0; chan < 4; ++chan) {
185 unsigned int swz = GET_SWZ(src.Swizzle, chan);
186 if (swz < 4) {
187 *pswz = 0;
188 return 0;
189 }
190 if (swz == RC_SWIZZLE_UNUSED)
191 continue;
192
193 if (!have_used) {
194 *pswz = swz;
195 *pnegate = GET_BIT(src.Negate, chan);
196 have_used = 1;
197 } else {
198 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
199 *pswz = 0;
200 return 0;
201 }
202 }
203 }
204
205 return 1;
206 }
207
208 /**
209 * Replace 0.0, 1.0 and 0.5 immediate constants by their
210 * respective swizzles. Simplify instructions like ADD dst, src, 0;
211 */
212 static void
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)213 constant_folding(struct radeon_compiler *c, struct rc_instruction *inst)
214 {
215 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
216 unsigned int i;
217
218 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
219 for (unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
220 struct rc_constant *constant;
221 struct rc_src_register newsrc;
222 int have_real_reference;
223 unsigned int chan;
224
225 /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
226 for (chan = 0; chan < 4; ++chan)
227 if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
228 break;
229 if (chan == 4) {
230 inst->U.I.SrcReg[src].File = RC_FILE_NONE;
231 continue;
232 }
233
234 /* Convert immediates to swizzles. */
235 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT || inst->U.I.SrcReg[src].RelAddr ||
236 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
237 continue;
238
239 constant = &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
240
241 if (constant->Type != RC_CONSTANT_IMMEDIATE)
242 continue;
243
244 newsrc = inst->U.I.SrcReg[src];
245 have_real_reference = 0;
246 for (chan = 0; chan < 4; ++chan) {
247 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
248 unsigned int newswz;
249 float imm;
250 float baseimm;
251
252 if (swz >= 4)
253 continue;
254
255 imm = constant->u.Immediate[swz];
256 baseimm = imm;
257 if (imm < 0.0)
258 baseimm = -baseimm;
259
260 if (baseimm == 0.0) {
261 newswz = RC_SWIZZLE_ZERO;
262 } else if (baseimm == 1.0) {
263 newswz = RC_SWIZZLE_ONE;
264 } else if (baseimm == 0.5 && c->has_half_swizzles) {
265 newswz = RC_SWIZZLE_HALF;
266 } else {
267 have_real_reference = 1;
268 continue;
269 }
270
271 SET_SWZ(newsrc.Swizzle, chan, newswz);
272 if (imm < 0.0 && !newsrc.Abs)
273 newsrc.Negate ^= 1 << chan;
274 }
275
276 if (!have_real_reference) {
277 newsrc.File = RC_FILE_NONE;
278 newsrc.Index = 0;
279 }
280
281 /* don't make the swizzle worse */
282 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
283 continue;
284
285 inst->U.I.SrcReg[src] = newsrc;
286 }
287
288 /* In case this instruction has been converted, make sure all of the
289 * registers that are no longer used are empty. */
290 opcode = rc_get_opcode_info(inst->U.I.Opcode);
291 for (i = opcode->NumSrcRegs; i < 3; i++) {
292 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
293 }
294 }
295
296 /**
297 * If src and dst use the same register, this function returns a writemask that
298 * indicates which components are read by src. Otherwise zero is returned.
299 */
300 static unsigned int
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)301 src_reads_dst_mask(struct rc_src_register src, struct rc_dst_register dst)
302 {
303 if (dst.File != src.File || dst.Index != src.Index) {
304 return 0;
305 }
306 return rc_swizzle_to_writemask(src.Swizzle);
307 }
308
309 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
310 * in any of its channels. Return 0 otherwise. */
311 static int
src_has_const_swz(struct rc_src_register src)312 src_has_const_swz(struct rc_src_register src)
313 {
314 int chan;
315 for (chan = 0; chan < 4; chan++) {
316 unsigned int swz = GET_SWZ(src.Swizzle, chan);
317 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF || swz == RC_SWIZZLE_ONE) {
318 return 1;
319 }
320 }
321 return 0;
322 }
323
324 static void
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)325 presub_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
326 {
327 struct rc_reader_data *reader_data = data;
328 rc_presubtract_op *presub_opcode = reader_data->CbData;
329
330 if (!rc_inst_can_use_presub(
331 reader_data->C, inst, *presub_opcode, reader_data->Writer->U.I.DstReg.WriteMask, src,
332 &reader_data->Writer->U.I.SrcReg[0], &reader_data->Writer->U.I.SrcReg[1])) {
333 reader_data->Abort = 1;
334 return;
335 }
336 }
337
338 static int
presub_helper(struct radeon_compiler * c,struct rc_instruction * inst_add,rc_presubtract_op presub_opcode,rc_presub_replace_fn presub_replace)339 presub_helper(struct radeon_compiler *c, struct rc_instruction *inst_add,
340 rc_presubtract_op presub_opcode, rc_presub_replace_fn presub_replace)
341 {
342 struct rc_reader_data reader_data;
343 unsigned int i;
344 rc_presubtract_op cb_op = presub_opcode;
345
346 reader_data.CbData = &cb_op;
347 reader_data.ExitOnAbort = 1;
348 rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL, is_src_clobbered_scan_write);
349
350 if (reader_data.Abort || reader_data.ReaderCount == 0)
351 return 0;
352
353 for (i = 0; i < reader_data.ReaderCount; i++) {
354 unsigned int src_index;
355 struct rc_reader reader = reader_data.Readers[i];
356 const struct rc_opcode_info *info = rc_get_opcode_info(reader.Inst->U.I.Opcode);
357
358 for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
359 if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
360 presub_replace(inst_add, reader.Inst, src_index);
361 }
362 }
363 return 1;
364 }
365
366 static void
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)367 presub_replace_add(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
368 unsigned int src_index)
369 {
370 rc_presubtract_op presub_opcode;
371
372 unsigned int negates = 0;
373 if (inst_add->U.I.SrcReg[0].Negate)
374 negates++;
375 if (inst_add->U.I.SrcReg[1].Negate)
376 negates++;
377 assert(negates != 2 ||
378 ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) ==
379 (inst_add->U.I.SrcReg[0].Negate & inst_add->U.I.DstReg.WriteMask)));
380
381 if (negates == 1)
382 presub_opcode = RC_PRESUB_SUB;
383 else
384 presub_opcode = RC_PRESUB_ADD;
385
386 if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
387 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
388 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
389 } else {
390 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
391 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
392 }
393 /* If both sources are negative we can move the negate to the presub. */
394 unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
395 inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
396 inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
397 inst_reader->U.I.PreSub.Opcode = presub_opcode;
398 inst_reader->U.I.SrcReg[src_index] =
399 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
400 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
401 inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
402 }
403
404 static int
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)405 is_presub_candidate(struct radeon_compiler *c, struct rc_instruction *inst)
406 {
407 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
408 unsigned int i;
409 unsigned int is_constant[2] = {0, 0};
410
411 assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
412
413 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode ||
414 inst->U.I.WriteALUResult || inst->U.I.Omod) {
415 return 0;
416 }
417
418 /* If first two sources use a constant swizzle, then we can't convert it to
419 * a presubtract operation. In fact for the ADD and SUB presubtract
420 * operations neither source can contain a constant swizzle. This
421 * specific case is checked in peephole_add_presub_add() when
422 * we make sure the swizzles for both sources are equal, so we
423 * don't need to worry about it here. */
424 for (i = 0; i < 2; i++) {
425 int chan;
426 for (chan = 0; chan < 4; chan++) {
427 rc_swizzle swz = get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
428 if (swz == RC_SWIZZLE_ONE || swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF) {
429 is_constant[i] = 1;
430 }
431 }
432 }
433 if (is_constant[0] && is_constant[1])
434 return 0;
435
436 for (i = 0; i < info->NumSrcRegs; i++) {
437 struct rc_src_register src = inst->U.I.SrcReg[i];
438 if (src_reads_dst_mask(src, inst->U.I.DstReg))
439 return 0;
440
441 src.File = RC_FILE_PRESUB;
442 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
443 return 0;
444 }
445 return 1;
446 }
447
448 static int
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)449 peephole_add_presub_add(struct radeon_compiler *c, struct rc_instruction *inst_add)
450 {
451 unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
452 unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
453 unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
454
455 if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
456 return 0;
457
458 /* src0 and src1 can't have absolute values */
459 if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
460 return 0;
461
462 /* if src0 is negative, at least all bits of dstmask have to be set */
463 if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
464 return 0;
465
466 /* if src1 is negative, at least all bits of dstmask have to be set */
467 if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
468 return 0;
469
470 if (!is_presub_candidate(c, inst_add))
471 return 0;
472
473 if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
474 rc_remove_instruction(inst_add);
475 return 1;
476 }
477 return 0;
478 }
479
480 static void
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)481 presub_replace_inv(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
482 unsigned int src_index)
483 {
484 /* We must be careful not to modify inst_add, since it
485 * is possible it will remain part of the program.*/
486 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
487 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
488 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
489 inst_reader->U.I.SrcReg[src_index] =
490 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
491
492 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
493 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
494 }
495
496 static void
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)497 presub_replace_bias(struct rc_instruction *inst_mad, struct rc_instruction *inst_reader,
498 unsigned int src_index)
499 {
500 /* We must be careful not to modify inst_mad, since it
501 * is possible it will remain part of the program.*/
502 inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
503 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
504 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
505 inst_reader->U.I.SrcReg[src_index] =
506 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
507
508 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
509 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
510 }
511
512 /**
513 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
514 * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
515 * of the add instruction must have the constant 1 swizzle. This function
516 * does not check const registers to see if their value is 1.0, so it should
517 * be called after the constant_folding optimization.
518 * @return
519 * 0 if the ADD instruction is still part of the program.
520 * 1 if the ADD instruction is no longer part of the program.
521 */
522 static int
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)523 peephole_add_presub_inv(struct radeon_compiler *c, struct rc_instruction *inst_add)
524 {
525 unsigned int i, swz;
526
527 if (!is_presub_candidate(c, inst_add))
528 return 0;
529
530 /* Check if src0 is 1. */
531 /* XXX It would be nice to use is_src_uniform_constant here, but that
532 * function only works if the register's file is RC_FILE_NONE */
533 for (i = 0; i < 4; i++) {
534 if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
535 continue;
536
537 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
538 if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
539 return 0;
540 }
541
542 /* Check src1. */
543 if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
544 inst_add->U.I.DstReg.WriteMask ||
545 inst_add->U.I.SrcReg[1].Abs || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
546
547 return 0;
548 }
549
550 if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
551 rc_remove_instruction(inst_add);
552 return 1;
553 }
554 return 0;
555 }
556
557 /**
558 * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
559 * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The first source
560 * of the add instruction must have the constant 1 swizzle. This function
561 * does not check const registers to see if their value is 1.0, so it should
562 * be called after the constant_folding optimization.
563 * @return
564 * 0 if the MAD instruction is still part of the program.
565 * 1 if the MAD instruction is no longer part of the program.
566 */
567 static int
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)568 peephole_mad_presub_bias(struct radeon_compiler *c, struct rc_instruction *inst_mad)
569 {
570 unsigned int i, swz;
571
572 if (!is_presub_candidate(c, inst_mad))
573 return 0;
574
575 /* Check if src2 is 1. */
576 for (i = 0; i < 4; i++) {
577 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
578 continue;
579
580 swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
581 if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
582 return 0;
583 }
584
585 /* Check if src1 is 2. */
586 struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
587 if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
588 return 0;
589 if (src1_reg.File == RC_FILE_INLINE) {
590 if (rc_inline_to_float(src1_reg.Index) != 2.0f)
591 return 0;
592 } else {
593 if (src1_reg.File != RC_FILE_CONSTANT)
594 return 0;
595
596 struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
597 if (constant->Type != RC_CONSTANT_IMMEDIATE)
598 return 0;
599 for (i = 0; i < 4; i++) {
600 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
601 continue;
602 swz = GET_SWZ(src1_reg.Swizzle, i);
603 if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
604 return 0;
605 }
606 }
607
608 /* Check src0. */
609 if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
610 inst_mad->U.I.DstReg.WriteMask ||
611 inst_mad->U.I.SrcReg[0].Abs || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
612
613 return 0;
614 }
615
616 if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
617 rc_remove_instruction(inst_mad);
618 return 1;
619 }
620 return 0;
621 }
622
/* State shared between peephole_mul_omod() and its omod_filter_*_cb
 * callbacks. */
struct peephole_mul_cb_data {
   struct rc_dst_register *Writer; /* destination register checked by the callbacks */
   unsigned int Clobbered;         /* set to 1 by a callback that finds a conflicting access */
};
627
628 static void
omod_filter_reader_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)629 omod_filter_reader_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
630 unsigned int index, unsigned int mask)
631 {
632 struct peephole_mul_cb_data *d = userdata;
633 if (rc_src_reads_dst_mask(file, mask, index, d->Writer->File, d->Writer->Index,
634 d->Writer->WriteMask)) {
635
636 d->Clobbered = 1;
637 }
638 }
639
640 static void
omod_filter_writer_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)641 omod_filter_writer_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
642 unsigned int index, unsigned int mask)
643 {
644 struct peephole_mul_cb_data *d = userdata;
645 if (file == d->Writer->File && index == d->Writer->Index && (mask & d->Writer->WriteMask)) {
646 d->Clobbered = 1;
647 }
648 }
649
/**
 * Try to fold a multiplication of a temporary by 2, 4, 8, 1/2, 1/4 or 1/8
 * into the output modifier (omod) of the instruction(s) that write that
 * temporary, redirecting those writers to the destination of the MUL.
 *
 * @param c         compiler context
 * @param inst_mul  the instruction to fold; only its first two sources are
 *                  examined
 * @param var_list  variable list used to look up the writers of the
 *                  temporary source
 * @return
 * 	0 if inst_mul is still part of the program.
 * 	1 if inst_mul is no longer part of the program.
 */
static int
peephole_mul_omod(struct radeon_compiler *c, struct rc_instruction *inst_mul,
                  struct rc_list *var_list)
{
   unsigned int chan = 0, swz, i;
   int const_index = -1;
   int temp_index = -1;
   float const_value;
   rc_omod_op omod_op = RC_OMOD_DISABLE;
   struct rc_list *writer_list;
   struct rc_variable *var;
   struct peephole_mul_cb_data cb_data;
   unsigned writemask_sum;

   /* Classify the two sources: exactly one has to be a temporary and the
    * other one a constant (constant file or constant swizzle). */
   for (i = 0; i < 2; i++) {
      unsigned int j;
      if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT &&
          inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY &&
          inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
         return 0;
      }

      /* The only relevant case with constant swizzles we should check for
       * is multiply by one half.
       */
      if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
         for (j = 0; j < 4; j++) {
            swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
            if (swz == RC_SWIZZLE_UNUSED) {
               continue;
            }
            if (swz != RC_SWIZZLE_HALF) {
               return 0;
            } else {
               omod_op = RC_OMOD_DIV_2;
            }
         }
      }

      if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
         if (temp_index != -1) {
            /* The instruction has two temp sources */
            return 0;
         } else {
            temp_index = i;
            continue;
         }
      }
      /* If we get this far Src[i] must be a constant src */
      if (inst_mul->U.I.SrcReg[i].Negate) {
         return 0;
      }
      /* The constant src needs to read from the same swizzle */
      swz = RC_SWIZZLE_UNUSED;
      chan = 0;
      for (j = 0; j < 4; j++) {
         unsigned int j_swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
         if (j_swz == RC_SWIZZLE_UNUSED) {
            continue;
         }
         if (swz == RC_SWIZZLE_UNUSED) {
            /* Remember the first used channel; it is the one passed to
             * rc_get_constant_value() below. */
            swz = j_swz;
            chan = j;
         } else if (j_swz != swz) {
            return 0;
         }
      }

      if (const_index != -1) {
         /* The instruction has two constant sources */
         return 0;
      } else {
         const_index = i;
      }
   }

   /* No one-half swizzle was found, so the factor has to come from an
    * immediate; map its value to the matching output modifier. */
   if (omod_op == RC_OMOD_DISABLE) {
      if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
                                   inst_mul->U.I.SrcReg[const_index].Index)) {
         return 0;
      }
      const_value = rc_get_constant_value(c, inst_mul->U.I.SrcReg[const_index].Index,
                                          inst_mul->U.I.SrcReg[const_index].Swizzle,
                                          inst_mul->U.I.SrcReg[const_index].Negate, chan);

      if (const_value == 2.0f) {
         omod_op = RC_OMOD_MUL_2;
      } else if (const_value == 4.0f) {
         omod_op = RC_OMOD_MUL_4;
      } else if (const_value == 8.0f) {
         omod_op = RC_OMOD_MUL_8;
      } else if (const_value == (1.0f / 2.0f)) {
         omod_op = RC_OMOD_DIV_2;
      } else if (const_value == (1.0f / 4.0f)) {
         omod_op = RC_OMOD_DIV_4;
      } else if (const_value == (1.0f / 8.0f)) {
         omod_op = RC_OMOD_DIV_8;
      } else {
         return 0;
      }
   }

   /* NOTE(review): judging by its name, this lookup only succeeds when the
    * temporary has this MUL as its single reader - confirm against
    * rc_variable_list_get_writers_one_reader(). */
   writer_list = rc_variable_list_get_writers_one_reader(var_list, RC_INSTRUCTION_NORMAL,
                                                         &inst_mul->U.I.SrcReg[temp_index]);

   if (!writer_list) {
      return 0;
   }

   /* Validate every writer before anything is modified. */
   cb_data.Clobbered = 0;
   cb_data.Writer = &inst_mul->U.I.DstReg;
   for (var = writer_list->Item; var; var = var->Friend) {
      struct rc_instruction *inst;
      const struct rc_opcode_info *info = rc_get_opcode_info(var->Inst->U.I.Opcode);
      if (info->HasTexture) {
         return 0;
      }
      if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
         return 0;
      }

      /* Empirical testing shows that DDX/DDY directly into output
       * with non-identity omod is problematic.
       */
      if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
          inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
         return 0;
      }

      /* Walk backwards from the MUL to this writer: no instruction in
       * between may read or write the MUL's destination, because the writer
       * is going to store into that register instead. */
      for (inst = inst_mul->Prev; inst != var->Inst; inst = inst->Prev) {
         rc_for_all_reads_mask(inst, omod_filter_reader_cb, &cb_data);
         rc_for_all_writes_mask(inst, omod_filter_writer_cb, &cb_data);
         if (cb_data.Clobbered) {
            break;
         }
      }
   }

   if (cb_data.Clobbered) {
      return 0;
   }

   writemask_sum = rc_variable_writemask_sum(writer_list->Item);

   /* rc_normal_rewrite_writemask can't expand a previous writemask to store
    * more channels replicated.
    */
   if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
      return 0;

   /* Rewrite the instructions */
   for (var = writer_list->Item; var; var = var->Friend) {
      struct rc_variable *writer = var;
      unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
      /* For each channel of the MUL that reads a real component of the
       * temporary, record the MUL channel in the conversion swizzle at the
       * position of that component. */
      for (chan = 0; chan < 4; chan++) {
         unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
         if (swz <= RC_SWIZZLE_W)
            SET_SWZ(conversion_swizzle, swz, chan);
      }
      /* The writer takes over the MUL's destination, output modifier and
       * saturate mode. */
      writer->Inst->U.I.Omod = omod_op;
      writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
      writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
      rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
      writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
   }

   rc_remove_instruction(inst_mul);

   return 1;
}
820
821 /**
822 * @return
823 * 0 if inst is still part of the program.
824 * 1 if inst is no longer part of the program.
825 */
826 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)827 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
828 {
829 switch (inst->U.I.Opcode) {
830 case RC_OPCODE_ADD: {
831 if (peephole_add_presub_inv(c, inst))
832 return 1;
833 if (peephole_add_presub_add(c, inst))
834 return 1;
835 break;
836 }
837 case RC_OPCODE_MAD: {
838 if (peephole_mad_presub_bias(c, inst))
839 return 1;
840 break;
841 }
842 default:
843 break;
844 }
845 return 0;
846 }
847
848 static unsigned int
merge_swizzles(unsigned int swz1,unsigned int swz2)849 merge_swizzles(unsigned int swz1, unsigned int swz2)
850 {
851 unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
852 for (unsigned int chan = 0; chan < 4; chan++) {
853 unsigned int swz = GET_SWZ(swz1, chan);
854 if (swz != RC_SWIZZLE_UNUSED) {
855 SET_SWZ(new_swz, chan, swz);
856 continue;
857 }
858 swz = GET_SWZ(swz2, chan);
859 SET_SWZ(new_swz, chan, swz);
860 }
861 return new_swz;
862 }
863
864 /* Sets negate to 0 for unused channels. */
865 static unsigned int
clean_negate(struct rc_src_register src)866 clean_negate(struct rc_src_register src)
867 {
868 unsigned int new_negate = 0;
869 for (unsigned int chan = 0; chan < 4; chan++) {
870 unsigned int swz = GET_SWZ(src.Swizzle, chan);
871 if (swz != RC_SWIZZLE_UNUSED)
872 new_negate |= src.Negate & (1 << chan);
873 }
874 return new_negate;
875 }
876
877 static unsigned int
merge_negates(struct rc_src_register src1,struct rc_src_register src2)878 merge_negates(struct rc_src_register src1, struct rc_src_register src2)
879 {
880 return clean_negate(src1) | clean_negate(src2);
881 }
882
883 static unsigned int
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)884 fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
885 {
886 for (unsigned int chan = 0; chan < 4; chan++) {
887 unsigned int swz = GET_SWZ(orig_swz, chan);
888 if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
889 SET_SWZ(orig_swz, chan, const_swz);
890 }
891 }
892 return orig_swz;
893 }
894
895 static int
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)896 have_shared_source(struct rc_instruction *inst1, struct rc_instruction *inst2)
897 {
898 int shared_src = -1;
899 const struct rc_opcode_info *opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
900 const struct rc_opcode_info *opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
901 for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
902 for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
903 if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
904 inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
905 inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
906 inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
907 shared_src = i;
908 }
909 }
910 return shared_src;
911 }
912
913 /**
914 * Merges two MOVs writing different channels of the same destination register
915 * with the use of the constant swizzles.
916 */
917 static bool
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)918 merge_movs(struct radeon_compiler *c, struct rc_instruction *inst, struct rc_instruction *cur)
919 {
920 /* We can merge two MOVs into MOV if one of them is from inline constant,
921 * i.e., constant swizzles and RC_FILE_NONE).
922 *
923 * For example
924 * MOV temp[0].x none.1___
925 * MOV temp[0].y input[0]._x__
926 *
927 * becomes
928 * MOV temp[0].xy input[0].1x__
929 */
930 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
931 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
932 struct rc_src_register src;
933 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
934 src = inst->U.I.SrcReg[0];
935 else
936 src = cur->U.I.SrcReg[0];
937 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
938 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
939 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
940 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
941 cur->U.I.SrcReg[0] = src;
942 rc_remove_instruction(inst);
943 return true;
944 }
945 }
946
947 /* Handle the trivial case where the MOVs share a source.
948 *
949 * For example
950 * MOV temp[0].x const[0].x
951 * MOV temp[0].y const[0].z
952 *
953 * becomes
954 * MOV temp[0].xy const[0].xz
955 */
956 if (have_shared_source(inst, cur) == 0) {
957 struct rc_src_register src = cur->U.I.SrcReg[0];
958 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
959 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
960
961 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
962 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
963 cur->U.I.SrcReg[0] = src;
964 rc_remove_instruction(inst);
965 return true;
966 }
967 }
968
969 /* Otherwise, we can convert the MOVs into ADD.
970 *
971 * For example
972 * MOV temp[0].x const[0].x
973 * MOV temp[0].y input[0].y
974 *
975 * becomes
976 * ADD temp[0].xy const[0].x0 input[0].0y
977 */
978 unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
979 struct rc_src_register src0 = inst->U.I.SrcReg[0];
980 struct rc_src_register src1 = cur->U.I.SrcReg[0];
981
982 src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ZERO);
983 src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, RC_SWIZZLE_ZERO);
984 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
985 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
986 return false;
987
988 cur->U.I.DstReg.WriteMask = wmask;
989 cur->U.I.Opcode = RC_OPCODE_ADD;
990 cur->U.I.SrcReg[0] = src0;
991 cur->U.I.SrcReg[1] = src1;
992
993 /* finally delete the original mov */
994 rc_remove_instruction(inst);
995 return true;
996 }
997
998 /**
999 * This function will try to merge MOV and ADD/MUL instructions with the same
1000 * destination, making use of the constant swizzles.
1001 *
1002 * For example:
1003 * MOV temp[0].x const[0].x
1004 * MUL temp[0].yz const[1].yz const[2].yz
1005 *
1006 * becomes
1007 * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1008 */
1009 static int
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1010 merge_mov_add_mul(struct radeon_compiler *c, struct rc_instruction *inst1,
1011 struct rc_instruction *inst2)
1012 {
1013 struct rc_instruction *inst, *mov;
1014 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1015 mov = inst1;
1016 inst = inst2;
1017 } else {
1018 mov = inst2;
1019 inst = inst1;
1020 }
1021
1022 const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1023 int shared_index = have_shared_source(inst, mov);
1024 unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1025
1026 /* If there is a shared source, just merge the swizzles and be done with it. */
1027 if (shared_index != -1) {
1028 struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1029 struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1030
1031 shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1032 shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, mov->U.I.SrcReg[0].Swizzle);
1033 other_src.Negate = clean_negate(other_src);
1034 unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1035 other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1036
1037 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1038 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1039 return 0;
1040
1041 inst2->U.I.Opcode = inst->U.I.Opcode;
1042 inst2->U.I.SrcReg[0] = shared_src;
1043 inst2->U.I.SrcReg[1] = other_src;
1044
1045 /* TODO: we can do a bit better in the special case when one of the sources is none.
1046 * Convert to MAD otherwise.
1047 */
1048 } else {
1049 struct rc_src_register src0, src1, src2;
1050 if (is_mul) {
1051 src2 = mov->U.I.SrcReg[0];
1052 src0 = inst->U.I.SrcReg[0];
1053 src1 = inst->U.I.SrcReg[1];
1054 } else {
1055 src0 = mov->U.I.SrcReg[0];
1056 src1 = inst->U.I.SrcReg[0];
1057 src2 = inst->U.I.SrcReg[1];
1058 }
1059 /* The following login expects that the unused channels have empty negate bits. */
1060 src0.Negate = clean_negate(src0);
1061 src1.Negate = clean_negate(src1);
1062 src2.Negate = clean_negate(src2);
1063
1064 src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ONE);
1065 src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1066 src2.Swizzle = fill_swizzle(src2.Swizzle, wmask, RC_SWIZZLE_ZERO);
1067 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1068 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1069 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1070 return 0;
1071
1072 inst2->U.I.Opcode = RC_OPCODE_MAD;
1073 inst2->U.I.SrcReg[0] = src0;
1074 inst2->U.I.SrcReg[1] = src1;
1075 inst2->U.I.SrcReg[2] = src2;
1076 }
1077 inst2->U.I.DstReg.WriteMask = wmask;
1078 /* finally delete the original instruction */
1079 rc_remove_instruction(inst1);
1080
1081 return 1;
1082 }
1083
1084 /**
1085 * This function will try to merge MOV and MAD instructions with the same
1086 * destination, making use of the constant swizzles. This only works
1087 * if there is a shared source or one of the sources is RC_FILE_NONE.
1088 *
1089 * For example:
1090 * MOV temp[0].x const[0].x
1091 * MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1092 *
1093 * becomes
1094 * MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1095 */
1096 static bool
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1097 merge_mov_mad(struct radeon_compiler *c, struct rc_instruction *inst1, struct rc_instruction *inst2)
1098 {
1099 struct rc_instruction *mov, *mad;
1100 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1101 mov = inst1;
1102 mad = inst2;
1103 } else {
1104 mov = inst2;
1105 mad = inst1;
1106 }
1107
1108 int shared_index = have_shared_source(mad, mov);
1109 unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1110 struct rc_src_register src[3];
1111 src[0] = mad->U.I.SrcReg[0];
1112 src[1] = mad->U.I.SrcReg[1];
1113 src[2] = mad->U.I.SrcReg[2];
1114
1115 /* Shared source is the one for multiplication. */
1116 if (shared_index == 0 || shared_index == 1) {
1117 src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1118 src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1119 src[shared_index].Swizzle =
1120 merge_swizzles(src[shared_index].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1121 src[1 - shared_index].Swizzle =
1122 fill_swizzle(src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1123 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1124
1125 /* Shared source is the one for used for addition, or it is none. Additionally,
1126 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1127 * because than we have the highest change the swizzles will be legal.
1128 */
1129 } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1130 src[2].File == RC_FILE_NONE) {
1131 src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1132 src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1133 src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1134 src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1135 if (src[2].File == RC_FILE_NONE) {
1136 src[2].File = mov->U.I.SrcReg[0].File;
1137 src[2].Index = mov->U.I.SrcReg[0].Index;
1138 src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1139 src[2].Abs = mov->U.I.SrcReg[0].Abs;
1140 }
1141
1142 /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1143 * fill the other one with ones and the reg for addition with zeros.
1144 */
1145 } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1146 unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1147 src[none_src] = mov->U.I.SrcReg[0];
1148 src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1149 src[none_src].Swizzle =
1150 merge_swizzles(src[none_src].Swizzle, mad->U.I.SrcReg[none_src].Swizzle);
1151 src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1152 src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle, wmask, RC_SWIZZLE_ONE);
1153 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1154 } else {
1155 return false;
1156 }
1157
1158 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1159 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1160 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1161 return false;
1162
1163 inst2->U.I.Opcode = RC_OPCODE_MAD;
1164 inst2->U.I.SrcReg[0] = src[0];
1165 inst2->U.I.SrcReg[1] = src[1];
1166 inst2->U.I.SrcReg[2] = src[2];
1167 inst2->U.I.DstReg.WriteMask = wmask;
1168 rc_remove_instruction(inst1);
1169 return true;
1170 }
1171
1172 static bool
inst_combination(struct rc_instruction * inst1,struct rc_instruction * inst2,rc_opcode opcode1,rc_opcode opcode2)1173 inst_combination(struct rc_instruction *inst1, struct rc_instruction *inst2, rc_opcode opcode1,
1174 rc_opcode opcode2)
1175 {
1176 return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) ||
1177 (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2));
1178 }
1179
1180 /**
1181 * Searches for instructions writing different channels of the same register that could
1182 * be merged together with the use of constant swizzles.
1183 *
1184 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1185 */
1186 static void
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1187 merge_channels(struct radeon_compiler *c, struct rc_instruction *inst)
1188 {
1189 unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1190 unsigned int orig_dst_file = inst->U.I.DstReg.File;
1191 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1192 const struct rc_opcode_info *orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1193
1194 struct rc_instruction *cur = inst;
1195 while (cur != &c->Program.Instructions) {
1196 cur = cur->Next;
1197 const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1198
1199 /* Keep it simple for now and stop when encountering any
1200 * control flow.
1201 */
1202 if (opcode->IsFlowControl)
1203 return;
1204
1205 /* Stop when the original destination is overwritten */
1206 if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File &&
1207 (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1208 return;
1209
1210 /* Stop the search when the original instruction destination
1211 * is used as a source for anything.
1212 */
1213 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1214 if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg)
1215 return;
1216 }
1217
1218 /* Stop the search when some of the original sources are touched. */
1219 for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1220 if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1221 inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1222 return;
1223 }
1224
1225 if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg &&
1226 cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1227 (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1228
1229 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1230 if (merge_movs(c, inst, cur))
1231 return;
1232 }
1233
1234 /* Skip the merge if one of the instructions writes just w channel
1235 * and we are compiling a fragment shader. We can pair-schedule it together
1236 * later anyway and it will also give the scheduler a bit more flexibility.
1237 * Only check this after merging MOVs as when we manage to merge two MOVs
1238 * into another MOV we can still copy propagate it away. So it is a win in
1239 * that case.
1240 */
1241 if (c->has_omod &&
1242 (cur->U.I.DstReg.WriteMask == RC_MASK_W || inst->U.I.DstReg.WriteMask == RC_MASK_W))
1243 continue;
1244
1245 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1246 inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1247 if (merge_mov_add_mul(c, inst, cur))
1248 return;
1249 }
1250
1251 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1252 if (merge_mov_mad(c, inst, cur))
1253 return;
1254 }
1255 }
1256 }
1257 }
1258
1259 /**
1260 * Searches for duplicate ARLs/ARRs
1261 *
1262 * Only a very trivial case is now optimized where if a second one is detected which reads from
1263 * the same register as the first one and source is the same, just remove the second one.
1264 */
1265 static void
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1266 merge_A0_loads(struct radeon_compiler *c, struct rc_instruction *inst, bool is_ARL)
1267 {
1268 unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1269 unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1270 unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1271 int cf_depth = 0;
1272
1273 struct rc_instruction *cur = inst;
1274 while (cur != &c->Program.Instructions) {
1275 cur = cur->Next;
1276 const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1277
1278 /* Keep it simple for now and stop when encountering any
1279 * control flow besides simple ifs.
1280 */
1281 if (opcode->IsFlowControl) {
1282 switch (cur->U.I.Opcode) {
1283 case RC_OPCODE_IF: {
1284 cf_depth++;
1285 break;
1286 }
1287 case RC_OPCODE_ELSE: {
1288 if (cf_depth < 1)
1289 return;
1290 break;
1291 }
1292 case RC_OPCODE_ENDIF: {
1293 cf_depth--;
1294 break;
1295 }
1296 default:
1297 return;
1298 }
1299 }
1300
1301 /* Stop when the original source is overwritten */
1302 if (A0_src_reg == cur->U.I.DstReg.Index && A0_src_file == cur->U.I.DstReg.File &&
1303 cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1304 return;
1305
1306 /* Wrong A0 load type. */
1307 if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1308 (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1309 return;
1310
1311 if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1312 if (A0_src_reg == cur->U.I.SrcReg[0].Index && A0_src_file == cur->U.I.SrcReg[0].File &&
1313 A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1314 struct rc_instruction *next = cur->Next;
1315 rc_remove_instruction(cur);
1316 cur = next;
1317 } else {
1318 return;
1319 }
1320 }
1321 }
1322 }
1323
1324 /**
1325 * According to the GLSL spec, round is only 1.30 and up
1326 * so the only reason why we should ever see round is if it actually
1327 * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1328 * the ARR instead of lowering the round.
1329 */
1330 static void
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1331 transform_vertex_ROUND(struct radeon_compiler *c, struct rc_instruction *inst)
1332 {
1333 struct rc_reader_data readers;
1334 readers.ExitOnAbort = 0;
1335 rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1336
1337 assert(readers.ReaderCount > 0);
1338 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1339 struct rc_instruction *reader = readers.Readers[i].Inst;
1340 if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1341 assert(!"Unable to convert ROUND+ARL to ARR\n");
1342 return;
1343 }
1344 }
1345
1346 /* Only ARL readers, convert all to ARR */
1347 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1348 readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1349 }
1350 /* Switch ROUND to MOV and let copy propagate sort it out later. */
1351 inst->U.I.Opcode = RC_OPCODE_MOV;
1352 }
1353
1354 /**
1355 * Apply various optimizations specific to the A0 address register loads.
1356 */
1357 static void
optimize_A0_loads(struct radeon_compiler * c)1358 optimize_A0_loads(struct radeon_compiler *c)
1359 {
1360 struct rc_instruction *inst = c->Program.Instructions.Next;
1361
1362 while (inst != &c->Program.Instructions) {
1363 struct rc_instruction *cur = inst;
1364 inst = inst->Next;
1365 if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1366 merge_A0_loads(c, cur, true);
1367 } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1368 merge_A0_loads(c, cur, false);
1369 } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1370 transform_vertex_ROUND(c, cur);
1371 }
1372 }
1373 }
1374
1375 void
rc_optimize(struct radeon_compiler * c,void * user)1376 rc_optimize(struct radeon_compiler *c, void *user)
1377 {
1378 struct rc_instruction *inst = c->Program.Instructions.Next;
1379 while (inst != &c->Program.Instructions) {
1380 struct rc_instruction *cur = inst;
1381 inst = inst->Next;
1382 constant_folding(c, cur);
1383 }
1384
1385 /* Copy propagate simple movs away. */
1386 inst = c->Program.Instructions.Next;
1387 while (inst != &c->Program.Instructions) {
1388 struct rc_instruction *cur = inst;
1389 inst = inst->Next;
1390 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1391 copy_propagate(c, cur);
1392 }
1393 }
1394
1395 if (c->type == RC_VERTEX_PROGRAM) {
1396 optimize_A0_loads(c);
1397 }
1398
1399 /* Merge MOVs to same source in different channels using the constant
1400 * swizzle.
1401 */
1402 if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1403 inst = c->Program.Instructions.Next;
1404 while (inst != &c->Program.Instructions) {
1405 struct rc_instruction *cur = inst;
1406 inst = inst->Next;
1407 if (cur->U.I.Opcode == RC_OPCODE_MOV || cur->U.I.Opcode == RC_OPCODE_ADD ||
1408 cur->U.I.Opcode == RC_OPCODE_MAD || cur->U.I.Opcode == RC_OPCODE_MUL)
1409 merge_channels(c, cur);
1410 }
1411 }
1412
1413 /* Copy propagate few extra movs from the merge_channels pass. */
1414 inst = c->Program.Instructions.Next;
1415 while (inst != &c->Program.Instructions) {
1416 struct rc_instruction *cur = inst;
1417 inst = inst->Next;
1418 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1419 copy_propagate(c, cur);
1420 }
1421 }
1422
1423 if (c->type != RC_FRAGMENT_PROGRAM) {
1424 return;
1425 }
1426
1427 /* Output modifiers. */
1428 inst = c->Program.Instructions.Next;
1429 struct rc_list *var_list = NULL;
1430 while (inst != &c->Program.Instructions) {
1431 struct rc_instruction *cur = inst;
1432 inst = inst->Next;
1433 if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1434 if (!var_list)
1435 var_list = rc_get_variables(c);
1436 if (peephole_mul_omod(c, cur, var_list))
1437 var_list = NULL;
1438 }
1439 }
1440 }
1441