• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2009 Nicolai Haehnle.
3  * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "util/u_math.h"
8 
9 #include "radeon_dataflow.h"
10 
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16 
17 struct src_clobbered_reads_cb_data {
18    rc_register_file File;
19    unsigned int Index;
20    unsigned int Mask;
21    struct rc_reader_data *ReaderData;
22 };
23 
/**
 * Callback invoked by presub_helper() for every reader operand that has to be
 * rewritten: receives the instruction being folded, the reader instruction
 * and the index of the reader's source register to replace.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_writer,
                                     struct rc_instruction *inst_reader,
                                     unsigned int src_index);
26 
27 static struct rc_src_register
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30    struct rc_src_register combine;
31    combine.File = inner.File;
32    combine.Index = inner.Index;
33    combine.RelAddr = inner.RelAddr;
34    if (outer.Abs) {
35       combine.Abs = 1;
36       combine.Negate = outer.Negate;
37    } else {
38       combine.Abs = inner.Abs;
39       combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40       combine.Negate ^= outer.Negate;
41    }
42    combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43    return combine;
44 }
45 
46 static void
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)47 copy_propagate_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
48 {
49    rc_register_file file = src->File;
50    struct rc_reader_data *reader_data = data;
51 
52    if (!rc_inst_can_use_presub(reader_data->C, inst, reader_data->Writer->U.I.PreSub.Opcode,
53                                rc_swizzle_to_writemask(src->Swizzle), src,
54                                &reader_data->Writer->U.I.PreSub.SrcReg[0],
55                                &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
56       reader_data->Abort = 1;
57       return;
58    }
59 
60    /* XXX This could probably be handled better. */
61    if (file == RC_FILE_ADDRESS) {
62       reader_data->Abort = 1;
63       return;
64    }
65 
66    /* R300/R400 is unhappy about propagating
67     *  0: MOV temp[1], -none.1111;
68     *  1: KIL temp[1];
69     * to
70     *  0: KIL -none.1111;
71     *
72     * R500 is fine with it.
73     */
74    if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
75        reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
76       reader_data->Abort = 1;
77       return;
78    }
79 
80    /* These instructions cannot read from the constants file.
81     * see radeonTransformTEX()
82     */
83    if (reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
84        reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
85        reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
86        (inst->U.I.Opcode == RC_OPCODE_TEX || inst->U.I.Opcode == RC_OPCODE_TXB ||
87         inst->U.I.Opcode == RC_OPCODE_TXP || inst->U.I.Opcode == RC_OPCODE_TXD ||
88         inst->U.I.Opcode == RC_OPCODE_TXL || inst->U.I.Opcode == RC_OPCODE_KIL)) {
89       reader_data->Abort = 1;
90       return;
91    }
92 }
93 
94 static void
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)95 src_clobbered_reads_cb(void *data, struct rc_instruction *inst, struct rc_src_register *src)
96 {
97    struct src_clobbered_reads_cb_data *sc_data = data;
98 
99    if (src->File == sc_data->File && src->Index == sc_data->Index &&
100        (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
101 
102       sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
103    }
104 
105    if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
106       sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
107    }
108 }
109 
110 static void
is_src_clobbered_scan_write(void * data,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)111 is_src_clobbered_scan_write(void *data, struct rc_instruction *inst, rc_register_file file,
112                             unsigned int index, unsigned int mask)
113 {
114    struct src_clobbered_reads_cb_data sc_data;
115    struct rc_reader_data *reader_data = data;
116    sc_data.File = file;
117    sc_data.Index = index;
118    sc_data.Mask = mask;
119    sc_data.ReaderData = reader_data;
120    rc_for_all_reads_src(reader_data->Writer, src_clobbered_reads_cb, &sc_data);
121 }
122 
123 static void
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)124 copy_propagate(struct radeon_compiler *c, struct rc_instruction *inst_mov)
125 {
126    struct rc_reader_data reader_data;
127    unsigned int i;
128 
129    if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
130       return;
131 
132    /* Get a list of all the readers of this MOV instruction. */
133    reader_data.ExitOnAbort = 1;
134    rc_get_readers(c, inst_mov, &reader_data, copy_propagate_scan_read, NULL,
135                   is_src_clobbered_scan_write);
136 
137    if (reader_data.Abort || reader_data.ReaderCount == 0)
138       return;
139 
140    /* We can propagate SaturateMode if all the readers are MOV instructions
141     * without a presubtract operation, source negation and absolute.
142     * In that case, we just move SaturateMode to all readers. */
143    if (inst_mov->U.I.SaturateMode) {
144       for (i = 0; i < reader_data.ReaderCount; i++) {
145          struct rc_instruction *inst = reader_data.Readers[i].Inst;
146 
147          if (inst->U.I.Opcode != RC_OPCODE_MOV || inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
148              inst->U.I.SrcReg[0].Abs || inst->U.I.SrcReg[0].Negate) {
149             return;
150          }
151       }
152    }
153 
154    /* Propagate the MOV instruction. */
155    for (i = 0; i < reader_data.ReaderCount; i++) {
156       struct rc_instruction *inst = reader_data.Readers[i].Inst;
157       *reader_data.Readers[i].U.I.Src =
158          chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
159 
160       if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
161          inst->U.I.PreSub = inst_mov->U.I.PreSub;
162       if (!inst->U.I.SaturateMode)
163          inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
164    }
165 
166    /* Finally, remove the original MOV instruction */
167    rc_remove_instruction(inst_mov);
168 }
169 
170 /**
171  * Check if a source register is actually always the same
172  * swizzle constant.
173  */
174 static int
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)175 is_src_uniform_constant(struct rc_src_register src, rc_swizzle *pswz, unsigned int *pnegate)
176 {
177    int have_used = 0;
178 
179    if (src.File != RC_FILE_NONE) {
180       *pswz = 0;
181       return 0;
182    }
183 
184    for (unsigned int chan = 0; chan < 4; ++chan) {
185       unsigned int swz = GET_SWZ(src.Swizzle, chan);
186       if (swz < 4) {
187          *pswz = 0;
188          return 0;
189       }
190       if (swz == RC_SWIZZLE_UNUSED)
191          continue;
192 
193       if (!have_used) {
194          *pswz = swz;
195          *pnegate = GET_BIT(src.Negate, chan);
196          have_used = 1;
197       } else {
198          if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
199             *pswz = 0;
200             return 0;
201          }
202       }
203    }
204 
205    return 1;
206 }
207 
208 /**
209  * Replace 0.0, 1.0 and 0.5 immediate constants by their
210  * respective swizzles. Simplify instructions like ADD dst, src, 0;
211  */
212 static void
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)213 constant_folding(struct radeon_compiler *c, struct rc_instruction *inst)
214 {
215    const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
216    unsigned int i;
217 
218    /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
219    for (unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
220       struct rc_constant *constant;
221       struct rc_src_register newsrc;
222       int have_real_reference;
223       unsigned int chan;
224 
225       /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
226       for (chan = 0; chan < 4; ++chan)
227          if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
228             break;
229       if (chan == 4) {
230          inst->U.I.SrcReg[src].File = RC_FILE_NONE;
231          continue;
232       }
233 
234       /* Convert immediates to swizzles. */
235       if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT || inst->U.I.SrcReg[src].RelAddr ||
236           inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
237          continue;
238 
239       constant = &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
240 
241       if (constant->Type != RC_CONSTANT_IMMEDIATE)
242          continue;
243 
244       newsrc = inst->U.I.SrcReg[src];
245       have_real_reference = 0;
246       for (chan = 0; chan < 4; ++chan) {
247          unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
248          unsigned int newswz;
249          float imm;
250          float baseimm;
251 
252          if (swz >= 4)
253             continue;
254 
255          imm = constant->u.Immediate[swz];
256          baseimm = imm;
257          if (imm < 0.0)
258             baseimm = -baseimm;
259 
260          if (baseimm == 0.0) {
261             newswz = RC_SWIZZLE_ZERO;
262          } else if (baseimm == 1.0) {
263             newswz = RC_SWIZZLE_ONE;
264          } else if (baseimm == 0.5 && c->has_half_swizzles) {
265             newswz = RC_SWIZZLE_HALF;
266          } else {
267             have_real_reference = 1;
268             continue;
269          }
270 
271          SET_SWZ(newsrc.Swizzle, chan, newswz);
272          if (imm < 0.0 && !newsrc.Abs)
273             newsrc.Negate ^= 1 << chan;
274       }
275 
276       if (!have_real_reference) {
277          newsrc.File = RC_FILE_NONE;
278          newsrc.Index = 0;
279       }
280 
281       /* don't make the swizzle worse */
282       if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
283          continue;
284 
285       inst->U.I.SrcReg[src] = newsrc;
286    }
287 
288    /* In case this instruction has been converted, make sure all of the
289     * registers that are no longer used are empty. */
290    opcode = rc_get_opcode_info(inst->U.I.Opcode);
291    for (i = opcode->NumSrcRegs; i < 3; i++) {
292       memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
293    }
294 }
295 
296 /**
297  * If src and dst use the same register, this function returns a writemask that
298  * indicates which components are read by src.  Otherwise zero is returned.
299  */
300 static unsigned int
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)301 src_reads_dst_mask(struct rc_src_register src, struct rc_dst_register dst)
302 {
303    if (dst.File != src.File || dst.Index != src.Index) {
304       return 0;
305    }
306    return rc_swizzle_to_writemask(src.Swizzle);
307 }
308 
309 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
310  * in any of its channels.  Return 0 otherwise. */
311 static int
src_has_const_swz(struct rc_src_register src)312 src_has_const_swz(struct rc_src_register src)
313 {
314    int chan;
315    for (chan = 0; chan < 4; chan++) {
316       unsigned int swz = GET_SWZ(src.Swizzle, chan);
317       if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF || swz == RC_SWIZZLE_ONE) {
318          return 1;
319       }
320    }
321    return 0;
322 }
323 
324 static void
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)325 presub_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
326 {
327    struct rc_reader_data *reader_data = data;
328    rc_presubtract_op *presub_opcode = reader_data->CbData;
329 
330    if (!rc_inst_can_use_presub(
331           reader_data->C, inst, *presub_opcode, reader_data->Writer->U.I.DstReg.WriteMask, src,
332           &reader_data->Writer->U.I.SrcReg[0], &reader_data->Writer->U.I.SrcReg[1])) {
333       reader_data->Abort = 1;
334       return;
335    }
336 }
337 
338 static int
presub_helper(struct radeon_compiler * c,struct rc_instruction * inst_add,rc_presubtract_op presub_opcode,rc_presub_replace_fn presub_replace)339 presub_helper(struct radeon_compiler *c, struct rc_instruction *inst_add,
340               rc_presubtract_op presub_opcode, rc_presub_replace_fn presub_replace)
341 {
342    struct rc_reader_data reader_data;
343    unsigned int i;
344    rc_presubtract_op cb_op = presub_opcode;
345 
346    reader_data.CbData = &cb_op;
347    reader_data.ExitOnAbort = 1;
348    rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL, is_src_clobbered_scan_write);
349 
350    if (reader_data.Abort || reader_data.ReaderCount == 0)
351       return 0;
352 
353    for (i = 0; i < reader_data.ReaderCount; i++) {
354       unsigned int src_index;
355       struct rc_reader reader = reader_data.Readers[i];
356       const struct rc_opcode_info *info = rc_get_opcode_info(reader.Inst->U.I.Opcode);
357 
358       for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
359          if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
360             presub_replace(inst_add, reader.Inst, src_index);
361       }
362    }
363    return 1;
364 }
365 
366 static void
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)367 presub_replace_add(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
368                    unsigned int src_index)
369 {
370    rc_presubtract_op presub_opcode;
371 
372    unsigned int negates = 0;
373    if (inst_add->U.I.SrcReg[0].Negate)
374       negates++;
375    if (inst_add->U.I.SrcReg[1].Negate)
376       negates++;
377    assert(negates != 2 ||
378           ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) ==
379            (inst_add->U.I.SrcReg[0].Negate & inst_add->U.I.DstReg.WriteMask)));
380 
381    if (negates == 1)
382       presub_opcode = RC_PRESUB_SUB;
383    else
384       presub_opcode = RC_PRESUB_ADD;
385 
386    if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
387       inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
388       inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
389    } else {
390       inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
391       inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
392    }
393    /* If both sources are negative we can move the negate to the presub. */
394    unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
395    inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
396    inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
397    inst_reader->U.I.PreSub.Opcode = presub_opcode;
398    inst_reader->U.I.SrcReg[src_index] =
399       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
400    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
401    inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
402 }
403 
404 static int
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)405 is_presub_candidate(struct radeon_compiler *c, struct rc_instruction *inst)
406 {
407    const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
408    unsigned int i;
409    unsigned int is_constant[2] = {0, 0};
410 
411    assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
412 
413    if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode ||
414        inst->U.I.WriteALUResult || inst->U.I.Omod) {
415       return 0;
416    }
417 
418    /* If first two sources use a constant swizzle, then we can't convert it to
419     * a presubtract operation.  In fact for the ADD and SUB presubtract
420     * operations neither source can contain a constant swizzle.  This
421     * specific case is checked in peephole_add_presub_add() when
422     * we make sure the swizzles for both sources are equal, so we
423     * don't need to worry about it here. */
424    for (i = 0; i < 2; i++) {
425       int chan;
426       for (chan = 0; chan < 4; chan++) {
427          rc_swizzle swz = get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
428          if (swz == RC_SWIZZLE_ONE || swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF) {
429             is_constant[i] = 1;
430          }
431       }
432    }
433    if (is_constant[0] && is_constant[1])
434       return 0;
435 
436    for (i = 0; i < info->NumSrcRegs; i++) {
437       struct rc_src_register src = inst->U.I.SrcReg[i];
438       if (src_reads_dst_mask(src, inst->U.I.DstReg))
439          return 0;
440 
441       src.File = RC_FILE_PRESUB;
442       if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
443          return 0;
444    }
445    return 1;
446 }
447 
448 static int
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)449 peephole_add_presub_add(struct radeon_compiler *c, struct rc_instruction *inst_add)
450 {
451    unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
452    unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
453    unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
454 
455    if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
456       return 0;
457 
458    /* src0 and src1 can't have absolute values */
459    if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
460       return 0;
461 
462    /* if src0 is negative, at least all bits of dstmask have to be set */
463    if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
464       return 0;
465 
466    /* if src1 is negative, at least all bits of dstmask have to be set */
467    if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
468       return 0;
469 
470    if (!is_presub_candidate(c, inst_add))
471       return 0;
472 
473    if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
474       rc_remove_instruction(inst_add);
475       return 1;
476    }
477    return 0;
478 }
479 
480 static void
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)481 presub_replace_inv(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
482                    unsigned int src_index)
483 {
484    /* We must be careful not to modify inst_add, since it
485     * is possible it will remain part of the program.*/
486    inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
487    inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
488    inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
489    inst_reader->U.I.SrcReg[src_index] =
490       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
491 
492    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
493    inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
494 }
495 
496 static void
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)497 presub_replace_bias(struct rc_instruction *inst_mad, struct rc_instruction *inst_reader,
498                     unsigned int src_index)
499 {
500    /* We must be careful not to modify inst_mad, since it
501     * is possible it will remain part of the program.*/
502    inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
503    inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
504    inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
505    inst_reader->U.I.SrcReg[src_index] =
506       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
507 
508    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
509    inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
510 }
511 
512 /**
513  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
514  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
515  * of the add instruction must have the constant 1 swizzle.  This function
516  * does not check const registers to see if their value is 1.0, so it should
517  * be called after the constant_folding optimization.
518  * @return
519  * 	0 if the ADD instruction is still part of the program.
520  * 	1 if the ADD instruction is no longer part of the program.
521  */
522 static int
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)523 peephole_add_presub_inv(struct radeon_compiler *c, struct rc_instruction *inst_add)
524 {
525    unsigned int i, swz;
526 
527    if (!is_presub_candidate(c, inst_add))
528       return 0;
529 
530    /* Check if src0 is 1. */
531    /* XXX It would be nice to use is_src_uniform_constant here, but that
532     * function only works if the register's file is RC_FILE_NONE */
533    for (i = 0; i < 4; i++) {
534       if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
535          continue;
536 
537       swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
538       if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
539          return 0;
540    }
541 
542    /* Check src1. */
543    if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
544           inst_add->U.I.DstReg.WriteMask ||
545        inst_add->U.I.SrcReg[1].Abs || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
546 
547       return 0;
548    }
549 
550    if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
551       rc_remove_instruction(inst_add);
552       return 1;
553    }
554    return 0;
555 }
556 
557 /**
558  * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
559  * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The first source
560  * of the add instruction must have the constant 1 swizzle.  This function
561  * does not check const registers to see if their value is 1.0, so it should
562  * be called after the constant_folding optimization.
563  * @return
564  * 	0 if the MAD instruction is still part of the program.
565  * 	1 if the MAD instruction is no longer part of the program.
566  */
567 static int
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)568 peephole_mad_presub_bias(struct radeon_compiler *c, struct rc_instruction *inst_mad)
569 {
570    unsigned int i, swz;
571 
572    if (!is_presub_candidate(c, inst_mad))
573       return 0;
574 
575    /* Check if src2 is 1. */
576    for (i = 0; i < 4; i++) {
577       if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
578          continue;
579 
580       swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
581       if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
582          return 0;
583    }
584 
585    /* Check if src1 is 2. */
586    struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
587    if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
588       return 0;
589    if (src1_reg.File == RC_FILE_INLINE) {
590       if (rc_inline_to_float(src1_reg.Index) != 2.0f)
591          return 0;
592    } else {
593       if (src1_reg.File != RC_FILE_CONSTANT)
594          return 0;
595 
596       struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
597       if (constant->Type != RC_CONSTANT_IMMEDIATE)
598          return 0;
599       for (i = 0; i < 4; i++) {
600          if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
601             continue;
602          swz = GET_SWZ(src1_reg.Swizzle, i);
603          if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
604             return 0;
605       }
606    }
607 
608    /* Check src0. */
609    if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
610           inst_mad->U.I.DstReg.WriteMask ||
611        inst_mad->U.I.SrcReg[0].Abs || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
612 
613       return 0;
614    }
615 
616    if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
617       rc_remove_instruction(inst_mad);
618       return 1;
619    }
620    return 0;
621 }
622 
/**
 * State shared with the omod_filter_*_cb() callbacks while peephole_mul_omod()
 * scans the instructions between a writer and the MUL.
 */
struct peephole_mul_cb_data {
   /* Destination register of the MUL that is being folded away. */
   struct rc_dst_register *Writer;
   /* Set to 1 once a scanned instruction reads or writes that register. */
   unsigned int Clobbered;
};
627 
628 static void
omod_filter_reader_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)629 omod_filter_reader_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
630                       unsigned int index, unsigned int mask)
631 {
632    struct peephole_mul_cb_data *d = userdata;
633    if (rc_src_reads_dst_mask(file, mask, index, d->Writer->File, d->Writer->Index,
634                              d->Writer->WriteMask)) {
635 
636       d->Clobbered = 1;
637    }
638 }
639 
640 static void
omod_filter_writer_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)641 omod_filter_writer_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
642                       unsigned int index, unsigned int mask)
643 {
644    struct peephole_mul_cb_data *d = userdata;
645    if (file == d->Writer->File && index == d->Writer->Index && (mask & d->Writer->WriteMask)) {
646       d->Clobbered = 1;
647    }
648 }
649 
650 static int
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)651 peephole_mul_omod(struct radeon_compiler *c, struct rc_instruction *inst_mul,
652                   struct rc_list *var_list)
653 {
654    unsigned int chan = 0, swz, i;
655    int const_index = -1;
656    int temp_index = -1;
657    float const_value;
658    rc_omod_op omod_op = RC_OMOD_DISABLE;
659    struct rc_list *writer_list;
660    struct rc_variable *var;
661    struct peephole_mul_cb_data cb_data;
662    unsigned writemask_sum;
663 
664    for (i = 0; i < 2; i++) {
665       unsigned int j;
666       if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT &&
667           inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY &&
668           inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
669          return 0;
670       }
671 
672       /* The only relevant case with constant swizzles we should check for
673        * is multiply by one half.
674        */
675       if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
676          for (j = 0; j < 4; j++) {
677             swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
678             if (swz == RC_SWIZZLE_UNUSED) {
679                continue;
680             }
681             if (swz != RC_SWIZZLE_HALF) {
682                return 0;
683             } else {
684                omod_op = RC_OMOD_DIV_2;
685             }
686          }
687       }
688 
689       if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
690          if (temp_index != -1) {
691             /* The instruction has two temp sources */
692             return 0;
693          } else {
694             temp_index = i;
695             continue;
696          }
697       }
698       /* If we get this far Src[i] must be a constant src */
699       if (inst_mul->U.I.SrcReg[i].Negate) {
700          return 0;
701       }
702       /* The constant src needs to read from the same swizzle */
703       swz = RC_SWIZZLE_UNUSED;
704       chan = 0;
705       for (j = 0; j < 4; j++) {
706          unsigned int j_swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
707          if (j_swz == RC_SWIZZLE_UNUSED) {
708             continue;
709          }
710          if (swz == RC_SWIZZLE_UNUSED) {
711             swz = j_swz;
712             chan = j;
713          } else if (j_swz != swz) {
714             return 0;
715          }
716       }
717 
718       if (const_index != -1) {
719          /* The instruction has two constant sources */
720          return 0;
721       } else {
722          const_index = i;
723       }
724    }
725 
726    if (omod_op == RC_OMOD_DISABLE) {
727       if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
728                                    inst_mul->U.I.SrcReg[const_index].Index)) {
729          return 0;
730       }
731       const_value = rc_get_constant_value(c, inst_mul->U.I.SrcReg[const_index].Index,
732                                           inst_mul->U.I.SrcReg[const_index].Swizzle,
733                                           inst_mul->U.I.SrcReg[const_index].Negate, chan);
734 
735       if (const_value == 2.0f) {
736          omod_op = RC_OMOD_MUL_2;
737       } else if (const_value == 4.0f) {
738          omod_op = RC_OMOD_MUL_4;
739       } else if (const_value == 8.0f) {
740          omod_op = RC_OMOD_MUL_8;
741       } else if (const_value == (1.0f / 2.0f)) {
742          omod_op = RC_OMOD_DIV_2;
743       } else if (const_value == (1.0f / 4.0f)) {
744          omod_op = RC_OMOD_DIV_4;
745       } else if (const_value == (1.0f / 8.0f)) {
746          omod_op = RC_OMOD_DIV_8;
747       } else {
748          return 0;
749       }
750    }
751 
752    writer_list = rc_variable_list_get_writers_one_reader(var_list, RC_INSTRUCTION_NORMAL,
753                                                          &inst_mul->U.I.SrcReg[temp_index]);
754 
755    if (!writer_list) {
756       return 0;
757    }
758 
759    cb_data.Clobbered = 0;
760    cb_data.Writer = &inst_mul->U.I.DstReg;
761    for (var = writer_list->Item; var; var = var->Friend) {
762       struct rc_instruction *inst;
763       const struct rc_opcode_info *info = rc_get_opcode_info(var->Inst->U.I.Opcode);
764       if (info->HasTexture) {
765          return 0;
766       }
767       if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
768          return 0;
769       }
770 
771       /* Empirical testing shows that DDX/DDY directly into output
772        * with non-identity omod is problematic.
773        */
774       if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
775           inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
776          return 0;
777       }
778 
779       for (inst = inst_mul->Prev; inst != var->Inst; inst = inst->Prev) {
780          rc_for_all_reads_mask(inst, omod_filter_reader_cb, &cb_data);
781          rc_for_all_writes_mask(inst, omod_filter_writer_cb, &cb_data);
782          if (cb_data.Clobbered) {
783             break;
784          }
785       }
786    }
787 
788    if (cb_data.Clobbered) {
789       return 0;
790    }
791 
792    writemask_sum = rc_variable_writemask_sum(writer_list->Item);
793 
794    /* rc_normal_rewrite_writemask can't expand a previous writemask to store
795     * more channels replicated.
796     */
797    if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
798       return 0;
799 
800    /* Rewrite the instructions */
801    for (var = writer_list->Item; var; var = var->Friend) {
802       struct rc_variable *writer = var;
803       unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
804       for (chan = 0; chan < 4; chan++) {
805          unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
806          if (swz <= RC_SWIZZLE_W)
807             SET_SWZ(conversion_swizzle, swz, chan);
808       }
809       writer->Inst->U.I.Omod = omod_op;
810       writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
811       writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
812       rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
813       writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
814    }
815 
816    rc_remove_instruction(inst_mul);
817 
818    return 1;
819 }
820 
821 /**
822  * @return
823  * 	0 if inst is still part of the program.
824  * 	1 if inst is no longer part of the program.
825  */
826 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)827 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
828 {
829    switch (inst->U.I.Opcode) {
830    case RC_OPCODE_ADD: {
831       if (peephole_add_presub_inv(c, inst))
832          return 1;
833       if (peephole_add_presub_add(c, inst))
834          return 1;
835       break;
836    }
837    case RC_OPCODE_MAD: {
838       if (peephole_mad_presub_bias(c, inst))
839          return 1;
840       break;
841    }
842    default:
843       break;
844    }
845    return 0;
846 }
847 
848 static unsigned int
merge_swizzles(unsigned int swz1,unsigned int swz2)849 merge_swizzles(unsigned int swz1, unsigned int swz2)
850 {
851    unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
852    for (unsigned int chan = 0; chan < 4; chan++) {
853       unsigned int swz = GET_SWZ(swz1, chan);
854       if (swz != RC_SWIZZLE_UNUSED) {
855          SET_SWZ(new_swz, chan, swz);
856          continue;
857       }
858       swz = GET_SWZ(swz2, chan);
859       SET_SWZ(new_swz, chan, swz);
860    }
861    return new_swz;
862 }
863 
864 /* Sets negate to 0 for unused channels. */
865 static unsigned int
clean_negate(struct rc_src_register src)866 clean_negate(struct rc_src_register src)
867 {
868    unsigned int new_negate = 0;
869    for (unsigned int chan = 0; chan < 4; chan++) {
870       unsigned int swz = GET_SWZ(src.Swizzle, chan);
871       if (swz != RC_SWIZZLE_UNUSED)
872          new_negate |= src.Negate & (1 << chan);
873    }
874    return new_negate;
875 }
876 
877 static unsigned int
merge_negates(struct rc_src_register src1,struct rc_src_register src2)878 merge_negates(struct rc_src_register src1, struct rc_src_register src2)
879 {
880    return clean_negate(src1) | clean_negate(src2);
881 }
882 
883 static unsigned int
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)884 fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
885 {
886    for (unsigned int chan = 0; chan < 4; chan++) {
887       unsigned int swz = GET_SWZ(orig_swz, chan);
888       if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
889          SET_SWZ(orig_swz, chan, const_swz);
890       }
891    }
892    return orig_swz;
893 }
894 
895 static int
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)896 have_shared_source(struct rc_instruction *inst1, struct rc_instruction *inst2)
897 {
898    int shared_src = -1;
899    const struct rc_opcode_info *opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
900    const struct rc_opcode_info *opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
901    for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
902       for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
903          if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
904              inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
905              inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
906              inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
907             shared_src = i;
908       }
909    }
910    return shared_src;
911 }
912 
913 /**
914  * Merges two MOVs writing different channels of the same destination register
915  * with the use of the constant swizzles.
916  */
917 static bool
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)918 merge_movs(struct radeon_compiler *c, struct rc_instruction *inst, struct rc_instruction *cur)
919 {
920    /* We can merge two MOVs into MOV if one of them is from inline constant,
921     * i.e., constant swizzles and RC_FILE_NONE).
922     *
923     * For example
924     *   MOV temp[0].x none.1___
925     *   MOV temp[0].y input[0]._x__
926     *
927     * becomes
928     *   MOV temp[0].xy input[0].1x__
929     */
930    unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
931    if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
932       struct rc_src_register src;
933       if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
934          src = inst->U.I.SrcReg[0];
935       else
936          src = cur->U.I.SrcReg[0];
937       src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
938       src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
939       if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
940          cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
941          cur->U.I.SrcReg[0] = src;
942          rc_remove_instruction(inst);
943          return true;
944       }
945    }
946 
947    /* Handle the trivial case where the MOVs share a source.
948     *
949     * For example
950     *   MOV temp[0].x const[0].x
951     *   MOV temp[0].y const[0].z
952     *
953     * becomes
954     *   MOV temp[0].xy const[0].xz
955     */
956    if (have_shared_source(inst, cur) == 0) {
957       struct rc_src_register src = cur->U.I.SrcReg[0];
958       src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
959       src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
960 
961       if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
962          cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
963          cur->U.I.SrcReg[0] = src;
964          rc_remove_instruction(inst);
965          return true;
966       }
967    }
968 
969    /* Otherwise, we can convert the MOVs into ADD.
970     *
971     * For example
972     *   MOV temp[0].x const[0].x
973     *   MOV temp[0].y input[0].y
974     *
975     * becomes
976     *   ADD temp[0].xy const[0].x0 input[0].0y
977     */
978    unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
979    struct rc_src_register src0 = inst->U.I.SrcReg[0];
980    struct rc_src_register src1 = cur->U.I.SrcReg[0];
981 
982    src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ZERO);
983    src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, RC_SWIZZLE_ZERO);
984    if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
985        !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
986       return false;
987 
988    cur->U.I.DstReg.WriteMask = wmask;
989    cur->U.I.Opcode = RC_OPCODE_ADD;
990    cur->U.I.SrcReg[0] = src0;
991    cur->U.I.SrcReg[1] = src1;
992 
993    /* finally delete the original mov */
994    rc_remove_instruction(inst);
995    return true;
996 }
997 
998 /**
999  * This function will try to merge MOV and ADD/MUL instructions with the same
1000  * destination, making use of the constant swizzles.
1001  *
1002  * For example:
1003  *   MOV temp[0].x const[0].x
1004  *   MUL temp[0].yz const[1].yz const[2].yz
1005  *
1006  * becomes
1007  *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1008  */
1009 static int
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1010 merge_mov_add_mul(struct radeon_compiler *c, struct rc_instruction *inst1,
1011                   struct rc_instruction *inst2)
1012 {
1013    struct rc_instruction *inst, *mov;
1014    if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1015       mov = inst1;
1016       inst = inst2;
1017    } else {
1018       mov = inst2;
1019       inst = inst1;
1020    }
1021 
1022    const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1023    int shared_index = have_shared_source(inst, mov);
1024    unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1025 
1026    /* If there is a shared source, just merge the swizzles and be done with it. */
1027    if (shared_index != -1) {
1028       struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1029       struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1030 
1031       shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1032       shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, mov->U.I.SrcReg[0].Swizzle);
1033       other_src.Negate = clean_negate(other_src);
1034       unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1035       other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1036 
1037       if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1038           !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1039          return 0;
1040 
1041       inst2->U.I.Opcode = inst->U.I.Opcode;
1042       inst2->U.I.SrcReg[0] = shared_src;
1043       inst2->U.I.SrcReg[1] = other_src;
1044 
1045       /* TODO: we can do a bit better in the special case when one of the sources is none.
1046        * Convert to MAD otherwise.
1047        */
1048    } else {
1049       struct rc_src_register src0, src1, src2;
1050       if (is_mul) {
1051          src2 = mov->U.I.SrcReg[0];
1052          src0 = inst->U.I.SrcReg[0];
1053          src1 = inst->U.I.SrcReg[1];
1054       } else {
1055          src0 = mov->U.I.SrcReg[0];
1056          src1 = inst->U.I.SrcReg[0];
1057          src2 = inst->U.I.SrcReg[1];
1058       }
1059       /* The following login expects that the unused channels have empty negate bits. */
1060       src0.Negate = clean_negate(src0);
1061       src1.Negate = clean_negate(src1);
1062       src2.Negate = clean_negate(src2);
1063 
1064       src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ONE);
1065       src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1066       src2.Swizzle = fill_swizzle(src2.Swizzle, wmask, RC_SWIZZLE_ZERO);
1067       if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1068           !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1069           !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1070          return 0;
1071 
1072       inst2->U.I.Opcode = RC_OPCODE_MAD;
1073       inst2->U.I.SrcReg[0] = src0;
1074       inst2->U.I.SrcReg[1] = src1;
1075       inst2->U.I.SrcReg[2] = src2;
1076    }
1077    inst2->U.I.DstReg.WriteMask = wmask;
1078    /* finally delete the original instruction */
1079    rc_remove_instruction(inst1);
1080 
1081    return 1;
1082 }
1083 
1084 /**
1085  * This function will try to merge MOV and MAD instructions with the same
1086  * destination, making use of the constant swizzles. This only works
1087  * if there is a shared source or one of the sources is RC_FILE_NONE.
1088  *
1089  * For example:
1090  *   MOV temp[0].x const[0].x
1091  *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1092  *
1093  * becomes
1094  *   MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1095  */
1096 static bool
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1097 merge_mov_mad(struct radeon_compiler *c, struct rc_instruction *inst1, struct rc_instruction *inst2)
1098 {
1099    struct rc_instruction *mov, *mad;
1100    if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1101       mov = inst1;
1102       mad = inst2;
1103    } else {
1104       mov = inst2;
1105       mad = inst1;
1106    }
1107 
1108    int shared_index = have_shared_source(mad, mov);
1109    unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1110    struct rc_src_register src[3];
1111    src[0] = mad->U.I.SrcReg[0];
1112    src[1] = mad->U.I.SrcReg[1];
1113    src[2] = mad->U.I.SrcReg[2];
1114 
1115    /* Shared source is the one for multiplication. */
1116    if (shared_index == 0 || shared_index == 1) {
1117       src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1118       src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1119       src[shared_index].Swizzle =
1120          merge_swizzles(src[shared_index].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1121       src[1 - shared_index].Swizzle =
1122          fill_swizzle(src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1123       src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1124 
1125       /* Shared source is the one for used for addition, or it is none. Additionally,
1126        * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1127        * because than we have the highest change the swizzles will be legal.
1128        */
1129    } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1130               src[2].File == RC_FILE_NONE) {
1131       src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1132       src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1133       src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1134       src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1135       if (src[2].File == RC_FILE_NONE) {
1136          src[2].File = mov->U.I.SrcReg[0].File;
1137          src[2].Index = mov->U.I.SrcReg[0].Index;
1138          src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1139          src[2].Abs = mov->U.I.SrcReg[0].Abs;
1140       }
1141 
1142       /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1143        * fill the other one with ones and the reg for addition with zeros.
1144        */
1145    } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1146       unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1147       src[none_src] = mov->U.I.SrcReg[0];
1148       src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1149       src[none_src].Swizzle =
1150          merge_swizzles(src[none_src].Swizzle, mad->U.I.SrcReg[none_src].Swizzle);
1151       src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1152       src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle, wmask, RC_SWIZZLE_ONE);
1153       src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1154    } else {
1155       return false;
1156    }
1157 
1158    if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1159        !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1160        !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1161       return false;
1162 
1163    inst2->U.I.Opcode = RC_OPCODE_MAD;
1164    inst2->U.I.SrcReg[0] = src[0];
1165    inst2->U.I.SrcReg[1] = src[1];
1166    inst2->U.I.SrcReg[2] = src[2];
1167    inst2->U.I.DstReg.WriteMask = wmask;
1168    rc_remove_instruction(inst1);
1169    return true;
1170 }
1171 
1172 static bool
inst_combination(struct rc_instruction * inst1,struct rc_instruction * inst2,rc_opcode opcode1,rc_opcode opcode2)1173 inst_combination(struct rc_instruction *inst1, struct rc_instruction *inst2, rc_opcode opcode1,
1174                  rc_opcode opcode2)
1175 {
1176    return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) ||
1177            (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2));
1178 }
1179 
1180 /**
1181  * Searches for instructions writing different channels of the same register that could
1182  * be merged together with the use of constant swizzles.
1183  *
1184  * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1185  */
1186 static void
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1187 merge_channels(struct radeon_compiler *c, struct rc_instruction *inst)
1188 {
1189    unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1190    unsigned int orig_dst_file = inst->U.I.DstReg.File;
1191    unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1192    const struct rc_opcode_info *orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1193 
1194    struct rc_instruction *cur = inst;
1195    while (cur != &c->Program.Instructions) {
1196       cur = cur->Next;
1197       const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1198 
1199       /* Keep it simple for now and stop when encountering any
1200        * control flow.
1201        */
1202       if (opcode->IsFlowControl)
1203          return;
1204 
1205       /* Stop when the original destination is overwritten */
1206       if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File &&
1207           (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1208          return;
1209 
1210       /* Stop the search when the original instruction destination
1211        * is used as a source for anything.
1212        */
1213       for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1214          if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg)
1215             return;
1216       }
1217 
1218       /* Stop the search when some of the original sources are touched. */
1219       for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1220          if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1221              inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1222             return;
1223       }
1224 
1225       if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg &&
1226           cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1227           (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1228 
1229          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1230             if (merge_movs(c, inst, cur))
1231                return;
1232          }
1233 
1234          /* Skip the merge if one of the instructions writes just w channel
1235           * and we are compiling a fragment shader. We can pair-schedule it together
1236           * later anyway and it will also give the scheduler a bit more flexibility.
1237           * Only check this after merging MOVs as when we manage to merge two MOVs
1238           * into another MOV we can still copy propagate it away. So it is a win in
1239           * that case.
1240           */
1241          if (c->has_omod &&
1242              (cur->U.I.DstReg.WriteMask == RC_MASK_W || inst->U.I.DstReg.WriteMask == RC_MASK_W))
1243             continue;
1244 
1245          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1246              inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1247             if (merge_mov_add_mul(c, inst, cur))
1248                return;
1249          }
1250 
1251          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1252             if (merge_mov_mad(c, inst, cur))
1253                return;
1254          }
1255       }
1256    }
1257 }
1258 
1259 /**
1260  * Searches for duplicate ARLs/ARRs
1261  *
1262  * Only a very trivial case is now optimized where if a second one is detected which reads from
1263  * the same register as the first one and source is the same, just remove the second one.
1264  */
1265 static void
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1266 merge_A0_loads(struct radeon_compiler *c, struct rc_instruction *inst, bool is_ARL)
1267 {
1268    unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1269    unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1270    unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1271    int cf_depth = 0;
1272 
1273    struct rc_instruction *cur = inst;
1274    while (cur != &c->Program.Instructions) {
1275       cur = cur->Next;
1276       const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1277 
1278       /* Keep it simple for now and stop when encountering any
1279        * control flow besides simple ifs.
1280        */
1281       if (opcode->IsFlowControl) {
1282          switch (cur->U.I.Opcode) {
1283          case RC_OPCODE_IF: {
1284             cf_depth++;
1285             break;
1286          }
1287          case RC_OPCODE_ELSE: {
1288             if (cf_depth < 1)
1289                return;
1290             break;
1291          }
1292          case RC_OPCODE_ENDIF: {
1293             cf_depth--;
1294             break;
1295          }
1296          default:
1297             return;
1298          }
1299       }
1300 
1301       /* Stop when the original source is overwritten */
1302       if (A0_src_reg == cur->U.I.DstReg.Index && A0_src_file == cur->U.I.DstReg.File &&
1303           cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1304          return;
1305 
1306       /* Wrong A0 load type. */
1307       if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1308           (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1309          return;
1310 
1311       if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1312          if (A0_src_reg == cur->U.I.SrcReg[0].Index && A0_src_file == cur->U.I.SrcReg[0].File &&
1313              A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1314             struct rc_instruction *next = cur->Next;
1315             rc_remove_instruction(cur);
1316             cur = next;
1317          } else {
1318             return;
1319          }
1320       }
1321    }
1322 }
1323 
1324 /**
1325  * According to the GLSL spec, round is only 1.30 and up
1326  * so the only reason why we should ever see round is if it actually
1327  * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1328  * the ARR instead of lowering the round.
1329  */
1330 static void
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1331 transform_vertex_ROUND(struct radeon_compiler *c, struct rc_instruction *inst)
1332 {
1333    struct rc_reader_data readers;
1334    readers.ExitOnAbort = 0;
1335    rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1336 
1337    assert(readers.ReaderCount > 0);
1338    for (unsigned i = 0; i < readers.ReaderCount; i++) {
1339       struct rc_instruction *reader = readers.Readers[i].Inst;
1340       if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1341          assert(!"Unable to convert ROUND+ARL to ARR\n");
1342          return;
1343       }
1344    }
1345 
1346    /* Only ARL readers, convert all to ARR */
1347    for (unsigned i = 0; i < readers.ReaderCount; i++) {
1348       readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1349    }
1350    /* Switch ROUND to MOV and let copy propagate sort it out later. */
1351    inst->U.I.Opcode = RC_OPCODE_MOV;
1352 }
1353 
1354 /**
1355  * Apply various optimizations specific to the A0 address register loads.
1356  */
1357 static void
optimize_A0_loads(struct radeon_compiler * c)1358 optimize_A0_loads(struct radeon_compiler *c)
1359 {
1360    struct rc_instruction *inst = c->Program.Instructions.Next;
1361 
1362    while (inst != &c->Program.Instructions) {
1363       struct rc_instruction *cur = inst;
1364       inst = inst->Next;
1365       if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1366          merge_A0_loads(c, cur, true);
1367       } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1368          merge_A0_loads(c, cur, false);
1369       } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1370          transform_vertex_ROUND(c, cur);
1371       }
1372    }
1373 }
1374 
1375 void
rc_optimize(struct radeon_compiler * c,void * user)1376 rc_optimize(struct radeon_compiler *c, void *user)
1377 {
1378    struct rc_instruction *inst = c->Program.Instructions.Next;
1379    while (inst != &c->Program.Instructions) {
1380       struct rc_instruction *cur = inst;
1381       inst = inst->Next;
1382       constant_folding(c, cur);
1383    }
1384 
1385    /* Copy propagate simple movs away. */
1386    inst = c->Program.Instructions.Next;
1387    while (inst != &c->Program.Instructions) {
1388       struct rc_instruction *cur = inst;
1389       inst = inst->Next;
1390       if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1391          copy_propagate(c, cur);
1392       }
1393    }
1394 
1395    if (c->type == RC_VERTEX_PROGRAM) {
1396       optimize_A0_loads(c);
1397    }
1398 
1399    /* Merge MOVs to same source in different channels using the constant
1400     * swizzle.
1401     */
1402    if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1403       inst = c->Program.Instructions.Next;
1404       while (inst != &c->Program.Instructions) {
1405          struct rc_instruction *cur = inst;
1406          inst = inst->Next;
1407          if (cur->U.I.Opcode == RC_OPCODE_MOV || cur->U.I.Opcode == RC_OPCODE_ADD ||
1408              cur->U.I.Opcode == RC_OPCODE_MAD || cur->U.I.Opcode == RC_OPCODE_MUL)
1409             merge_channels(c, cur);
1410       }
1411    }
1412 
1413    /* Copy propagate few extra movs from the merge_channels pass. */
1414    inst = c->Program.Instructions.Next;
1415    while (inst != &c->Program.Instructions) {
1416       struct rc_instruction *cur = inst;
1417       inst = inst->Next;
1418       if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1419          copy_propagate(c, cur);
1420       }
1421    }
1422 
1423    if (c->type != RC_FRAGMENT_PROGRAM) {
1424       return;
1425    }
1426 
1427    /* Output modifiers. */
1428    inst = c->Program.Instructions.Next;
1429    struct rc_list *var_list = NULL;
1430    while (inst != &c->Program.Instructions) {
1431       struct rc_instruction *cur = inst;
1432       inst = inst->Next;
1433       if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1434          if (!var_list)
1435             var_list = rc_get_variables(c);
1436          if (peephole_mul_omod(c, cur, var_list))
1437             var_list = NULL;
1438       }
1439    }
1440 }
1441