• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2009 Nicolai Haehnle.
3  * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "util/u_math.h"
8 
9 #include "radeon_dataflow.h"
10 
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16 
/**
 * Context passed to src_clobbered_reads_cb(). It describes one register
 * write (file, index and channel mask) and carries the reader data whose
 * AbortOnRead field is set when that write overlaps a source of the writer.
 */
struct src_clobbered_reads_cb_data {
   rc_register_file File;             /* register file being written */
   unsigned int Index;                /* register index being written */
   unsigned int Mask;                 /* channels being written */
   struct rc_reader_data *ReaderData; /* reader scan state to flag on overlap */
};
23 
/**
 * Callback invoked by presub_helper() to rewrite one source of a reader.
 * Arguments, in order: the instruction whose result is being replaced by a
 * presubtract operation, the reader instruction, and the index of the
 * reader's source register that has to be replaced.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *, struct rc_instruction *,
                                     unsigned int);
26 
27 static struct rc_src_register
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30    struct rc_src_register combine;
31    combine.File = inner.File;
32    combine.Index = inner.Index;
33    combine.RelAddr = inner.RelAddr;
34    if (outer.Abs) {
35       combine.Abs = 1;
36       combine.Negate = outer.Negate;
37    } else {
38       combine.Abs = inner.Abs;
39       combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40       combine.Negate ^= outer.Negate;
41    }
42    combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43    return combine;
44 }
45 
46 static void
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)47 copy_propagate_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
48 {
49    rc_register_file file = src->File;
50    struct rc_reader_data *reader_data = data;
51 
52    if (!rc_inst_can_use_presub(reader_data->C, inst, reader_data->Writer->U.I.PreSub.Opcode,
53                                rc_swizzle_to_writemask(src->Swizzle), src,
54                                &reader_data->Writer->U.I.PreSub.SrcReg[0],
55                                &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
56       reader_data->Abort = 1;
57       return;
58    }
59 
60    /* XXX This could probably be handled better. */
61    if (file == RC_FILE_ADDRESS) {
62       reader_data->Abort = 1;
63       return;
64    }
65 
66    /* R300/R400 is unhappy about propagating
67     *  0: MOV temp[1], -none.1111;
68     *  1: KIL temp[1];
69     * to
70     *  0: KIL -none.1111;
71     *
72     * R500 is fine with it.
73     */
74    if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
75        reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
76       reader_data->Abort = 1;
77       return;
78    }
79 
80    /* These instructions cannot read from the constants file.
81     * see radeonTransformTEX()
82     */
83    if (reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
84        reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
85        reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
86        (inst->U.I.Opcode == RC_OPCODE_TEX || inst->U.I.Opcode == RC_OPCODE_TXB ||
87         inst->U.I.Opcode == RC_OPCODE_TXP || inst->U.I.Opcode == RC_OPCODE_TXD ||
88         inst->U.I.Opcode == RC_OPCODE_TXL || inst->U.I.Opcode == RC_OPCODE_KIL)) {
89       reader_data->Abort = 1;
90       return;
91    }
92 }
93 
94 static void
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)95 src_clobbered_reads_cb(void *data, struct rc_instruction *inst, struct rc_src_register *src)
96 {
97    struct src_clobbered_reads_cb_data *sc_data = data;
98 
99    if (src->File == sc_data->File && src->Index == sc_data->Index &&
100        (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
101 
102       sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
103    }
104 
105    if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
106       sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
107    }
108 }
109 
110 static void
is_src_clobbered_scan_write(void * data,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)111 is_src_clobbered_scan_write(void *data, struct rc_instruction *inst, rc_register_file file,
112                             unsigned int index, unsigned int mask)
113 {
114    struct src_clobbered_reads_cb_data sc_data;
115    struct rc_reader_data *reader_data = data;
116    sc_data.File = file;
117    sc_data.Index = index;
118    sc_data.Mask = mask;
119    sc_data.ReaderData = reader_data;
120    rc_for_all_reads_src(reader_data->Writer, src_clobbered_reads_cb, &sc_data);
121 }
122 
123 static void
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)124 copy_propagate(struct radeon_compiler *c, struct rc_instruction *inst_mov)
125 {
126    struct rc_reader_data reader_data;
127    unsigned int i;
128 
129    if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
130       return;
131 
132    /* Get a list of all the readers of this MOV instruction. */
133    reader_data.ExitOnAbort = 1;
134    rc_get_readers(c, inst_mov, &reader_data, copy_propagate_scan_read, NULL,
135                   is_src_clobbered_scan_write);
136 
137    if (reader_data.Abort || reader_data.ReaderCount == 0)
138       return;
139 
140    /* We can propagate SaturateMode if all the readers are MOV instructions
141     * without a presubtract operation, source negation and absolute.
142     * In that case, we just move SaturateMode to all readers. */
143    if (inst_mov->U.I.SaturateMode) {
144       for (i = 0; i < reader_data.ReaderCount; i++) {
145          struct rc_instruction *inst = reader_data.Readers[i].Inst;
146 
147          if (inst->U.I.Opcode != RC_OPCODE_MOV || inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
148              inst->U.I.SrcReg[0].Abs || inst->U.I.SrcReg[0].Negate) {
149             return;
150          }
151       }
152    }
153 
154    /* Propagate the MOV instruction. */
155    for (i = 0; i < reader_data.ReaderCount; i++) {
156       struct rc_instruction *inst = reader_data.Readers[i].Inst;
157       *reader_data.Readers[i].U.I.Src =
158          chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
159 
160       if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
161          inst->U.I.PreSub = inst_mov->U.I.PreSub;
162       if (!inst->U.I.SaturateMode)
163          inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
164    }
165 
166    /* Finally, remove the original MOV instruction */
167    rc_remove_instruction(inst_mov);
168 }
169 
170 /**
171  * Check if a source register is actually always the same
172  * swizzle constant.
173  */
174 static int
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)175 is_src_uniform_constant(struct rc_src_register src, rc_swizzle *pswz, unsigned int *pnegate)
176 {
177    int have_used = 0;
178 
179    if (src.File != RC_FILE_NONE) {
180       *pswz = 0;
181       return 0;
182    }
183 
184    for (unsigned int chan = 0; chan < 4; ++chan) {
185       unsigned int swz = GET_SWZ(src.Swizzle, chan);
186       if (swz < 4) {
187          *pswz = 0;
188          return 0;
189       }
190       if (swz == RC_SWIZZLE_UNUSED)
191          continue;
192 
193       if (!have_used) {
194          *pswz = swz;
195          *pnegate = GET_BIT(src.Negate, chan);
196          have_used = 1;
197       } else {
198          if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
199             *pswz = 0;
200             return 0;
201          }
202       }
203    }
204 
205    return 1;
206 }
207 
208 /**
209  * Replace 0.0, 1.0 and 0.5 immediate constants by their
210  * respective swizzles. Simplify instructions like ADD dst, src, 0;
211  */
212 static void
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)213 constant_folding(struct radeon_compiler *c, struct rc_instruction *inst)
214 {
215    const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
216    unsigned int i;
217 
218    /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
219    for (unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
220       struct rc_constant *constant;
221       struct rc_src_register newsrc;
222       int have_real_reference;
223       unsigned int chan;
224 
225       /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
226       for (chan = 0; chan < 4; ++chan)
227          if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
228             break;
229       if (chan == 4) {
230          inst->U.I.SrcReg[src].File = RC_FILE_NONE;
231          continue;
232       }
233 
234       /* Convert immediates to swizzles. */
235       if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT || inst->U.I.SrcReg[src].RelAddr ||
236           inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
237          continue;
238 
239       constant = &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
240 
241       if (constant->Type != RC_CONSTANT_IMMEDIATE)
242          continue;
243 
244       newsrc = inst->U.I.SrcReg[src];
245       have_real_reference = 0;
246       for (chan = 0; chan < 4; ++chan) {
247          unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
248          unsigned int newswz;
249          float imm;
250          float baseimm;
251 
252          if (swz >= 4)
253             continue;
254 
255          imm = constant->u.Immediate[swz];
256          baseimm = imm;
257          if (imm < 0.0)
258             baseimm = -baseimm;
259 
260          if (baseimm == 0.0) {
261             newswz = RC_SWIZZLE_ZERO;
262          } else if (baseimm == 1.0) {
263             newswz = RC_SWIZZLE_ONE;
264          } else if (baseimm == 0.5 && c->has_half_swizzles) {
265             newswz = RC_SWIZZLE_HALF;
266          } else {
267             have_real_reference = 1;
268             continue;
269          }
270 
271          SET_SWZ(newsrc.Swizzle, chan, newswz);
272          if (imm < 0.0 && !newsrc.Abs)
273             newsrc.Negate ^= 1 << chan;
274       }
275 
276       if (!have_real_reference) {
277          newsrc.File = RC_FILE_NONE;
278          newsrc.Index = 0;
279       }
280 
281       /* don't make the swizzle worse */
282       if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
283          continue;
284 
285       inst->U.I.SrcReg[src] = newsrc;
286    }
287 
288    /* In case this instruction has been converted, make sure all of the
289     * registers that are no longer used are empty. */
290    opcode = rc_get_opcode_info(inst->U.I.Opcode);
291    for (i = opcode->NumSrcRegs; i < 3; i++) {
292       memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
293    }
294 }
295 
296 /**
297  * If src and dst use the same register, this function returns a writemask that
298  * indicates which components are read by src.  Otherwise zero is returned.
299  */
300 static unsigned int
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)301 src_reads_dst_mask(struct rc_src_register src, struct rc_dst_register dst)
302 {
303    if (dst.File != src.File || dst.Index != src.Index) {
304       return 0;
305    }
306    return rc_swizzle_to_writemask(src.Swizzle);
307 }
308 
309 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
310  * in any of its channels.  Return 0 otherwise. */
311 static int
src_has_const_swz(struct rc_src_register src)312 src_has_const_swz(struct rc_src_register src)
313 {
314    int chan;
315    for (chan = 0; chan < 4; chan++) {
316       unsigned int swz = GET_SWZ(src.Swizzle, chan);
317       if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF || swz == RC_SWIZZLE_ONE) {
318          return 1;
319       }
320    }
321    return 0;
322 }
323 
324 static void
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)325 presub_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
326 {
327    struct rc_reader_data *reader_data = data;
328    rc_presubtract_op *presub_opcode = reader_data->CbData;
329 
330    if (!rc_inst_can_use_presub(
331           reader_data->C, inst, *presub_opcode, reader_data->Writer->U.I.DstReg.WriteMask, src,
332           &reader_data->Writer->U.I.SrcReg[0], &reader_data->Writer->U.I.SrcReg[1])) {
333       reader_data->Abort = 1;
334       return;
335    }
336 }
337 
338 static int
presub_helper(struct radeon_compiler * c,struct rc_instruction * inst_add,rc_presubtract_op presub_opcode,rc_presub_replace_fn presub_replace)339 presub_helper(struct radeon_compiler *c, struct rc_instruction *inst_add,
340               rc_presubtract_op presub_opcode, rc_presub_replace_fn presub_replace)
341 {
342    struct rc_reader_data reader_data;
343    unsigned int i;
344    rc_presubtract_op cb_op = presub_opcode;
345 
346    reader_data.CbData = &cb_op;
347    reader_data.ExitOnAbort = 1;
348    rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL, is_src_clobbered_scan_write);
349 
350    if (reader_data.Abort || reader_data.ReaderCount == 0)
351       return 0;
352 
353    for (i = 0; i < reader_data.ReaderCount; i++) {
354       unsigned int src_index;
355       struct rc_reader reader = reader_data.Readers[i];
356       const struct rc_opcode_info *info = rc_get_opcode_info(reader.Inst->U.I.Opcode);
357 
358       for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
359          if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
360             presub_replace(inst_add, reader.Inst, src_index);
361       }
362    }
363    return 1;
364 }
365 
366 static void
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)367 presub_replace_add(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
368                    unsigned int src_index)
369 {
370    rc_presubtract_op presub_opcode;
371 
372    unsigned int negates = 0;
373    if (inst_add->U.I.SrcReg[0].Negate)
374       negates++;
375    if (inst_add->U.I.SrcReg[1].Negate)
376       negates++;
377    assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
378 
379    if (negates == 1)
380       presub_opcode = RC_PRESUB_SUB;
381    else
382       presub_opcode = RC_PRESUB_ADD;
383 
384    if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
385       inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
386       inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
387    } else {
388       inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
389       inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
390    }
391    /* If both sources are negative we can move the negate to the presub. */
392    unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
393    inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
394    inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
395    inst_reader->U.I.PreSub.Opcode = presub_opcode;
396    inst_reader->U.I.SrcReg[src_index] =
397       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
398    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
399    inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
400 }
401 
402 static int
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)403 is_presub_candidate(struct radeon_compiler *c, struct rc_instruction *inst)
404 {
405    const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
406    unsigned int i;
407    unsigned int is_constant[2] = {0, 0};
408 
409    assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
410 
411    if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode ||
412        inst->U.I.WriteALUResult || inst->U.I.Omod) {
413       return 0;
414    }
415 
416    /* If first two sources use a constant swizzle, then we can't convert it to
417     * a presubtract operation.  In fact for the ADD and SUB presubtract
418     * operations neither source can contain a constant swizzle.  This
419     * specific case is checked in peephole_add_presub_add() when
420     * we make sure the swizzles for both sources are equal, so we
421     * don't need to worry about it here. */
422    for (i = 0; i < 2; i++) {
423       int chan;
424       for (chan = 0; chan < 4; chan++) {
425          rc_swizzle swz = get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
426          if (swz == RC_SWIZZLE_ONE || swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF) {
427             is_constant[i] = 1;
428          }
429       }
430    }
431    if (is_constant[0] && is_constant[1])
432       return 0;
433 
434    for (i = 0; i < info->NumSrcRegs; i++) {
435       struct rc_src_register src = inst->U.I.SrcReg[i];
436       if (src_reads_dst_mask(src, inst->U.I.DstReg))
437          return 0;
438 
439       src.File = RC_FILE_PRESUB;
440       if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
441          return 0;
442    }
443    return 1;
444 }
445 
446 static int
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)447 peephole_add_presub_add(struct radeon_compiler *c, struct rc_instruction *inst_add)
448 {
449    unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
450    unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
451    unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
452 
453    if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
454       return 0;
455 
456    /* src0 and src1 can't have absolute values */
457    if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
458       return 0;
459 
460    /* if src0 is negative, at least all bits of dstmask have to be set */
461    if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
462       return 0;
463 
464    /* if src1 is negative, at least all bits of dstmask have to be set */
465    if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
466       return 0;
467 
468    if (!is_presub_candidate(c, inst_add))
469       return 0;
470 
471    if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
472       rc_remove_instruction(inst_add);
473       return 1;
474    }
475    return 0;
476 }
477 
478 static void
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)479 presub_replace_inv(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
480                    unsigned int src_index)
481 {
482    /* We must be careful not to modify inst_add, since it
483     * is possible it will remain part of the program.*/
484    inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
485    inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
486    inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
487    inst_reader->U.I.SrcReg[src_index] =
488       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
489 
490    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
491    inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
492 }
493 
494 static void
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)495 presub_replace_bias(struct rc_instruction *inst_mad, struct rc_instruction *inst_reader,
496                     unsigned int src_index)
497 {
498    /* We must be careful not to modify inst_mad, since it
499     * is possible it will remain part of the program.*/
500    inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
501    inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
502    inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
503    inst_reader->U.I.SrcReg[src_index] =
504       chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
505 
506    inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
507    inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
508 }
509 
510 /**
511  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
512  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
513  * of the add instruction must have the constant 1 swizzle.  This function
514  * does not check const registers to see if their value is 1.0, so it should
515  * be called after the constant_folding optimization.
516  * @return
517  * 	0 if the ADD instruction is still part of the program.
518  * 	1 if the ADD instruction is no longer part of the program.
519  */
520 static int
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)521 peephole_add_presub_inv(struct radeon_compiler *c, struct rc_instruction *inst_add)
522 {
523    unsigned int i, swz;
524 
525    if (!is_presub_candidate(c, inst_add))
526       return 0;
527 
528    /* Check if src0 is 1. */
529    /* XXX It would be nice to use is_src_uniform_constant here, but that
530     * function only works if the register's file is RC_FILE_NONE */
531    for (i = 0; i < 4; i++) {
532       if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
533          continue;
534 
535       swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
536       if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
537          return 0;
538    }
539 
540    /* Check src1. */
541    if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
542           inst_add->U.I.DstReg.WriteMask ||
543        inst_add->U.I.SrcReg[1].Abs || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
544 
545       return 0;
546    }
547 
548    if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
549       rc_remove_instruction(inst_add);
550       return 1;
551    }
552    return 0;
553 }
554 
555 /**
556  * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
557  * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The first source
558  * of the add instruction must have the constant 1 swizzle.  This function
559  * does not check const registers to see if their value is 1.0, so it should
560  * be called after the constant_folding optimization.
561  * @return
562  * 	0 if the MAD instruction is still part of the program.
563  * 	1 if the MAD instruction is no longer part of the program.
564  */
565 static int
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)566 peephole_mad_presub_bias(struct radeon_compiler *c, struct rc_instruction *inst_mad)
567 {
568    unsigned int i, swz;
569 
570    if (!is_presub_candidate(c, inst_mad))
571       return 0;
572 
573    /* Check if src2 is 1. */
574    for (i = 0; i < 4; i++) {
575       if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
576          continue;
577 
578       swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
579       if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
580          return 0;
581    }
582 
583    /* Check if src1 is 2. */
584    struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
585    if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
586       return 0;
587    if (src1_reg.File == RC_FILE_INLINE) {
588       if (rc_inline_to_float(src1_reg.Index) != 2.0f)
589          return 0;
590    } else {
591       if (src1_reg.File != RC_FILE_CONSTANT)
592          return 0;
593 
594       struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
595       if (constant->Type != RC_CONSTANT_IMMEDIATE)
596          return 0;
597       for (i = 0; i < 4; i++) {
598          if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
599             continue;
600          swz = GET_SWZ(src1_reg.Swizzle, i);
601          if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
602             return 0;
603       }
604    }
605 
606    /* Check src0. */
607    if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
608           inst_mad->U.I.DstReg.WriteMask ||
609        inst_mad->U.I.SrcReg[0].Abs || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
610 
611       return 0;
612    }
613 
614    if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
615       rc_remove_instruction(inst_mad);
616       return 1;
617    }
618    return 0;
619 }
620 
/**
 * State shared by omod_filter_reader_cb() and omod_filter_writer_cb() while
 * peephole_mul_omod() scans the instructions preceding a MUL.
 */
struct peephole_mul_cb_data {
   struct rc_dst_register *Writer; /* destination register of the MUL */
   unsigned int Clobbered;         /* set to 1 once a scanned access overlaps Writer */
};
625 
626 static void
omod_filter_reader_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)627 omod_filter_reader_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
628                       unsigned int index, unsigned int mask)
629 {
630    struct peephole_mul_cb_data *d = userdata;
631    if (rc_src_reads_dst_mask(file, mask, index, d->Writer->File, d->Writer->Index,
632                              d->Writer->WriteMask)) {
633 
634       d->Clobbered = 1;
635    }
636 }
637 
638 static void
omod_filter_writer_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)639 omod_filter_writer_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
640                       unsigned int index, unsigned int mask)
641 {
642    struct peephole_mul_cb_data *d = userdata;
643    if (file == d->Writer->File && index == d->Writer->Index && (mask & d->Writer->WriteMask)) {
644       d->Clobbered = 1;
645    }
646 }
647 
648 static int
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)649 peephole_mul_omod(struct radeon_compiler *c, struct rc_instruction *inst_mul,
650                   struct rc_list *var_list)
651 {
652    unsigned int chan = 0, swz, i;
653    int const_index = -1;
654    int temp_index = -1;
655    float const_value;
656    rc_omod_op omod_op = RC_OMOD_DISABLE;
657    struct rc_list *writer_list;
658    struct rc_variable *var;
659    struct peephole_mul_cb_data cb_data;
660    unsigned writemask_sum;
661 
662    for (i = 0; i < 2; i++) {
663       unsigned int j;
664       if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT &&
665           inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY &&
666           inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
667          return 0;
668       }
669 
670       /* The only relevant case with constant swizzles we should check for
671        * is multiply by one half.
672        */
673       if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
674          for (j = 0; j < 4; j++) {
675             swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
676             if (swz == RC_SWIZZLE_UNUSED) {
677                continue;
678             }
679             if (swz != RC_SWIZZLE_HALF) {
680                return 0;
681             } else {
682                omod_op = RC_OMOD_DIV_2;
683             }
684          }
685       }
686 
687       if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
688          if (temp_index != -1) {
689             /* The instruction has two temp sources */
690             return 0;
691          } else {
692             temp_index = i;
693             continue;
694          }
695       }
696       /* If we get this far Src[i] must be a constant src */
697       if (inst_mul->U.I.SrcReg[i].Negate) {
698          return 0;
699       }
700       /* The constant src needs to read from the same swizzle */
701       swz = RC_SWIZZLE_UNUSED;
702       chan = 0;
703       for (j = 0; j < 4; j++) {
704          unsigned int j_swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
705          if (j_swz == RC_SWIZZLE_UNUSED) {
706             continue;
707          }
708          if (swz == RC_SWIZZLE_UNUSED) {
709             swz = j_swz;
710             chan = j;
711          } else if (j_swz != swz) {
712             return 0;
713          }
714       }
715 
716       if (const_index != -1) {
717          /* The instruction has two constant sources */
718          return 0;
719       } else {
720          const_index = i;
721       }
722    }
723 
724    if (omod_op == RC_OMOD_DISABLE) {
725       if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
726                                    inst_mul->U.I.SrcReg[const_index].Index)) {
727          return 0;
728       }
729       const_value = rc_get_constant_value(c, inst_mul->U.I.SrcReg[const_index].Index,
730                                           inst_mul->U.I.SrcReg[const_index].Swizzle,
731                                           inst_mul->U.I.SrcReg[const_index].Negate, chan);
732 
733       if (const_value == 2.0f) {
734          omod_op = RC_OMOD_MUL_2;
735       } else if (const_value == 4.0f) {
736          omod_op = RC_OMOD_MUL_4;
737       } else if (const_value == 8.0f) {
738          omod_op = RC_OMOD_MUL_8;
739       } else if (const_value == (1.0f / 2.0f)) {
740          omod_op = RC_OMOD_DIV_2;
741       } else if (const_value == (1.0f / 4.0f)) {
742          omod_op = RC_OMOD_DIV_4;
743       } else if (const_value == (1.0f / 8.0f)) {
744          omod_op = RC_OMOD_DIV_8;
745       } else {
746          return 0;
747       }
748    }
749 
750    writer_list = rc_variable_list_get_writers_one_reader(var_list, RC_INSTRUCTION_NORMAL,
751                                                          &inst_mul->U.I.SrcReg[temp_index]);
752 
753    if (!writer_list) {
754       return 0;
755    }
756 
757    cb_data.Clobbered = 0;
758    cb_data.Writer = &inst_mul->U.I.DstReg;
759    for (var = writer_list->Item; var; var = var->Friend) {
760       struct rc_instruction *inst;
761       const struct rc_opcode_info *info = rc_get_opcode_info(var->Inst->U.I.Opcode);
762       if (info->HasTexture) {
763          return 0;
764       }
765       if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
766          return 0;
767       }
768 
769       /* Empirical testing shows that DDX/DDY directly into output
770        * with non-identity omod is problematic.
771        */
772       if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
773           inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
774          return 0;
775       }
776 
777       for (inst = inst_mul->Prev; inst != var->Inst; inst = inst->Prev) {
778          rc_for_all_reads_mask(inst, omod_filter_reader_cb, &cb_data);
779          rc_for_all_writes_mask(inst, omod_filter_writer_cb, &cb_data);
780          if (cb_data.Clobbered) {
781             break;
782          }
783       }
784    }
785 
786    if (cb_data.Clobbered) {
787       return 0;
788    }
789 
790    writemask_sum = rc_variable_writemask_sum(writer_list->Item);
791 
792    /* rc_normal_rewrite_writemask can't expand a previous writemask to store
793     * more channels replicated.
794     */
795    if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
796       return 0;
797 
798    /* Rewrite the instructions */
799    for (var = writer_list->Item; var; var = var->Friend) {
800       struct rc_variable *writer = var;
801       unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
802       for (chan = 0; chan < 4; chan++) {
803          unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
804          if (swz <= RC_SWIZZLE_W)
805             SET_SWZ(conversion_swizzle, swz, chan);
806       }
807       writer->Inst->U.I.Omod = omod_op;
808       writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
809       writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
810       rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
811       writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
812    }
813 
814    rc_remove_instruction(inst_mul);
815 
816    return 1;
817 }
818 
819 /**
820  * @return
821  * 	0 if inst is still part of the program.
822  * 	1 if inst is no longer part of the program.
823  */
824 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)825 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
826 {
827    switch (inst->U.I.Opcode) {
828    case RC_OPCODE_ADD: {
829       if (peephole_add_presub_inv(c, inst))
830          return 1;
831       if (peephole_add_presub_add(c, inst))
832          return 1;
833       break;
834    }
835    case RC_OPCODE_MAD: {
836       if (peephole_mad_presub_bias(c, inst))
837          return 1;
838       break;
839    }
840    default:
841       break;
842    }
843    return 0;
844 }
845 
846 static unsigned int
merge_swizzles(unsigned int swz1,unsigned int swz2)847 merge_swizzles(unsigned int swz1, unsigned int swz2)
848 {
849    unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
850    for (unsigned int chan = 0; chan < 4; chan++) {
851       unsigned int swz = GET_SWZ(swz1, chan);
852       if (swz != RC_SWIZZLE_UNUSED) {
853          SET_SWZ(new_swz, chan, swz);
854          continue;
855       }
856       swz = GET_SWZ(swz2, chan);
857       SET_SWZ(new_swz, chan, swz);
858    }
859    return new_swz;
860 }
861 
862 /* Sets negate to 0 for unused channels. */
863 static unsigned int
clean_negate(struct rc_src_register src)864 clean_negate(struct rc_src_register src)
865 {
866    unsigned int new_negate = 0;
867    for (unsigned int chan = 0; chan < 4; chan++) {
868       unsigned int swz = GET_SWZ(src.Swizzle, chan);
869       if (swz != RC_SWIZZLE_UNUSED)
870          new_negate |= src.Negate & (1 << chan);
871    }
872    return new_negate;
873 }
874 
875 static unsigned int
merge_negates(struct rc_src_register src1,struct rc_src_register src2)876 merge_negates(struct rc_src_register src1, struct rc_src_register src2)
877 {
878    return clean_negate(src1) | clean_negate(src2);
879 }
880 
881 static unsigned int
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)882 fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
883 {
884    for (unsigned int chan = 0; chan < 4; chan++) {
885       unsigned int swz = GET_SWZ(orig_swz, chan);
886       if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
887          SET_SWZ(orig_swz, chan, const_swz);
888       }
889    }
890    return orig_swz;
891 }
892 
893 static int
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)894 have_shared_source(struct rc_instruction *inst1, struct rc_instruction *inst2)
895 {
896    int shared_src = -1;
897    const struct rc_opcode_info *opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
898    const struct rc_opcode_info *opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
899    for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
900       for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
901          if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
902              inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
903              inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
904              inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
905             shared_src = i;
906       }
907    }
908    return shared_src;
909 }
910 
911 /**
912  * Merges two MOVs writing different channels of the same destination register
913  * with the use of the constant swizzles.
914  */
915 static bool
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)916 merge_movs(struct radeon_compiler *c, struct rc_instruction *inst, struct rc_instruction *cur)
917 {
918    /* We can merge two MOVs into MOV if one of them is from inline constant,
919     * i.e., constant swizzles and RC_FILE_NONE).
920     *
921     * For example
922     *   MOV temp[0].x none.1___
923     *   MOV temp[0].y input[0]._x__
924     *
925     * becomes
926     *   MOV temp[0].xy input[0].1x__
927     */
928    unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
929    if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
930       struct rc_src_register src;
931       if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
932          src = inst->U.I.SrcReg[0];
933       else
934          src = cur->U.I.SrcReg[0];
935       src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
936       src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
937       if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
938          cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
939          cur->U.I.SrcReg[0] = src;
940          rc_remove_instruction(inst);
941          return true;
942       }
943    }
944 
945    /* Handle the trivial case where the MOVs share a source.
946     *
947     * For example
948     *   MOV temp[0].x const[0].x
949     *   MOV temp[0].y const[0].z
950     *
951     * becomes
952     *   MOV temp[0].xy const[0].xz
953     */
954    if (have_shared_source(inst, cur) == 0) {
955       struct rc_src_register src = cur->U.I.SrcReg[0];
956       src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
957       src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
958 
959       if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
960          cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
961          cur->U.I.SrcReg[0] = src;
962          rc_remove_instruction(inst);
963          return true;
964       }
965    }
966 
967    /* Otherwise, we can convert the MOVs into ADD.
968     *
969     * For example
970     *   MOV temp[0].x const[0].x
971     *   MOV temp[0].y input[0].y
972     *
973     * becomes
974     *   ADD temp[0].xy const[0].x0 input[0].0y
975     */
976    unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
977    struct rc_src_register src0 = inst->U.I.SrcReg[0];
978    struct rc_src_register src1 = cur->U.I.SrcReg[0];
979 
980    src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ZERO);
981    src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, RC_SWIZZLE_ZERO);
982    if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
983        !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
984       return false;
985 
986    cur->U.I.DstReg.WriteMask = wmask;
987    cur->U.I.Opcode = RC_OPCODE_ADD;
988    cur->U.I.SrcReg[0] = src0;
989    cur->U.I.SrcReg[1] = src1;
990 
991    /* finally delete the original mov */
992    rc_remove_instruction(inst);
993    return true;
994 }
995 
996 /**
997  * This function will try to merge MOV and ADD/MUL instructions with the same
998  * destination, making use of the constant swizzles.
999  *
1000  * For example:
1001  *   MOV temp[0].x const[0].x
1002  *   MUL temp[0].yz const[1].yz const[2].yz
1003  *
1004  * becomes
1005  *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1006  */
1007 static int
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1008 merge_mov_add_mul(struct radeon_compiler *c, struct rc_instruction *inst1,
1009                   struct rc_instruction *inst2)
1010 {
1011    struct rc_instruction *inst, *mov;
1012    if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1013       mov = inst1;
1014       inst = inst2;
1015    } else {
1016       mov = inst2;
1017       inst = inst1;
1018    }
1019 
1020    const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1021    int shared_index = have_shared_source(inst, mov);
1022    unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1023 
1024    /* If there is a shared source, just merge the swizzles and be done with it. */
1025    if (shared_index != -1) {
1026       struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1027       struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1028 
1029       shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1030       shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, mov->U.I.SrcReg[0].Swizzle);
1031       other_src.Negate = clean_negate(other_src);
1032       unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1033       other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1034 
1035       if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1036           !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1037          return 0;
1038 
1039       inst2->U.I.Opcode = inst->U.I.Opcode;
1040       inst2->U.I.SrcReg[0] = shared_src;
1041       inst2->U.I.SrcReg[1] = other_src;
1042 
1043       /* TODO: we can do a bit better in the special case when one of the sources is none.
1044        * Convert to MAD otherwise.
1045        */
1046    } else {
1047       struct rc_src_register src0, src1, src2;
1048       if (is_mul) {
1049          src2 = mov->U.I.SrcReg[0];
1050          src0 = inst->U.I.SrcReg[0];
1051          src1 = inst->U.I.SrcReg[1];
1052       } else {
1053          src0 = mov->U.I.SrcReg[0];
1054          src1 = inst->U.I.SrcReg[0];
1055          src2 = inst->U.I.SrcReg[1];
1056       }
1057       /* The following login expects that the unused channels have empty negate bits. */
1058       src0.Negate = clean_negate(src0);
1059       src1.Negate = clean_negate(src1);
1060       src2.Negate = clean_negate(src2);
1061 
1062       src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ONE);
1063       src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1064       src2.Swizzle = fill_swizzle(src2.Swizzle, wmask, RC_SWIZZLE_ZERO);
1065       if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1066           !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1067           !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1068          return 0;
1069 
1070       inst2->U.I.Opcode = RC_OPCODE_MAD;
1071       inst2->U.I.SrcReg[0] = src0;
1072       inst2->U.I.SrcReg[1] = src1;
1073       inst2->U.I.SrcReg[2] = src2;
1074    }
1075    inst2->U.I.DstReg.WriteMask = wmask;
1076    /* finally delete the original instruction */
1077    rc_remove_instruction(inst1);
1078 
1079    return 1;
1080 }
1081 
1082 /**
1083  * This function will try to merge MOV and MAD instructions with the same
1084  * destination, making use of the constant swizzles. This only works
1085  * if there is a shared source or one of the sources is RC_FILE_NONE.
1086  *
1087  * For example:
1088  *   MOV temp[0].x const[0].x
1089  *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1090  *
1091  * becomes
1092  *   MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1093  */
1094 static bool
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1095 merge_mov_mad(struct radeon_compiler *c, struct rc_instruction *inst1, struct rc_instruction *inst2)
1096 {
1097    struct rc_instruction *mov, *mad;
1098    if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1099       mov = inst1;
1100       mad = inst2;
1101    } else {
1102       mov = inst2;
1103       mad = inst1;
1104    }
1105 
1106    int shared_index = have_shared_source(mad, mov);
1107    unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1108    struct rc_src_register src[3];
1109    src[0] = mad->U.I.SrcReg[0];
1110    src[1] = mad->U.I.SrcReg[1];
1111    src[2] = mad->U.I.SrcReg[2];
1112 
1113    /* Shared source is the one for multiplication. */
1114    if (shared_index == 0 || shared_index == 1) {
1115       src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1116       src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1117       src[shared_index].Swizzle =
1118          merge_swizzles(src[shared_index].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1119       src[1 - shared_index].Swizzle =
1120          fill_swizzle(src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1121       src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1122 
1123       /* Shared source is the one for used for addition, or it is none. Additionally,
1124        * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1125        * because than we have the highest change the swizzles will be legal.
1126        */
1127    } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1128               src[2].File == RC_FILE_NONE) {
1129       src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1130       src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1131       src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1132       src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1133       if (src[2].File == RC_FILE_NONE) {
1134          src[2].File = mov->U.I.SrcReg[0].File;
1135          src[2].Index = mov->U.I.SrcReg[0].Index;
1136          src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1137          src[2].Abs = mov->U.I.SrcReg[0].Abs;
1138       }
1139 
1140       /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1141        * fill the other one with ones and the reg for addition with zeros.
1142        */
1143    } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1144       unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1145       src[none_src] = mov->U.I.SrcReg[0];
1146       src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1147       src[none_src].Swizzle =
1148          merge_swizzles(src[none_src].Swizzle, mad->U.I.SrcReg[none_src].Swizzle);
1149       src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1150       src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle, wmask, RC_SWIZZLE_ONE);
1151       src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1152    } else {
1153       return false;
1154    }
1155 
1156    if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1157        !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1158        !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1159       return false;
1160 
1161    inst2->U.I.Opcode = RC_OPCODE_MAD;
1162    inst2->U.I.SrcReg[0] = src[0];
1163    inst2->U.I.SrcReg[1] = src[1];
1164    inst2->U.I.SrcReg[2] = src[2];
1165    inst2->U.I.DstReg.WriteMask = wmask;
1166    rc_remove_instruction(inst1);
1167    return true;
1168 }
1169 
1170 static bool
inst_combination(struct rc_instruction * inst1,struct rc_instruction * inst2,rc_opcode opcode1,rc_opcode opcode2)1171 inst_combination(struct rc_instruction *inst1, struct rc_instruction *inst2, rc_opcode opcode1,
1172                  rc_opcode opcode2)
1173 {
1174    return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) ||
1175            (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2));
1176 }
1177 
1178 /**
1179  * Searches for instructions writing different channels of the same register that could
1180  * be merged together with the use of constant swizzles.
1181  *
1182  * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1183  */
1184 static void
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1185 merge_channels(struct radeon_compiler *c, struct rc_instruction *inst)
1186 {
1187    unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1188    unsigned int orig_dst_file = inst->U.I.DstReg.File;
1189    unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1190    const struct rc_opcode_info *orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1191 
1192    struct rc_instruction *cur = inst;
1193    while (cur != &c->Program.Instructions) {
1194       cur = cur->Next;
1195       const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1196 
1197       /* Keep it simple for now and stop when encountering any
1198        * control flow.
1199        */
1200       if (opcode->IsFlowControl)
1201          return;
1202 
1203       /* Stop when the original destination is overwritten */
1204       if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File &&
1205           (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1206          return;
1207 
1208       /* Stop the search when the original instruction destination
1209        * is used as a source for anything.
1210        */
1211       for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1212          if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg)
1213             return;
1214       }
1215 
1216       /* Stop the search when some of the original sources are touched. */
1217       for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1218          if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1219              inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1220             return;
1221       }
1222 
1223       if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg &&
1224           cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1225           (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1226 
1227          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1228             if (merge_movs(c, inst, cur))
1229                return;
1230          }
1231 
1232          /* Skip the merge if one of the instructions writes just w channel
1233           * and we are compiling a fragment shader. We can pair-schedule it together
1234           * later anyway and it will also give the scheduler a bit more flexibility.
1235           * Only check this after merging MOVs as when we manage to merge two MOVs
1236           * into another MOV we can still copy propagate it away. So it is a win in
1237           * that case.
1238           */
1239          if (c->has_omod &&
1240              (cur->U.I.DstReg.WriteMask == RC_MASK_W || inst->U.I.DstReg.WriteMask == RC_MASK_W))
1241             continue;
1242 
1243          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1244              inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1245             if (merge_mov_add_mul(c, inst, cur))
1246                return;
1247          }
1248 
1249          if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1250             if (merge_mov_mad(c, inst, cur))
1251                return;
1252          }
1253       }
1254    }
1255 }
1256 
1257 /**
1258  * Searches for duplicate ARLs/ARRs
1259  *
1260  * Only a very trivial case is now optimized where if a second one is detected which reads from
1261  * the same register as the first one and source is the same, just remove the second one.
1262  */
1263 static void
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1264 merge_A0_loads(struct radeon_compiler *c, struct rc_instruction *inst, bool is_ARL)
1265 {
1266    unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1267    unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1268    unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1269    int cf_depth = 0;
1270 
1271    struct rc_instruction *cur = inst;
1272    while (cur != &c->Program.Instructions) {
1273       cur = cur->Next;
1274       const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1275 
1276       /* Keep it simple for now and stop when encountering any
1277        * control flow besides simple ifs.
1278        */
1279       if (opcode->IsFlowControl) {
1280          switch (cur->U.I.Opcode) {
1281          case RC_OPCODE_IF: {
1282             cf_depth++;
1283             break;
1284          }
1285          case RC_OPCODE_ELSE: {
1286             if (cf_depth < 1)
1287                return;
1288             break;
1289          }
1290          case RC_OPCODE_ENDIF: {
1291             cf_depth--;
1292             break;
1293          }
1294          default:
1295             return;
1296          }
1297       }
1298 
1299       /* Stop when the original source is overwritten */
1300       if (A0_src_reg == cur->U.I.DstReg.Index && A0_src_file == cur->U.I.DstReg.File &&
1301           cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1302          return;
1303 
1304       /* Wrong A0 load type. */
1305       if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1306           (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1307          return;
1308 
1309       if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1310          if (A0_src_reg == cur->U.I.SrcReg[0].Index && A0_src_file == cur->U.I.SrcReg[0].File &&
1311              A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1312             struct rc_instruction *next = cur->Next;
1313             rc_remove_instruction(cur);
1314             cur = next;
1315          } else {
1316             return;
1317          }
1318       }
1319    }
1320 }
1321 
1322 /**
1323  * According to the GLSL spec, round is only 1.30 and up
1324  * so the only reason why we should ever see round is if it actually
1325  * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1326  * the ARR instead of lowering the round.
1327  */
1328 static void
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1329 transform_vertex_ROUND(struct radeon_compiler *c, struct rc_instruction *inst)
1330 {
1331    struct rc_reader_data readers;
1332    rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1333 
1334    assert(readers.ReaderCount > 0);
1335    for (unsigned i = 0; i < readers.ReaderCount; i++) {
1336       struct rc_instruction *reader = readers.Readers[i].Inst;
1337       if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1338          assert(!"Unable to convert ROUND+ARL to ARR\n");
1339          return;
1340       }
1341    }
1342 
1343    /* Only ARL readers, convert all to ARR */
1344    for (unsigned i = 0; i < readers.ReaderCount; i++) {
1345       readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1346    }
1347    /* Switch ROUND to MOV and let copy propagate sort it out later. */
1348    inst->U.I.Opcode = RC_OPCODE_MOV;
1349 }
1350 
1351 /**
1352  * Apply various optimizations specific to the A0 address register loads.
1353  */
1354 static void
optimize_A0_loads(struct radeon_compiler * c)1355 optimize_A0_loads(struct radeon_compiler *c)
1356 {
1357    struct rc_instruction *inst = c->Program.Instructions.Next;
1358 
1359    while (inst != &c->Program.Instructions) {
1360       struct rc_instruction *cur = inst;
1361       inst = inst->Next;
1362       if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1363          merge_A0_loads(c, cur, true);
1364       } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1365          merge_A0_loads(c, cur, false);
1366       } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1367          transform_vertex_ROUND(c, cur);
1368       }
1369    }
1370 }
1371 
1372 void
rc_optimize(struct radeon_compiler * c,void * user)1373 rc_optimize(struct radeon_compiler *c, void *user)
1374 {
1375    struct rc_instruction *inst = c->Program.Instructions.Next;
1376    while (inst != &c->Program.Instructions) {
1377       struct rc_instruction *cur = inst;
1378       inst = inst->Next;
1379       constant_folding(c, cur);
1380    }
1381 
1382    /* Copy propagate simple movs away. */
1383    inst = c->Program.Instructions.Next;
1384    while (inst != &c->Program.Instructions) {
1385       struct rc_instruction *cur = inst;
1386       inst = inst->Next;
1387       if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1388          copy_propagate(c, cur);
1389       }
1390    }
1391 
1392    if (c->type == RC_VERTEX_PROGRAM) {
1393       optimize_A0_loads(c);
1394    }
1395 
1396    /* Merge MOVs to same source in different channels using the constant
1397     * swizzle.
1398     */
1399    if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1400       inst = c->Program.Instructions.Next;
1401       while (inst != &c->Program.Instructions) {
1402          struct rc_instruction *cur = inst;
1403          inst = inst->Next;
1404          if (cur->U.I.Opcode == RC_OPCODE_MOV || cur->U.I.Opcode == RC_OPCODE_ADD ||
1405              cur->U.I.Opcode == RC_OPCODE_MAD || cur->U.I.Opcode == RC_OPCODE_MUL)
1406             merge_channels(c, cur);
1407       }
1408    }
1409 
1410    /* Copy propagate few extra movs from the merge_channels pass. */
1411    inst = c->Program.Instructions.Next;
1412    while (inst != &c->Program.Instructions) {
1413       struct rc_instruction *cur = inst;
1414       inst = inst->Next;
1415       if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1416          copy_propagate(c, cur);
1417       }
1418    }
1419 
1420    if (c->type != RC_FRAGMENT_PROGRAM) {
1421       return;
1422    }
1423 
1424    /* Output modifiers. */
1425    inst = c->Program.Instructions.Next;
1426    struct rc_list *var_list = NULL;
1427    while (inst != &c->Program.Instructions) {
1428       struct rc_instruction *cur = inst;
1429       inst = inst->Next;
1430       if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1431          if (!var_list)
1432             var_list = rc_get_variables(c);
1433          if (peephole_mul_omod(c, cur, var_list))
1434             var_list = NULL;
1435       }
1436    }
1437 }
1438