1 /*
2 * Copyright 2009 Nicolai Haehnle.
3 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "util/u_math.h"
8
9 #include "radeon_dataflow.h"
10
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16
/* Context passed to src_clobbered_reads_cb() by is_src_clobbered_scan_write():
 * describes one register write and carries the reader data to flag when that
 * write touches a register the tracked writer instruction reads. */
struct src_clobbered_reads_cb_data {
   rc_register_file File;             /* register file of the write */
   unsigned int Index;                /* register index of the write */
   unsigned int Mask;                 /* channel mask of the write */
   struct rc_reader_data *ReaderData; /* reader data whose AbortOnRead is set */
};
23
/* Callback type used by presub_helper(). It is called with the instruction
 * being folded into a presubtract operation, the reader instruction, and the
 * index of the reader's source register that must be rewritten. */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *, struct rc_instruction *,
                                     unsigned int);
26
27 static struct rc_src_register
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30 struct rc_src_register combine;
31 combine.File = inner.File;
32 combine.Index = inner.Index;
33 combine.RelAddr = inner.RelAddr;
34 if (outer.Abs) {
35 combine.Abs = 1;
36 combine.Negate = outer.Negate;
37 } else {
38 combine.Abs = inner.Abs;
39 combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40 combine.Negate ^= outer.Negate;
41 }
42 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43 return combine;
44 }
45
46 static void
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)47 copy_propagate_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
48 {
49 rc_register_file file = src->File;
50 struct rc_reader_data *reader_data = data;
51
52 if (!rc_inst_can_use_presub(reader_data->C, inst, reader_data->Writer->U.I.PreSub.Opcode,
53 rc_swizzle_to_writemask(src->Swizzle), src,
54 &reader_data->Writer->U.I.PreSub.SrcReg[0],
55 &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
56 reader_data->Abort = 1;
57 return;
58 }
59
60 /* XXX This could probably be handled better. */
61 if (file == RC_FILE_ADDRESS) {
62 reader_data->Abort = 1;
63 return;
64 }
65
66 /* R300/R400 is unhappy about propagating
67 * 0: MOV temp[1], -none.1111;
68 * 1: KIL temp[1];
69 * to
70 * 0: KIL -none.1111;
71 *
72 * R500 is fine with it.
73 */
74 if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
75 reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
76 reader_data->Abort = 1;
77 return;
78 }
79
80 /* These instructions cannot read from the constants file.
81 * see radeonTransformTEX()
82 */
83 if (reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
84 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
85 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
86 (inst->U.I.Opcode == RC_OPCODE_TEX || inst->U.I.Opcode == RC_OPCODE_TXB ||
87 inst->U.I.Opcode == RC_OPCODE_TXP || inst->U.I.Opcode == RC_OPCODE_TXD ||
88 inst->U.I.Opcode == RC_OPCODE_TXL || inst->U.I.Opcode == RC_OPCODE_KIL)) {
89 reader_data->Abort = 1;
90 return;
91 }
92 }
93
94 static void
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)95 src_clobbered_reads_cb(void *data, struct rc_instruction *inst, struct rc_src_register *src)
96 {
97 struct src_clobbered_reads_cb_data *sc_data = data;
98
99 if (src->File == sc_data->File && src->Index == sc_data->Index &&
100 (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
101
102 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
103 }
104
105 if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
106 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
107 }
108 }
109
110 static void
is_src_clobbered_scan_write(void * data,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)111 is_src_clobbered_scan_write(void *data, struct rc_instruction *inst, rc_register_file file,
112 unsigned int index, unsigned int mask)
113 {
114 struct src_clobbered_reads_cb_data sc_data;
115 struct rc_reader_data *reader_data = data;
116 sc_data.File = file;
117 sc_data.Index = index;
118 sc_data.Mask = mask;
119 sc_data.ReaderData = reader_data;
120 rc_for_all_reads_src(reader_data->Writer, src_clobbered_reads_cb, &sc_data);
121 }
122
123 static void
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)124 copy_propagate(struct radeon_compiler *c, struct rc_instruction *inst_mov)
125 {
126 struct rc_reader_data reader_data;
127 unsigned int i;
128
129 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
130 return;
131
132 /* Get a list of all the readers of this MOV instruction. */
133 reader_data.ExitOnAbort = 1;
134 rc_get_readers(c, inst_mov, &reader_data, copy_propagate_scan_read, NULL,
135 is_src_clobbered_scan_write);
136
137 if (reader_data.Abort || reader_data.ReaderCount == 0)
138 return;
139
140 /* We can propagate SaturateMode if all the readers are MOV instructions
141 * without a presubtract operation, source negation and absolute.
142 * In that case, we just move SaturateMode to all readers. */
143 if (inst_mov->U.I.SaturateMode) {
144 for (i = 0; i < reader_data.ReaderCount; i++) {
145 struct rc_instruction *inst = reader_data.Readers[i].Inst;
146
147 if (inst->U.I.Opcode != RC_OPCODE_MOV || inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
148 inst->U.I.SrcReg[0].Abs || inst->U.I.SrcReg[0].Negate) {
149 return;
150 }
151 }
152 }
153
154 /* Propagate the MOV instruction. */
155 for (i = 0; i < reader_data.ReaderCount; i++) {
156 struct rc_instruction *inst = reader_data.Readers[i].Inst;
157 *reader_data.Readers[i].U.I.Src =
158 chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
159
160 if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
161 inst->U.I.PreSub = inst_mov->U.I.PreSub;
162 if (!inst->U.I.SaturateMode)
163 inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
164 }
165
166 /* Finally, remove the original MOV instruction */
167 rc_remove_instruction(inst_mov);
168 }
169
170 /**
171 * Check if a source register is actually always the same
172 * swizzle constant.
173 */
174 static int
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)175 is_src_uniform_constant(struct rc_src_register src, rc_swizzle *pswz, unsigned int *pnegate)
176 {
177 int have_used = 0;
178
179 if (src.File != RC_FILE_NONE) {
180 *pswz = 0;
181 return 0;
182 }
183
184 for (unsigned int chan = 0; chan < 4; ++chan) {
185 unsigned int swz = GET_SWZ(src.Swizzle, chan);
186 if (swz < 4) {
187 *pswz = 0;
188 return 0;
189 }
190 if (swz == RC_SWIZZLE_UNUSED)
191 continue;
192
193 if (!have_used) {
194 *pswz = swz;
195 *pnegate = GET_BIT(src.Negate, chan);
196 have_used = 1;
197 } else {
198 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
199 *pswz = 0;
200 return 0;
201 }
202 }
203 }
204
205 return 1;
206 }
207
208 /**
209 * Replace 0.0, 1.0 and 0.5 immediate constants by their
210 * respective swizzles. Simplify instructions like ADD dst, src, 0;
211 */
212 static void
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)213 constant_folding(struct radeon_compiler *c, struct rc_instruction *inst)
214 {
215 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
216 unsigned int i;
217
218 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
219 for (unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
220 struct rc_constant *constant;
221 struct rc_src_register newsrc;
222 int have_real_reference;
223 unsigned int chan;
224
225 /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
226 for (chan = 0; chan < 4; ++chan)
227 if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
228 break;
229 if (chan == 4) {
230 inst->U.I.SrcReg[src].File = RC_FILE_NONE;
231 continue;
232 }
233
234 /* Convert immediates to swizzles. */
235 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT || inst->U.I.SrcReg[src].RelAddr ||
236 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
237 continue;
238
239 constant = &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
240
241 if (constant->Type != RC_CONSTANT_IMMEDIATE)
242 continue;
243
244 newsrc = inst->U.I.SrcReg[src];
245 have_real_reference = 0;
246 for (chan = 0; chan < 4; ++chan) {
247 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
248 unsigned int newswz;
249 float imm;
250 float baseimm;
251
252 if (swz >= 4)
253 continue;
254
255 imm = constant->u.Immediate[swz];
256 baseimm = imm;
257 if (imm < 0.0)
258 baseimm = -baseimm;
259
260 if (baseimm == 0.0) {
261 newswz = RC_SWIZZLE_ZERO;
262 } else if (baseimm == 1.0) {
263 newswz = RC_SWIZZLE_ONE;
264 } else if (baseimm == 0.5 && c->has_half_swizzles) {
265 newswz = RC_SWIZZLE_HALF;
266 } else {
267 have_real_reference = 1;
268 continue;
269 }
270
271 SET_SWZ(newsrc.Swizzle, chan, newswz);
272 if (imm < 0.0 && !newsrc.Abs)
273 newsrc.Negate ^= 1 << chan;
274 }
275
276 if (!have_real_reference) {
277 newsrc.File = RC_FILE_NONE;
278 newsrc.Index = 0;
279 }
280
281 /* don't make the swizzle worse */
282 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
283 continue;
284
285 inst->U.I.SrcReg[src] = newsrc;
286 }
287
288 /* In case this instruction has been converted, make sure all of the
289 * registers that are no longer used are empty. */
290 opcode = rc_get_opcode_info(inst->U.I.Opcode);
291 for (i = opcode->NumSrcRegs; i < 3; i++) {
292 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
293 }
294 }
295
296 /**
297 * If src and dst use the same register, this function returns a writemask that
298 * indicates which components are read by src. Otherwise zero is returned.
299 */
300 static unsigned int
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)301 src_reads_dst_mask(struct rc_src_register src, struct rc_dst_register dst)
302 {
303 if (dst.File != src.File || dst.Index != src.Index) {
304 return 0;
305 }
306 return rc_swizzle_to_writemask(src.Swizzle);
307 }
308
309 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
310 * in any of its channels. Return 0 otherwise. */
311 static int
src_has_const_swz(struct rc_src_register src)312 src_has_const_swz(struct rc_src_register src)
313 {
314 int chan;
315 for (chan = 0; chan < 4; chan++) {
316 unsigned int swz = GET_SWZ(src.Swizzle, chan);
317 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF || swz == RC_SWIZZLE_ONE) {
318 return 1;
319 }
320 }
321 return 0;
322 }
323
324 static void
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)325 presub_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
326 {
327 struct rc_reader_data *reader_data = data;
328 rc_presubtract_op *presub_opcode = reader_data->CbData;
329
330 if (!rc_inst_can_use_presub(
331 reader_data->C, inst, *presub_opcode, reader_data->Writer->U.I.DstReg.WriteMask, src,
332 &reader_data->Writer->U.I.SrcReg[0], &reader_data->Writer->U.I.SrcReg[1])) {
333 reader_data->Abort = 1;
334 return;
335 }
336 }
337
338 static int
presub_helper(struct radeon_compiler * c,struct rc_instruction * inst_add,rc_presubtract_op presub_opcode,rc_presub_replace_fn presub_replace)339 presub_helper(struct radeon_compiler *c, struct rc_instruction *inst_add,
340 rc_presubtract_op presub_opcode, rc_presub_replace_fn presub_replace)
341 {
342 struct rc_reader_data reader_data;
343 unsigned int i;
344 rc_presubtract_op cb_op = presub_opcode;
345
346 reader_data.CbData = &cb_op;
347 reader_data.ExitOnAbort = 1;
348 rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL, is_src_clobbered_scan_write);
349
350 if (reader_data.Abort || reader_data.ReaderCount == 0)
351 return 0;
352
353 for (i = 0; i < reader_data.ReaderCount; i++) {
354 unsigned int src_index;
355 struct rc_reader reader = reader_data.Readers[i];
356 const struct rc_opcode_info *info = rc_get_opcode_info(reader.Inst->U.I.Opcode);
357
358 for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
359 if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
360 presub_replace(inst_add, reader.Inst, src_index);
361 }
362 }
363 return 1;
364 }
365
366 static void
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)367 presub_replace_add(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
368 unsigned int src_index)
369 {
370 rc_presubtract_op presub_opcode;
371
372 unsigned int negates = 0;
373 if (inst_add->U.I.SrcReg[0].Negate)
374 negates++;
375 if (inst_add->U.I.SrcReg[1].Negate)
376 negates++;
377 assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
378
379 if (negates == 1)
380 presub_opcode = RC_PRESUB_SUB;
381 else
382 presub_opcode = RC_PRESUB_ADD;
383
384 if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
385 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
386 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
387 } else {
388 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
389 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
390 }
391 /* If both sources are negative we can move the negate to the presub. */
392 unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
393 inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
394 inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
395 inst_reader->U.I.PreSub.Opcode = presub_opcode;
396 inst_reader->U.I.SrcReg[src_index] =
397 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
398 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
399 inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
400 }
401
402 static int
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)403 is_presub_candidate(struct radeon_compiler *c, struct rc_instruction *inst)
404 {
405 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
406 unsigned int i;
407 unsigned int is_constant[2] = {0, 0};
408
409 assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
410
411 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode ||
412 inst->U.I.WriteALUResult || inst->U.I.Omod) {
413 return 0;
414 }
415
416 /* If first two sources use a constant swizzle, then we can't convert it to
417 * a presubtract operation. In fact for the ADD and SUB presubtract
418 * operations neither source can contain a constant swizzle. This
419 * specific case is checked in peephole_add_presub_add() when
420 * we make sure the swizzles for both sources are equal, so we
421 * don't need to worry about it here. */
422 for (i = 0; i < 2; i++) {
423 int chan;
424 for (chan = 0; chan < 4; chan++) {
425 rc_swizzle swz = get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
426 if (swz == RC_SWIZZLE_ONE || swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF) {
427 is_constant[i] = 1;
428 }
429 }
430 }
431 if (is_constant[0] && is_constant[1])
432 return 0;
433
434 for (i = 0; i < info->NumSrcRegs; i++) {
435 struct rc_src_register src = inst->U.I.SrcReg[i];
436 if (src_reads_dst_mask(src, inst->U.I.DstReg))
437 return 0;
438
439 src.File = RC_FILE_PRESUB;
440 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
441 return 0;
442 }
443 return 1;
444 }
445
446 static int
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)447 peephole_add_presub_add(struct radeon_compiler *c, struct rc_instruction *inst_add)
448 {
449 unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
450 unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
451 unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
452
453 if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
454 return 0;
455
456 /* src0 and src1 can't have absolute values */
457 if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
458 return 0;
459
460 /* if src0 is negative, at least all bits of dstmask have to be set */
461 if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
462 return 0;
463
464 /* if src1 is negative, at least all bits of dstmask have to be set */
465 if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
466 return 0;
467
468 if (!is_presub_candidate(c, inst_add))
469 return 0;
470
471 if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
472 rc_remove_instruction(inst_add);
473 return 1;
474 }
475 return 0;
476 }
477
478 static void
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)479 presub_replace_inv(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
480 unsigned int src_index)
481 {
482 /* We must be careful not to modify inst_add, since it
483 * is possible it will remain part of the program.*/
484 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
485 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
486 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
487 inst_reader->U.I.SrcReg[src_index] =
488 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
489
490 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
491 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
492 }
493
494 static void
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)495 presub_replace_bias(struct rc_instruction *inst_mad, struct rc_instruction *inst_reader,
496 unsigned int src_index)
497 {
498 /* We must be careful not to modify inst_mad, since it
499 * is possible it will remain part of the program.*/
500 inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
501 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
502 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
503 inst_reader->U.I.SrcReg[src_index] =
504 chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
505
506 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
507 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
508 }
509
510 /**
511 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
512 * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
513 * of the add instruction must have the constant 1 swizzle. This function
514 * does not check const registers to see if their value is 1.0, so it should
515 * be called after the constant_folding optimization.
516 * @return
517 * 0 if the ADD instruction is still part of the program.
518 * 1 if the ADD instruction is no longer part of the program.
519 */
520 static int
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)521 peephole_add_presub_inv(struct radeon_compiler *c, struct rc_instruction *inst_add)
522 {
523 unsigned int i, swz;
524
525 if (!is_presub_candidate(c, inst_add))
526 return 0;
527
528 /* Check if src0 is 1. */
529 /* XXX It would be nice to use is_src_uniform_constant here, but that
530 * function only works if the register's file is RC_FILE_NONE */
531 for (i = 0; i < 4; i++) {
532 if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
533 continue;
534
535 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
536 if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
537 return 0;
538 }
539
540 /* Check src1. */
541 if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
542 inst_add->U.I.DstReg.WriteMask ||
543 inst_add->U.I.SrcReg[1].Abs || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
544
545 return 0;
546 }
547
548 if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
549 rc_remove_instruction(inst_add);
550 return 1;
551 }
552 return 0;
553 }
554
555 /**
556 * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
557 * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The first source
558 * of the add instruction must have the constant 1 swizzle. This function
559 * does not check const registers to see if their value is 1.0, so it should
560 * be called after the constant_folding optimization.
561 * @return
562 * 0 if the MAD instruction is still part of the program.
563 * 1 if the MAD instruction is no longer part of the program.
564 */
565 static int
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)566 peephole_mad_presub_bias(struct radeon_compiler *c, struct rc_instruction *inst_mad)
567 {
568 unsigned int i, swz;
569
570 if (!is_presub_candidate(c, inst_mad))
571 return 0;
572
573 /* Check if src2 is 1. */
574 for (i = 0; i < 4; i++) {
575 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
576 continue;
577
578 swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
579 if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
580 return 0;
581 }
582
583 /* Check if src1 is 2. */
584 struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
585 if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
586 return 0;
587 if (src1_reg.File == RC_FILE_INLINE) {
588 if (rc_inline_to_float(src1_reg.Index) != 2.0f)
589 return 0;
590 } else {
591 if (src1_reg.File != RC_FILE_CONSTANT)
592 return 0;
593
594 struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
595 if (constant->Type != RC_CONSTANT_IMMEDIATE)
596 return 0;
597 for (i = 0; i < 4; i++) {
598 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
599 continue;
600 swz = GET_SWZ(src1_reg.Swizzle, i);
601 if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
602 return 0;
603 }
604 }
605
606 /* Check src0. */
607 if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
608 inst_mad->U.I.DstReg.WriteMask ||
609 inst_mad->U.I.SrcReg[0].Abs || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
610
611 return 0;
612 }
613
614 if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
615 rc_remove_instruction(inst_mad);
616 return 1;
617 }
618 return 0;
619 }
620
/* State shared by omod_filter_reader_cb() and omod_filter_writer_cb() while
 * peephole_mul_omod() scans the instructions in front of a MUL. */
struct peephole_mul_cb_data {
   struct rc_dst_register *Writer; /* destination register of the MUL */
   unsigned int Clobbered;         /* set to 1 when a scanned instruction
                                    * reads or writes that destination */
};
625
626 static void
omod_filter_reader_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)627 omod_filter_reader_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
628 unsigned int index, unsigned int mask)
629 {
630 struct peephole_mul_cb_data *d = userdata;
631 if (rc_src_reads_dst_mask(file, mask, index, d->Writer->File, d->Writer->Index,
632 d->Writer->WriteMask)) {
633
634 d->Clobbered = 1;
635 }
636 }
637
638 static void
omod_filter_writer_cb(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)639 omod_filter_writer_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
640 unsigned int index, unsigned int mask)
641 {
642 struct peephole_mul_cb_data *d = userdata;
643 if (file == d->Writer->File && index == d->Writer->Index && (mask & d->Writer->WriteMask)) {
644 d->Clobbered = 1;
645 }
646 }
647
648 static int
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)649 peephole_mul_omod(struct radeon_compiler *c, struct rc_instruction *inst_mul,
650 struct rc_list *var_list)
651 {
652 unsigned int chan = 0, swz, i;
653 int const_index = -1;
654 int temp_index = -1;
655 float const_value;
656 rc_omod_op omod_op = RC_OMOD_DISABLE;
657 struct rc_list *writer_list;
658 struct rc_variable *var;
659 struct peephole_mul_cb_data cb_data;
660 unsigned writemask_sum;
661
662 for (i = 0; i < 2; i++) {
663 unsigned int j;
664 if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT &&
665 inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY &&
666 inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
667 return 0;
668 }
669
670 /* The only relevant case with constant swizzles we should check for
671 * is multiply by one half.
672 */
673 if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
674 for (j = 0; j < 4; j++) {
675 swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
676 if (swz == RC_SWIZZLE_UNUSED) {
677 continue;
678 }
679 if (swz != RC_SWIZZLE_HALF) {
680 return 0;
681 } else {
682 omod_op = RC_OMOD_DIV_2;
683 }
684 }
685 }
686
687 if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
688 if (temp_index != -1) {
689 /* The instruction has two temp sources */
690 return 0;
691 } else {
692 temp_index = i;
693 continue;
694 }
695 }
696 /* If we get this far Src[i] must be a constant src */
697 if (inst_mul->U.I.SrcReg[i].Negate) {
698 return 0;
699 }
700 /* The constant src needs to read from the same swizzle */
701 swz = RC_SWIZZLE_UNUSED;
702 chan = 0;
703 for (j = 0; j < 4; j++) {
704 unsigned int j_swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
705 if (j_swz == RC_SWIZZLE_UNUSED) {
706 continue;
707 }
708 if (swz == RC_SWIZZLE_UNUSED) {
709 swz = j_swz;
710 chan = j;
711 } else if (j_swz != swz) {
712 return 0;
713 }
714 }
715
716 if (const_index != -1) {
717 /* The instruction has two constant sources */
718 return 0;
719 } else {
720 const_index = i;
721 }
722 }
723
724 if (omod_op == RC_OMOD_DISABLE) {
725 if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
726 inst_mul->U.I.SrcReg[const_index].Index)) {
727 return 0;
728 }
729 const_value = rc_get_constant_value(c, inst_mul->U.I.SrcReg[const_index].Index,
730 inst_mul->U.I.SrcReg[const_index].Swizzle,
731 inst_mul->U.I.SrcReg[const_index].Negate, chan);
732
733 if (const_value == 2.0f) {
734 omod_op = RC_OMOD_MUL_2;
735 } else if (const_value == 4.0f) {
736 omod_op = RC_OMOD_MUL_4;
737 } else if (const_value == 8.0f) {
738 omod_op = RC_OMOD_MUL_8;
739 } else if (const_value == (1.0f / 2.0f)) {
740 omod_op = RC_OMOD_DIV_2;
741 } else if (const_value == (1.0f / 4.0f)) {
742 omod_op = RC_OMOD_DIV_4;
743 } else if (const_value == (1.0f / 8.0f)) {
744 omod_op = RC_OMOD_DIV_8;
745 } else {
746 return 0;
747 }
748 }
749
750 writer_list = rc_variable_list_get_writers_one_reader(var_list, RC_INSTRUCTION_NORMAL,
751 &inst_mul->U.I.SrcReg[temp_index]);
752
753 if (!writer_list) {
754 return 0;
755 }
756
757 cb_data.Clobbered = 0;
758 cb_data.Writer = &inst_mul->U.I.DstReg;
759 for (var = writer_list->Item; var; var = var->Friend) {
760 struct rc_instruction *inst;
761 const struct rc_opcode_info *info = rc_get_opcode_info(var->Inst->U.I.Opcode);
762 if (info->HasTexture) {
763 return 0;
764 }
765 if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
766 return 0;
767 }
768
769 /* Empirical testing shows that DDX/DDY directly into output
770 * with non-identity omod is problematic.
771 */
772 if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
773 inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
774 return 0;
775 }
776
777 for (inst = inst_mul->Prev; inst != var->Inst; inst = inst->Prev) {
778 rc_for_all_reads_mask(inst, omod_filter_reader_cb, &cb_data);
779 rc_for_all_writes_mask(inst, omod_filter_writer_cb, &cb_data);
780 if (cb_data.Clobbered) {
781 break;
782 }
783 }
784 }
785
786 if (cb_data.Clobbered) {
787 return 0;
788 }
789
790 writemask_sum = rc_variable_writemask_sum(writer_list->Item);
791
792 /* rc_normal_rewrite_writemask can't expand a previous writemask to store
793 * more channels replicated.
794 */
795 if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
796 return 0;
797
798 /* Rewrite the instructions */
799 for (var = writer_list->Item; var; var = var->Friend) {
800 struct rc_variable *writer = var;
801 unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
802 for (chan = 0; chan < 4; chan++) {
803 unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
804 if (swz <= RC_SWIZZLE_W)
805 SET_SWZ(conversion_swizzle, swz, chan);
806 }
807 writer->Inst->U.I.Omod = omod_op;
808 writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
809 writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
810 rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
811 writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
812 }
813
814 rc_remove_instruction(inst_mul);
815
816 return 1;
817 }
818
819 /**
820 * @return
821 * 0 if inst is still part of the program.
822 * 1 if inst is no longer part of the program.
823 */
824 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)825 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
826 {
827 switch (inst->U.I.Opcode) {
828 case RC_OPCODE_ADD: {
829 if (peephole_add_presub_inv(c, inst))
830 return 1;
831 if (peephole_add_presub_add(c, inst))
832 return 1;
833 break;
834 }
835 case RC_OPCODE_MAD: {
836 if (peephole_mad_presub_bias(c, inst))
837 return 1;
838 break;
839 }
840 default:
841 break;
842 }
843 return 0;
844 }
845
846 static unsigned int
merge_swizzles(unsigned int swz1,unsigned int swz2)847 merge_swizzles(unsigned int swz1, unsigned int swz2)
848 {
849 unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
850 for (unsigned int chan = 0; chan < 4; chan++) {
851 unsigned int swz = GET_SWZ(swz1, chan);
852 if (swz != RC_SWIZZLE_UNUSED) {
853 SET_SWZ(new_swz, chan, swz);
854 continue;
855 }
856 swz = GET_SWZ(swz2, chan);
857 SET_SWZ(new_swz, chan, swz);
858 }
859 return new_swz;
860 }
861
862 /* Sets negate to 0 for unused channels. */
863 static unsigned int
clean_negate(struct rc_src_register src)864 clean_negate(struct rc_src_register src)
865 {
866 unsigned int new_negate = 0;
867 for (unsigned int chan = 0; chan < 4; chan++) {
868 unsigned int swz = GET_SWZ(src.Swizzle, chan);
869 if (swz != RC_SWIZZLE_UNUSED)
870 new_negate |= src.Negate & (1 << chan);
871 }
872 return new_negate;
873 }
874
875 static unsigned int
merge_negates(struct rc_src_register src1,struct rc_src_register src2)876 merge_negates(struct rc_src_register src1, struct rc_src_register src2)
877 {
878 return clean_negate(src1) | clean_negate(src2);
879 }
880
881 static unsigned int
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)882 fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
883 {
884 for (unsigned int chan = 0; chan < 4; chan++) {
885 unsigned int swz = GET_SWZ(orig_swz, chan);
886 if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
887 SET_SWZ(orig_swz, chan, const_swz);
888 }
889 }
890 return orig_swz;
891 }
892
893 static int
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)894 have_shared_source(struct rc_instruction *inst1, struct rc_instruction *inst2)
895 {
896 int shared_src = -1;
897 const struct rc_opcode_info *opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
898 const struct rc_opcode_info *opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
899 for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
900 for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
901 if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
902 inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
903 inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
904 inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
905 shared_src = i;
906 }
907 }
908 return shared_src;
909 }
910
911 /**
912 * Merges two MOVs writing different channels of the same destination register
913 * with the use of the constant swizzles.
914 */
915 static bool
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)916 merge_movs(struct radeon_compiler *c, struct rc_instruction *inst, struct rc_instruction *cur)
917 {
918 /* We can merge two MOVs into MOV if one of them is from inline constant,
919 * i.e., constant swizzles and RC_FILE_NONE).
920 *
921 * For example
922 * MOV temp[0].x none.1___
923 * MOV temp[0].y input[0]._x__
924 *
925 * becomes
926 * MOV temp[0].xy input[0].1x__
927 */
928 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
929 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
930 struct rc_src_register src;
931 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
932 src = inst->U.I.SrcReg[0];
933 else
934 src = cur->U.I.SrcReg[0];
935 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
936 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
937 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
938 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
939 cur->U.I.SrcReg[0] = src;
940 rc_remove_instruction(inst);
941 return true;
942 }
943 }
944
945 /* Handle the trivial case where the MOVs share a source.
946 *
947 * For example
948 * MOV temp[0].x const[0].x
949 * MOV temp[0].y const[0].z
950 *
951 * becomes
952 * MOV temp[0].xy const[0].xz
953 */
954 if (have_shared_source(inst, cur) == 0) {
955 struct rc_src_register src = cur->U.I.SrcReg[0];
956 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
957 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
958
959 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
960 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
961 cur->U.I.SrcReg[0] = src;
962 rc_remove_instruction(inst);
963 return true;
964 }
965 }
966
967 /* Otherwise, we can convert the MOVs into ADD.
968 *
969 * For example
970 * MOV temp[0].x const[0].x
971 * MOV temp[0].y input[0].y
972 *
973 * becomes
974 * ADD temp[0].xy const[0].x0 input[0].0y
975 */
976 unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
977 struct rc_src_register src0 = inst->U.I.SrcReg[0];
978 struct rc_src_register src1 = cur->U.I.SrcReg[0];
979
980 src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ZERO);
981 src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, RC_SWIZZLE_ZERO);
982 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
983 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
984 return false;
985
986 cur->U.I.DstReg.WriteMask = wmask;
987 cur->U.I.Opcode = RC_OPCODE_ADD;
988 cur->U.I.SrcReg[0] = src0;
989 cur->U.I.SrcReg[1] = src1;
990
991 /* finally delete the original mov */
992 rc_remove_instruction(inst);
993 return true;
994 }
995
996 /**
997 * This function will try to merge MOV and ADD/MUL instructions with the same
998 * destination, making use of the constant swizzles.
999 *
1000 * For example:
1001 * MOV temp[0].x const[0].x
1002 * MUL temp[0].yz const[1].yz const[2].yz
1003 *
1004 * becomes
1005 * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1006 */
1007 static int
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1008 merge_mov_add_mul(struct radeon_compiler *c, struct rc_instruction *inst1,
1009 struct rc_instruction *inst2)
1010 {
1011 struct rc_instruction *inst, *mov;
1012 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1013 mov = inst1;
1014 inst = inst2;
1015 } else {
1016 mov = inst2;
1017 inst = inst1;
1018 }
1019
1020 const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1021 int shared_index = have_shared_source(inst, mov);
1022 unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1023
1024 /* If there is a shared source, just merge the swizzles and be done with it. */
1025 if (shared_index != -1) {
1026 struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1027 struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1028
1029 shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1030 shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, mov->U.I.SrcReg[0].Swizzle);
1031 other_src.Negate = clean_negate(other_src);
1032 unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1033 other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1034
1035 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1036 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1037 return 0;
1038
1039 inst2->U.I.Opcode = inst->U.I.Opcode;
1040 inst2->U.I.SrcReg[0] = shared_src;
1041 inst2->U.I.SrcReg[1] = other_src;
1042
1043 /* TODO: we can do a bit better in the special case when one of the sources is none.
1044 * Convert to MAD otherwise.
1045 */
1046 } else {
1047 struct rc_src_register src0, src1, src2;
1048 if (is_mul) {
1049 src2 = mov->U.I.SrcReg[0];
1050 src0 = inst->U.I.SrcReg[0];
1051 src1 = inst->U.I.SrcReg[1];
1052 } else {
1053 src0 = mov->U.I.SrcReg[0];
1054 src1 = inst->U.I.SrcReg[0];
1055 src2 = inst->U.I.SrcReg[1];
1056 }
1057 /* The following login expects that the unused channels have empty negate bits. */
1058 src0.Negate = clean_negate(src0);
1059 src1.Negate = clean_negate(src1);
1060 src2.Negate = clean_negate(src2);
1061
1062 src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ONE);
1063 src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1064 src2.Swizzle = fill_swizzle(src2.Swizzle, wmask, RC_SWIZZLE_ZERO);
1065 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1066 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1067 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1068 return 0;
1069
1070 inst2->U.I.Opcode = RC_OPCODE_MAD;
1071 inst2->U.I.SrcReg[0] = src0;
1072 inst2->U.I.SrcReg[1] = src1;
1073 inst2->U.I.SrcReg[2] = src2;
1074 }
1075 inst2->U.I.DstReg.WriteMask = wmask;
1076 /* finally delete the original instruction */
1077 rc_remove_instruction(inst1);
1078
1079 return 1;
1080 }
1081
1082 /**
1083 * This function will try to merge MOV and MAD instructions with the same
1084 * destination, making use of the constant swizzles. This only works
1085 * if there is a shared source or one of the sources is RC_FILE_NONE.
1086 *
1087 * For example:
1088 * MOV temp[0].x const[0].x
1089 * MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1090 *
1091 * becomes
1092 * MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1093 */
1094 static bool
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1095 merge_mov_mad(struct radeon_compiler *c, struct rc_instruction *inst1, struct rc_instruction *inst2)
1096 {
1097 struct rc_instruction *mov, *mad;
1098 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1099 mov = inst1;
1100 mad = inst2;
1101 } else {
1102 mov = inst2;
1103 mad = inst1;
1104 }
1105
1106 int shared_index = have_shared_source(mad, mov);
1107 unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1108 struct rc_src_register src[3];
1109 src[0] = mad->U.I.SrcReg[0];
1110 src[1] = mad->U.I.SrcReg[1];
1111 src[2] = mad->U.I.SrcReg[2];
1112
1113 /* Shared source is the one for multiplication. */
1114 if (shared_index == 0 || shared_index == 1) {
1115 src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1116 src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1117 src[shared_index].Swizzle =
1118 merge_swizzles(src[shared_index].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1119 src[1 - shared_index].Swizzle =
1120 fill_swizzle(src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1121 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1122
1123 /* Shared source is the one for used for addition, or it is none. Additionally,
1124 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1125 * because than we have the highest change the swizzles will be legal.
1126 */
1127 } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1128 src[2].File == RC_FILE_NONE) {
1129 src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1130 src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1131 src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1132 src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1133 if (src[2].File == RC_FILE_NONE) {
1134 src[2].File = mov->U.I.SrcReg[0].File;
1135 src[2].Index = mov->U.I.SrcReg[0].Index;
1136 src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1137 src[2].Abs = mov->U.I.SrcReg[0].Abs;
1138 }
1139
1140 /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1141 * fill the other one with ones and the reg for addition with zeros.
1142 */
1143 } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1144 unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1145 src[none_src] = mov->U.I.SrcReg[0];
1146 src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1147 src[none_src].Swizzle =
1148 merge_swizzles(src[none_src].Swizzle, mad->U.I.SrcReg[none_src].Swizzle);
1149 src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1150 src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle, wmask, RC_SWIZZLE_ONE);
1151 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1152 } else {
1153 return false;
1154 }
1155
1156 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1157 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1158 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1159 return false;
1160
1161 inst2->U.I.Opcode = RC_OPCODE_MAD;
1162 inst2->U.I.SrcReg[0] = src[0];
1163 inst2->U.I.SrcReg[1] = src[1];
1164 inst2->U.I.SrcReg[2] = src[2];
1165 inst2->U.I.DstReg.WriteMask = wmask;
1166 rc_remove_instruction(inst1);
1167 return true;
1168 }
1169
1170 static bool
inst_combination(struct rc_instruction * inst1,struct rc_instruction * inst2,rc_opcode opcode1,rc_opcode opcode2)1171 inst_combination(struct rc_instruction *inst1, struct rc_instruction *inst2, rc_opcode opcode1,
1172 rc_opcode opcode2)
1173 {
1174 return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) ||
1175 (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2));
1176 }
1177
1178 /**
1179 * Searches for instructions writing different channels of the same register that could
1180 * be merged together with the use of constant swizzles.
1181 *
1182 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1183 */
1184 static void
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1185 merge_channels(struct radeon_compiler *c, struct rc_instruction *inst)
1186 {
1187 unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1188 unsigned int orig_dst_file = inst->U.I.DstReg.File;
1189 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1190 const struct rc_opcode_info *orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1191
1192 struct rc_instruction *cur = inst;
1193 while (cur != &c->Program.Instructions) {
1194 cur = cur->Next;
1195 const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1196
1197 /* Keep it simple for now and stop when encountering any
1198 * control flow.
1199 */
1200 if (opcode->IsFlowControl)
1201 return;
1202
1203 /* Stop when the original destination is overwritten */
1204 if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File &&
1205 (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1206 return;
1207
1208 /* Stop the search when the original instruction destination
1209 * is used as a source for anything.
1210 */
1211 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1212 if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg)
1213 return;
1214 }
1215
1216 /* Stop the search when some of the original sources are touched. */
1217 for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1218 if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1219 inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1220 return;
1221 }
1222
1223 if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg &&
1224 cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1225 (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1226
1227 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1228 if (merge_movs(c, inst, cur))
1229 return;
1230 }
1231
1232 /* Skip the merge if one of the instructions writes just w channel
1233 * and we are compiling a fragment shader. We can pair-schedule it together
1234 * later anyway and it will also give the scheduler a bit more flexibility.
1235 * Only check this after merging MOVs as when we manage to merge two MOVs
1236 * into another MOV we can still copy propagate it away. So it is a win in
1237 * that case.
1238 */
1239 if (c->has_omod &&
1240 (cur->U.I.DstReg.WriteMask == RC_MASK_W || inst->U.I.DstReg.WriteMask == RC_MASK_W))
1241 continue;
1242
1243 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1244 inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1245 if (merge_mov_add_mul(c, inst, cur))
1246 return;
1247 }
1248
1249 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1250 if (merge_mov_mad(c, inst, cur))
1251 return;
1252 }
1253 }
1254 }
1255 }
1256
1257 /**
1258 * Searches for duplicate ARLs/ARRs
1259 *
1260 * Only a very trivial case is now optimized where if a second one is detected which reads from
1261 * the same register as the first one and source is the same, just remove the second one.
1262 */
1263 static void
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1264 merge_A0_loads(struct radeon_compiler *c, struct rc_instruction *inst, bool is_ARL)
1265 {
1266 unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1267 unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1268 unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1269 int cf_depth = 0;
1270
1271 struct rc_instruction *cur = inst;
1272 while (cur != &c->Program.Instructions) {
1273 cur = cur->Next;
1274 const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);
1275
1276 /* Keep it simple for now and stop when encountering any
1277 * control flow besides simple ifs.
1278 */
1279 if (opcode->IsFlowControl) {
1280 switch (cur->U.I.Opcode) {
1281 case RC_OPCODE_IF: {
1282 cf_depth++;
1283 break;
1284 }
1285 case RC_OPCODE_ELSE: {
1286 if (cf_depth < 1)
1287 return;
1288 break;
1289 }
1290 case RC_OPCODE_ENDIF: {
1291 cf_depth--;
1292 break;
1293 }
1294 default:
1295 return;
1296 }
1297 }
1298
1299 /* Stop when the original source is overwritten */
1300 if (A0_src_reg == cur->U.I.DstReg.Index && A0_src_file == cur->U.I.DstReg.File &&
1301 cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1302 return;
1303
1304 /* Wrong A0 load type. */
1305 if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1306 (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1307 return;
1308
1309 if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1310 if (A0_src_reg == cur->U.I.SrcReg[0].Index && A0_src_file == cur->U.I.SrcReg[0].File &&
1311 A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1312 struct rc_instruction *next = cur->Next;
1313 rc_remove_instruction(cur);
1314 cur = next;
1315 } else {
1316 return;
1317 }
1318 }
1319 }
1320 }
1321
1322 /**
1323 * According to the GLSL spec, round is only 1.30 and up
1324 * so the only reason why we should ever see round is if it actually
1325 * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1326 * the ARR instead of lowering the round.
1327 */
1328 static void
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1329 transform_vertex_ROUND(struct radeon_compiler *c, struct rc_instruction *inst)
1330 {
1331 struct rc_reader_data readers;
1332 rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1333
1334 assert(readers.ReaderCount > 0);
1335 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1336 struct rc_instruction *reader = readers.Readers[i].Inst;
1337 if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1338 assert(!"Unable to convert ROUND+ARL to ARR\n");
1339 return;
1340 }
1341 }
1342
1343 /* Only ARL readers, convert all to ARR */
1344 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1345 readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1346 }
1347 /* Switch ROUND to MOV and let copy propagate sort it out later. */
1348 inst->U.I.Opcode = RC_OPCODE_MOV;
1349 }
1350
1351 /**
1352 * Apply various optimizations specific to the A0 address register loads.
1353 */
1354 static void
optimize_A0_loads(struct radeon_compiler * c)1355 optimize_A0_loads(struct radeon_compiler *c)
1356 {
1357 struct rc_instruction *inst = c->Program.Instructions.Next;
1358
1359 while (inst != &c->Program.Instructions) {
1360 struct rc_instruction *cur = inst;
1361 inst = inst->Next;
1362 if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1363 merge_A0_loads(c, cur, true);
1364 } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1365 merge_A0_loads(c, cur, false);
1366 } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1367 transform_vertex_ROUND(c, cur);
1368 }
1369 }
1370 }
1371
1372 void
rc_optimize(struct radeon_compiler * c,void * user)1373 rc_optimize(struct radeon_compiler *c, void *user)
1374 {
1375 struct rc_instruction *inst = c->Program.Instructions.Next;
1376 while (inst != &c->Program.Instructions) {
1377 struct rc_instruction *cur = inst;
1378 inst = inst->Next;
1379 constant_folding(c, cur);
1380 }
1381
1382 /* Copy propagate simple movs away. */
1383 inst = c->Program.Instructions.Next;
1384 while (inst != &c->Program.Instructions) {
1385 struct rc_instruction *cur = inst;
1386 inst = inst->Next;
1387 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1388 copy_propagate(c, cur);
1389 }
1390 }
1391
1392 if (c->type == RC_VERTEX_PROGRAM) {
1393 optimize_A0_loads(c);
1394 }
1395
1396 /* Merge MOVs to same source in different channels using the constant
1397 * swizzle.
1398 */
1399 if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1400 inst = c->Program.Instructions.Next;
1401 while (inst != &c->Program.Instructions) {
1402 struct rc_instruction *cur = inst;
1403 inst = inst->Next;
1404 if (cur->U.I.Opcode == RC_OPCODE_MOV || cur->U.I.Opcode == RC_OPCODE_ADD ||
1405 cur->U.I.Opcode == RC_OPCODE_MAD || cur->U.I.Opcode == RC_OPCODE_MUL)
1406 merge_channels(c, cur);
1407 }
1408 }
1409
1410 /* Copy propagate few extra movs from the merge_channels pass. */
1411 inst = c->Program.Instructions.Next;
1412 while (inst != &c->Program.Instructions) {
1413 struct rc_instruction *cur = inst;
1414 inst = inst->Next;
1415 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1416 copy_propagate(c, cur);
1417 }
1418 }
1419
1420 if (c->type != RC_FRAGMENT_PROGRAM) {
1421 return;
1422 }
1423
1424 /* Output modifiers. */
1425 inst = c->Program.Instructions.Next;
1426 struct rc_list *var_list = NULL;
1427 while (inst != &c->Program.Instructions) {
1428 struct rc_instruction *cur = inst;
1429 inst = inst->Next;
1430 if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1431 if (!var_list)
1432 var_list = rc_get_variables(c);
1433 if (peephole_mul_omod(c, cur, var_list))
1434 var_list = NULL;
1435 }
1436 }
1437 }
1438