1 /*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4 *
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial
17 * portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 */
28
29 #include "util/u_math.h"
30
31 #include "radeon_dataflow.h"
32
33 #include "radeon_compiler.h"
34 #include "radeon_compiler_util.h"
35 #include "radeon_list.h"
36 #include "radeon_swizzle.h"
37 #include "radeon_variable.h"
38
39 struct src_clobbered_reads_cb_data {
40 rc_register_file File;
41 unsigned int Index;
42 unsigned int Mask;
43 struct rc_reader_data * ReaderData;
44 };
45
/**
 * Callback type accepted by presub_helper(). It is invoked with the
 * instruction being folded, one of its readers, and the index of the reader's
 * source operand that refers to the folded instruction's result.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_writer,
				     struct rc_instruction *inst_reader,
				     unsigned int src_index);
49
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)50 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
51 {
52 struct rc_src_register combine;
53 combine.File = inner.File;
54 combine.Index = inner.Index;
55 combine.RelAddr = inner.RelAddr;
56 if (outer.Abs) {
57 combine.Abs = 1;
58 combine.Negate = outer.Negate;
59 } else {
60 combine.Abs = inner.Abs;
61 combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
62 combine.Negate ^= outer.Negate;
63 }
64 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
65 return combine;
66 }
67
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)68 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
69 struct rc_src_register * src)
70 {
71 rc_register_file file = src->File;
72 struct rc_reader_data * reader_data = data;
73
74 if(!rc_inst_can_use_presub(reader_data->C,
75 inst,
76 reader_data->Writer->U.I.PreSub.Opcode,
77 rc_swizzle_to_writemask(src->Swizzle),
78 src,
79 &reader_data->Writer->U.I.PreSub.SrcReg[0],
80 &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
81 reader_data->Abort = 1;
82 return;
83 }
84
85 /* XXX This could probably be handled better. */
86 if (file == RC_FILE_ADDRESS) {
87 reader_data->Abort = 1;
88 return;
89 }
90
91 /* R300/R400 is unhappy about propagating
92 * 0: MOV temp[1], -none.1111;
93 * 1: KIL temp[1];
94 * to
95 * 0: KIL -none.1111;
96 *
97 * R500 is fine with it.
98 */
99 if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
100 reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
101 reader_data->Abort = 1;
102 return;
103 }
104
105 /* These instructions cannot read from the constants file.
106 * see radeonTransformTEX()
107 */
108 if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
109 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
110 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
111 (inst->U.I.Opcode == RC_OPCODE_TEX ||
112 inst->U.I.Opcode == RC_OPCODE_TXB ||
113 inst->U.I.Opcode == RC_OPCODE_TXP ||
114 inst->U.I.Opcode == RC_OPCODE_TXD ||
115 inst->U.I.Opcode == RC_OPCODE_TXL ||
116 inst->U.I.Opcode == RC_OPCODE_KIL)){
117 reader_data->Abort = 1;
118 return;
119 }
120 }
121
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)122 static void src_clobbered_reads_cb(
123 void * data,
124 struct rc_instruction * inst,
125 struct rc_src_register * src)
126 {
127 struct src_clobbered_reads_cb_data * sc_data = data;
128
129 if (src->File == sc_data->File
130 && src->Index == sc_data->Index
131 && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
132
133 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
134 }
135
136 if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
137 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
138 }
139 }
140
/**
 * Write callback passed to rc_get_readers(). For a write to
 * (file, index, mask), walks every source of reader_data->Writer with
 * src_clobbered_reads_cb() so that reads of a clobbered source get flagged.
 */
static void is_src_clobbered_scan_write(
	void *data,
	struct rc_instruction *inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct rc_reader_data *reader_data = data;
	struct src_clobbered_reads_cb_data write_info = {
		.File = file,
		.Index = index,
		.Mask = mask,
		.ReaderData = reader_data,
	};

	rc_for_all_reads_src(reader_data->Writer,
			     src_clobbered_reads_cb, &write_info);
}
157
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)158 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
159 {
160 struct rc_reader_data reader_data;
161 unsigned int i;
162
163 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
164 inst_mov->U.I.WriteALUResult)
165 return;
166
167 /* Get a list of all the readers of this MOV instruction. */
168 reader_data.ExitOnAbort = 1;
169 rc_get_readers(c, inst_mov, &reader_data,
170 copy_propagate_scan_read, NULL,
171 is_src_clobbered_scan_write);
172
173 if (reader_data.Abort || reader_data.ReaderCount == 0)
174 return;
175
176 /* We can propagate SaturateMode if all the readers are MOV instructions
177 * without a presubtract operation, source negation and absolute.
178 * In that case, we just move SaturateMode to all readers. */
179 if (inst_mov->U.I.SaturateMode) {
180 for (i = 0; i < reader_data.ReaderCount; i++) {
181 struct rc_instruction * inst = reader_data.Readers[i].Inst;
182
183 if (inst->U.I.Opcode != RC_OPCODE_MOV ||
184 inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
185 inst->U.I.SrcReg[0].Abs ||
186 inst->U.I.SrcReg[0].Negate) {
187 return;
188 }
189 }
190 }
191
192 /* Propagate the MOV instruction. */
193 for (i = 0; i < reader_data.ReaderCount; i++) {
194 struct rc_instruction * inst = reader_data.Readers[i].Inst;
195 *reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
196
197 if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
198 inst->U.I.PreSub = inst_mov->U.I.PreSub;
199 if (!inst->U.I.SaturateMode)
200 inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
201 }
202
203 /* Finally, remove the original MOV instruction */
204 rc_remove_instruction(inst_mov);
205 }
206
207 /**
208 * Check if a source register is actually always the same
209 * swizzle constant.
210 */
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)211 static int is_src_uniform_constant(struct rc_src_register src,
212 rc_swizzle * pswz, unsigned int * pnegate)
213 {
214 int have_used = 0;
215
216 if (src.File != RC_FILE_NONE) {
217 *pswz = 0;
218 return 0;
219 }
220
221 for(unsigned int chan = 0; chan < 4; ++chan) {
222 unsigned int swz = GET_SWZ(src.Swizzle, chan);
223 if (swz < 4) {
224 *pswz = 0;
225 return 0;
226 }
227 if (swz == RC_SWIZZLE_UNUSED)
228 continue;
229
230 if (!have_used) {
231 *pswz = swz;
232 *pnegate = GET_BIT(src.Negate, chan);
233 have_used = 1;
234 } else {
235 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
236 *pswz = 0;
237 return 0;
238 }
239 }
240 }
241
242 return 1;
243 }
244
245 /**
246 * Replace 0.0, 1.0 and 0.5 immediate constants by their
247 * respective swizzles. Simplify instructions like ADD dst, src, 0;
248 */
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)249 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
250 {
251 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
252 unsigned int i;
253
254 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
255 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
256 struct rc_constant * constant;
257 struct rc_src_register newsrc;
258 int have_real_reference;
259 unsigned int chan;
260
261 /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
262 for (chan = 0; chan < 4; ++chan)
263 if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
264 break;
265 if (chan == 4) {
266 inst->U.I.SrcReg[src].File = RC_FILE_NONE;
267 continue;
268 }
269
270 /* Convert immediates to swizzles. */
271 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
272 inst->U.I.SrcReg[src].RelAddr ||
273 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
274 continue;
275
276 constant =
277 &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
278
279 if (constant->Type != RC_CONSTANT_IMMEDIATE)
280 continue;
281
282 newsrc = inst->U.I.SrcReg[src];
283 have_real_reference = 0;
284 for (chan = 0; chan < 4; ++chan) {
285 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
286 unsigned int newswz;
287 float imm;
288 float baseimm;
289
290 if (swz >= 4)
291 continue;
292
293 imm = constant->u.Immediate[swz];
294 baseimm = imm;
295 if (imm < 0.0)
296 baseimm = -baseimm;
297
298 if (baseimm == 0.0) {
299 newswz = RC_SWIZZLE_ZERO;
300 } else if (baseimm == 1.0) {
301 newswz = RC_SWIZZLE_ONE;
302 } else if (baseimm == 0.5 && c->has_half_swizzles) {
303 newswz = RC_SWIZZLE_HALF;
304 } else {
305 have_real_reference = 1;
306 continue;
307 }
308
309 SET_SWZ(newsrc.Swizzle, chan, newswz);
310 if (imm < 0.0 && !newsrc.Abs)
311 newsrc.Negate ^= 1 << chan;
312 }
313
314 if (!have_real_reference) {
315 newsrc.File = RC_FILE_NONE;
316 newsrc.Index = 0;
317 }
318
319 /* don't make the swizzle worse */
320 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
321 continue;
322
323 inst->U.I.SrcReg[src] = newsrc;
324 }
325
326 /* In case this instruction has been converted, make sure all of the
327 * registers that are no longer used are empty. */
328 opcode = rc_get_opcode_info(inst->U.I.Opcode);
329 for(i = opcode->NumSrcRegs; i < 3; i++) {
330 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
331 }
332 }
333
334 /**
335 * If src and dst use the same register, this function returns a writemask that
336 * indicates which components are read by src. Otherwise zero is returned.
337 */
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)338 static unsigned int src_reads_dst_mask(struct rc_src_register src,
339 struct rc_dst_register dst)
340 {
341 if (dst.File != src.File || dst.Index != src.Index) {
342 return 0;
343 }
344 return rc_swizzle_to_writemask(src.Swizzle);
345 }
346
347 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
348 * in any of its channels. Return 0 otherwise. */
src_has_const_swz(struct rc_src_register src)349 static int src_has_const_swz(struct rc_src_register src) {
350 int chan;
351 for(chan = 0; chan < 4; chan++) {
352 unsigned int swz = GET_SWZ(src.Swizzle, chan);
353 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
354 || swz == RC_SWIZZLE_ONE) {
355 return 1;
356 }
357 }
358 return 0;
359 }
360
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)361 static void presub_scan_read(
362 void * data,
363 struct rc_instruction * inst,
364 struct rc_src_register * src)
365 {
366 struct rc_reader_data * reader_data = data;
367 rc_presubtract_op * presub_opcode = reader_data->CbData;
368
369 if (!rc_inst_can_use_presub(reader_data->C,
370 inst,
371 *presub_opcode,
372 reader_data->Writer->U.I.DstReg.WriteMask,
373 src,
374 &reader_data->Writer->U.I.SrcReg[0],
375 &reader_data->Writer->U.I.SrcReg[1])) {
376 reader_data->Abort = 1;
377 return;
378 }
379 }
380
/**
 * Common driver for the presubtract peepholes.
 *
 * Collects the readers of inst_add and, if none of them vetoes the given
 * presubtract operation, calls presub_replace for every reader source that
 * refers to inst_add's result.
 *
 * @return 1 if the readers were rewritten, 0 if nothing was changed (the scan
 *         aborted or there were no readers).
 */
static int presub_helper(
	struct radeon_compiler *c,
	struct rc_instruction *inst_add,
	rc_presubtract_op presub_opcode,
	rc_presub_replace_fn presub_replace)
{
	struct rc_reader_data reader_data;
	rc_presubtract_op cb_op = presub_opcode;
	unsigned int r;

	reader_data.CbData = &cb_op;
	reader_data.ExitOnAbort = 1;
	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
		       is_src_clobbered_scan_write);

	if (reader_data.ReaderCount == 0 || reader_data.Abort)
		return 0;

	for (r = 0; r < reader_data.ReaderCount; r++) {
		const struct rc_reader *reader = &reader_data.Readers[r];
		struct rc_instruction *reader_inst = reader->Inst;
		const struct rc_opcode_info *info =
			rc_get_opcode_info(reader_inst->U.I.Opcode);
		unsigned int s;

		/* Locate which source slot of the reader this entry refers to. */
		for (s = 0; s < info->NumSrcRegs; s++) {
			if (reader->U.I.Src != &reader_inst->U.I.SrcReg[s])
				continue;
			presub_replace(inst_add, reader_inst, s);
		}
	}

	return 1;
}
412
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)413 static void presub_replace_add(
414 struct rc_instruction * inst_add,
415 struct rc_instruction * inst_reader,
416 unsigned int src_index)
417 {
418 rc_presubtract_op presub_opcode;
419
420 unsigned int negates = 0;
421 if (inst_add->U.I.SrcReg[0].Negate)
422 negates++;
423 if (inst_add->U.I.SrcReg[1].Negate)
424 negates++;
425 assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
426
427 if (negates == 1)
428 presub_opcode = RC_PRESUB_SUB;
429 else
430 presub_opcode = RC_PRESUB_ADD;
431
432 if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
433 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
434 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
435 } else {
436 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
437 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
438 }
439 /* If both sources are negative we can move the negate to the presub. */
440 unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
441 inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
442 inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
443 inst_reader->U.I.PreSub.Opcode = presub_opcode;
444 inst_reader->U.I.SrcReg[src_index] =
445 chain_srcregs(inst_reader->U.I.SrcReg[src_index],
446 inst_reader->U.I.PreSub.SrcReg[0]);
447 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
448 inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
449 }
450
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)451 static int is_presub_candidate(
452 struct radeon_compiler * c,
453 struct rc_instruction * inst)
454 {
455 const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
456 unsigned int i;
457 unsigned int is_constant[2] = {0, 0};
458
459 assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
460
461 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
462 || inst->U.I.SaturateMode
463 || inst->U.I.WriteALUResult
464 || inst->U.I.Omod) {
465 return 0;
466 }
467
468 /* If first two sources use a constant swizzle, then we can't convert it to
469 * a presubtract operation. In fact for the ADD and SUB presubtract
470 * operations neither source can contain a constant swizzle. This
471 * specific case is checked in peephole_add_presub_add() when
472 * we make sure the swizzles for both sources are equal, so we
473 * don't need to worry about it here. */
474 for (i = 0; i < 2; i++) {
475 int chan;
476 for (chan = 0; chan < 4; chan++) {
477 rc_swizzle swz =
478 get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
479 if (swz == RC_SWIZZLE_ONE
480 || swz == RC_SWIZZLE_ZERO
481 || swz == RC_SWIZZLE_HALF) {
482 is_constant[i] = 1;
483 }
484 }
485 }
486 if (is_constant[0] && is_constant[1])
487 return 0;
488
489 for(i = 0; i < info->NumSrcRegs; i++) {
490 struct rc_src_register src = inst->U.I.SrcReg[i];
491 if (src_reads_dst_mask(src, inst->U.I.DstReg))
492 return 0;
493
494 src.File = RC_FILE_PRESUB;
495 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
496 return 0;
497 }
498 return 1;
499 }
500
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)501 static int peephole_add_presub_add(
502 struct radeon_compiler * c,
503 struct rc_instruction * inst_add)
504 {
505 unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
506 unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
507 unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
508
509 if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
510 return 0;
511
512 /* src0 and src1 can't have absolute values */
513 if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
514 return 0;
515
516 /* if src0 is negative, at least all bits of dstmask have to be set */
517 if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
518 return 0;
519
520 /* if src1 is negative, at least all bits of dstmask have to be set */
521 if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
522 return 0;
523
524 if (!is_presub_candidate(c, inst_add))
525 return 0;
526
527 if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
528 rc_remove_instruction(inst_add);
529 return 1;
530 }
531 return 0;
532 }
533
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)534 static void presub_replace_inv(
535 struct rc_instruction * inst_add,
536 struct rc_instruction * inst_reader,
537 unsigned int src_index)
538 {
539 /* We must be careful not to modify inst_add, since it
540 * is possible it will remain part of the program.*/
541 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
542 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
543 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
544 inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
545 inst_reader->U.I.PreSub.SrcReg[0]);
546
547 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
548 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
549 }
550
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)551 static void presub_replace_bias(
552 struct rc_instruction * inst_mad,
553 struct rc_instruction * inst_reader,
554 unsigned int src_index)
555 {
556 /* We must be careful not to modify inst_mad, since it
557 * is possible it will remain part of the program.*/
558 inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
559 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
560 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
561 inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
562 inst_reader->U.I.PreSub.SrcReg[0]);
563
564 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
565 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
566 }
567
568 /**
569 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
570 * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
571 * of the add instruction must have the constant 1 swizzle. This function
572 * does not check const registers to see if their value is 1.0, so it should
573 * be called after the constant_folding optimization.
574 * @return
575 * 0 if the ADD instruction is still part of the program.
576 * 1 if the ADD instruction is no longer part of the program.
577 */
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)578 static int peephole_add_presub_inv(
579 struct radeon_compiler * c,
580 struct rc_instruction * inst_add)
581 {
582 unsigned int i, swz;
583
584 if (!is_presub_candidate(c, inst_add))
585 return 0;
586
587 /* Check if src0 is 1. */
588 /* XXX It would be nice to use is_src_uniform_constant here, but that
589 * function only works if the register's file is RC_FILE_NONE */
590 for(i = 0; i < 4; i++ ) {
591 if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
592 continue;
593
594 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
595 if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
596 return 0;
597 }
598
599 /* Check src1. */
600 if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
601 inst_add->U.I.DstReg.WriteMask
602 || inst_add->U.I.SrcReg[1].Abs
603 || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
604
605 return 0;
606 }
607
608 if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
609 rc_remove_instruction(inst_add);
610 return 1;
611 }
612 return 0;
613 }
614
615 /**
616 * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
617 * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The first source
618 * of the add instruction must have the constant 1 swizzle. This function
619 * does not check const registers to see if their value is 1.0, so it should
620 * be called after the constant_folding optimization.
621 * @return
622 * 0 if the MAD instruction is still part of the program.
623 * 1 if the MAD instruction is no longer part of the program.
624 */
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)625 static int peephole_mad_presub_bias(
626 struct radeon_compiler * c,
627 struct rc_instruction * inst_mad)
628 {
629 unsigned int i, swz;
630
631 if (!is_presub_candidate(c, inst_mad))
632 return 0;
633
634 /* Check if src2 is 1. */
635 for(i = 0; i < 4; i++ ) {
636 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
637 continue;
638
639 swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
640 if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
641 return 0;
642 }
643
644 /* Check if src1 is 2. */
645 struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
646 if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
647 return 0;
648 struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
649 if (constant->Type != RC_CONSTANT_IMMEDIATE)
650 return 0;
651 for (i = 0; i < 4; i++) {
652 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
653 continue;
654 swz = GET_SWZ(src1_reg.Swizzle, i);
655 if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
656 return 0;
657 }
658
659 /* Check src0. */
660 if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
661 inst_mad->U.I.DstReg.WriteMask
662 || inst_mad->U.I.SrcReg[0].Abs
663 || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
664
665 return 0;
666 }
667
668 if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
669 rc_remove_instruction(inst_mad);
670 return 1;
671 }
672 return 0;
673 }
674
/**
 * State shared by the omod_filter_*_cb() callbacks: the destination register
 * being tracked and a flag the callbacks set when an instruction reads or
 * writes it.
 */
struct peephole_mul_cb_data {
	struct rc_dst_register *Writer; /* destination register being tracked */
	unsigned int Clobbered;         /* set to 1 once a conflict is seen */
};
679
/**
 * Read-mask callback for peephole_mul_omod(): marks the tracked destination
 * as clobbered when the reported read overlaps it.
 */
static void omod_filter_reader_cb(
	void *userdata,
	struct rc_instruction *inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *cb = userdata;
	const struct rc_dst_register *tracked = cb->Writer;

	if (!rc_src_reads_dst_mask(file, mask, index,
				   tracked->File, tracked->Index, tracked->WriteMask))
		return;

	cb->Clobbered = 1;
}
694
/**
 * Write-mask callback for peephole_mul_omod(): marks the tracked destination
 * as clobbered when the reported write hits the same register and shares at
 * least one channel with it.
 */
static void omod_filter_writer_cb(
	void *userdata,
	struct rc_instruction *inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *cb = userdata;
	const struct rc_dst_register *tracked = cb->Writer;

	if (file != tracked->File || index != tracked->Index)
		return;
	if (!(mask & tracked->WriteMask))
		return;

	cb->Clobbered = 1;
}
708
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)709 static int peephole_mul_omod(
710 struct radeon_compiler * c,
711 struct rc_instruction * inst_mul,
712 struct rc_list * var_list)
713 {
714 unsigned int chan = 0, swz, i;
715 int const_index = -1;
716 int temp_index = -1;
717 float const_value;
718 rc_omod_op omod_op = RC_OMOD_DISABLE;
719 struct rc_list * writer_list;
720 struct rc_variable * var;
721 struct peephole_mul_cb_data cb_data;
722 unsigned writemask_sum;
723
724 for (i = 0; i < 2; i++) {
725 unsigned int j;
726 if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
727 && inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY) {
728 return 0;
729 }
730 if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
731 if (temp_index != -1) {
732 /* The instruction has two temp sources */
733 return 0;
734 } else {
735 temp_index = i;
736 continue;
737 }
738 }
739 /* If we get this far Src[i] must be a constant src */
740 if (inst_mul->U.I.SrcReg[i].Negate) {
741 return 0;
742 }
743 /* The constant src needs to read from the same swizzle */
744 swz = RC_SWIZZLE_UNUSED;
745 chan = 0;
746 for (j = 0; j < 4; j++) {
747 unsigned int j_swz =
748 GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
749 if (j_swz == RC_SWIZZLE_UNUSED) {
750 continue;
751 }
752 if (swz == RC_SWIZZLE_UNUSED) {
753 swz = j_swz;
754 chan = j;
755 } else if (j_swz != swz) {
756 return 0;
757 }
758 }
759
760 if (const_index != -1) {
761 /* The instruction has two constant sources */
762 return 0;
763 } else {
764 const_index = i;
765 }
766 }
767
768 if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
769 inst_mul->U.I.SrcReg[const_index].Index)) {
770 return 0;
771 }
772 const_value = rc_get_constant_value(c,
773 inst_mul->U.I.SrcReg[const_index].Index,
774 inst_mul->U.I.SrcReg[const_index].Swizzle,
775 inst_mul->U.I.SrcReg[const_index].Negate,
776 chan);
777
778 if (const_value == 2.0f) {
779 omod_op = RC_OMOD_MUL_2;
780 } else if (const_value == 4.0f) {
781 omod_op = RC_OMOD_MUL_4;
782 } else if (const_value == 8.0f) {
783 omod_op = RC_OMOD_MUL_8;
784 } else if (const_value == (1.0f / 2.0f)) {
785 omod_op = RC_OMOD_DIV_2;
786 } else if (const_value == (1.0f / 4.0f)) {
787 omod_op = RC_OMOD_DIV_4;
788 } else if (const_value == (1.0f / 8.0f)) {
789 omod_op = RC_OMOD_DIV_8;
790 } else {
791 return 0;
792 }
793
794 writer_list = rc_variable_list_get_writers_one_reader(var_list,
795 RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
796
797 if (!writer_list) {
798 return 0;
799 }
800
801 cb_data.Clobbered = 0;
802 cb_data.Writer = &inst_mul->U.I.DstReg;
803 for (var = writer_list->Item; var; var = var->Friend) {
804 struct rc_instruction * inst;
805 const struct rc_opcode_info * info = rc_get_opcode_info(
806 var->Inst->U.I.Opcode);
807 if (info->HasTexture) {
808 return 0;
809 }
810 if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
811 return 0;
812 }
813 for (inst = inst_mul->Prev; inst != var->Inst;
814 inst = inst->Prev) {
815 rc_for_all_reads_mask(inst, omod_filter_reader_cb,
816 &cb_data);
817 rc_for_all_writes_mask(inst, omod_filter_writer_cb,
818 &cb_data);
819 if (cb_data.Clobbered) {
820 break;
821 }
822 }
823 }
824
825 if (cb_data.Clobbered) {
826 return 0;
827 }
828
829 writemask_sum = rc_variable_writemask_sum(writer_list->Item);
830
831 /* rc_normal_rewrite_writemask can't expand a previous writemask to store
832 * more channels replicated.
833 */
834 if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
835 return 0;
836
837 /* Rewrite the instructions */
838 for (var = writer_list->Item; var; var = var->Friend) {
839 struct rc_variable * writer = var;
840 unsigned conversion_swizzle = rc_make_conversion_swizzle(
841 writemask_sum,
842 inst_mul->U.I.DstReg.WriteMask);
843 writer->Inst->U.I.Omod = omod_op;
844 writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
845 writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
846 rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
847 writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
848 }
849
850 rc_remove_instruction(inst_mul);
851
852 return 1;
853 }
854
855 /**
856 * @return
857 * 0 if inst is still part of the program.
858 * 1 if inst is no longer part of the program.
859 */
peephole(struct radeon_compiler * c,struct rc_instruction * inst)860 static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
861 {
862 if (!c->has_presub)
863 return 0;
864
865 switch(inst->U.I.Opcode) {
866 case RC_OPCODE_ADD:
867 {
868 if (peephole_add_presub_inv(c, inst))
869 return 1;
870 if (peephole_add_presub_add(c, inst))
871 return 1;
872 break;
873 }
874 case RC_OPCODE_MAD:
875 {
876 if (peephole_mad_presub_bias(c, inst))
877 return 1;
878 break;
879 }
880 default:
881 break;
882 }
883 return 0;
884 }
885
merge_swizzles(unsigned int swz1,unsigned int swz2)886 static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2)
887 {
888 unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
889 for (unsigned int chan = 0; chan < 4; chan++) {
890 unsigned int swz = GET_SWZ(swz1, chan);
891 if (swz != RC_SWIZZLE_UNUSED) {
892 SET_SWZ(new_swz, chan, swz);
893 continue;
894 }
895 swz = GET_SWZ(swz2, chan);
896 SET_SWZ(new_swz, chan, swz);
897 }
898 return new_swz;
899 }
900
901 /* Sets negate to 0 for unused channels. */
clean_negate(struct rc_src_register src)902 static unsigned int clean_negate(struct rc_src_register src)
903 {
904 unsigned int new_negate = 0;
905 for (unsigned int chan = 0; chan < 4; chan++) {
906 unsigned int swz = GET_SWZ(src.Swizzle, chan);
907 if (swz != RC_SWIZZLE_UNUSED)
908 new_negate |= src.Negate & (1 << chan);
909 }
910 return new_negate;
911 }
912
/* Combine the negate masks of two sources, ignoring the negate bits of
 * channels that the respective source does not use. */
static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
	unsigned int negate1 = clean_negate(src1);
	unsigned int negate2 = clean_negate(src2);

	return negate1 | negate2;
}
917
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)918 static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
919 {
920 for (unsigned int chan = 0; chan < 4; chan++) {
921 unsigned int swz = GET_SWZ(orig_swz, chan);
922 if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
923 SET_SWZ(orig_swz, chan, const_swz);
924 }
925 }
926 return orig_swz;
927 }
928
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)929 static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2)
930 {
931 int shared_src = -1;
932 const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
933 const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
934 for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
935 for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
936 if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
937 inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
938 inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
939 inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
940 shared_src = i;
941 }
942 }
943 return shared_src;
944 }
945
946 /**
947 * Merges two MOVs writing different channels of the same destination register
948 * with the use of the constant swizzles.
949 */
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)950 static bool merge_movs(
951 struct radeon_compiler * c,
952 struct rc_instruction * inst,
953 struct rc_instruction * cur)
954 {
955 /* We can merge two MOVs into MOV if one of them is from inline constant,
956 * i.e., constant swizzles and RC_FILE_NONE).
957 *
958 * For example
959 * MOV temp[0].x none.1___
960 * MOV temp[0].y input[0]._x__
961 *
962 * becomes
963 * MOV temp[0].xy input[0].1x__
964 */
965 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
966 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
967 inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
968 struct rc_src_register src;
969 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
970 src = inst->U.I.SrcReg[0];
971 else
972 src = cur->U.I.SrcReg[0];
973 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
974 inst->U.I.SrcReg[0].Swizzle);
975 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
976 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
977 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
978 cur->U.I.SrcReg[0] = src;
979 rc_remove_instruction(inst);
980 return true;
981 }
982 }
983
984 /* Handle the trivial case where the MOVs share a source.
985 *
986 * For example
987 * MOV temp[0].x const[0].x
988 * MOV temp[0].y const[0].z
989 *
990 * becomes
991 * MOV temp[0].xy const[0].xz
992 */
993 if (have_shared_source(inst, cur) == 0) {
994 struct rc_src_register src = cur->U.I.SrcReg[0];
995 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
996 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
997 inst->U.I.SrcReg[0].Swizzle);
998
999 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
1000 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
1001 cur->U.I.SrcReg[0] = src;
1002 rc_remove_instruction(inst);
1003 return true;
1004 }
1005 }
1006
1007 /* Otherwise, we can convert the MOVs into ADD.
1008 *
1009 * For example
1010 * MOV temp[0].x const[0].x
1011 * MOV temp[0].y input[0].y
1012 *
1013 * becomes
1014 * ADD temp[0].xy const[0].x0 input[0].0y
1015 */
1016 unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
1017 struct rc_src_register src0 = inst->U.I.SrcReg[0];
1018 struct rc_src_register src1 = cur->U.I.SrcReg[0];
1019
1020 src0.Swizzle = fill_swizzle(src0.Swizzle,
1021 wmask, RC_SWIZZLE_ZERO);
1022 src1.Swizzle = fill_swizzle(src1.Swizzle,
1023 wmask, RC_SWIZZLE_ZERO);
1024 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
1025 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
1026 return false;
1027
1028 cur->U.I.DstReg.WriteMask = wmask;
1029 cur->U.I.Opcode = RC_OPCODE_ADD;
1030 cur->U.I.SrcReg[0] = src0;
1031 cur->U.I.SrcReg[1] = src1;
1032
1033 /* finally delete the original mov */
1034 rc_remove_instruction(inst);
1035 return true;
1036 }
1037
1038 /**
1039 * This function will try to merge MOV and ADD/MUL instructions with the same
1040 * destination, making use of the constant swizzles.
1041 *
1042 * For example:
1043 * MOV temp[0].x const[0].x
1044 * MUL temp[0].yz const[1].yz const[2].yz
1045 *
1046 * becomes
1047 * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1048 */
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1049 static int merge_mov_add_mul(
1050 struct radeon_compiler * c,
1051 struct rc_instruction * inst1,
1052 struct rc_instruction * inst2)
1053 {
1054 struct rc_instruction * inst, * mov;
1055 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1056 mov = inst1;
1057 inst = inst2;
1058 } else {
1059 mov = inst2;
1060 inst = inst1;
1061 }
1062
1063 const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1064 int shared_index = have_shared_source(inst, mov);
1065 unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1066
1067 /* If there is a shared source, just merge the swizzles and be done with it. */
1068 if (shared_index != -1) {
1069 struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1070 struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1071
1072 shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1073 shared_src.Swizzle = merge_swizzles(shared_src.Swizzle,
1074 mov->U.I.SrcReg[0].Swizzle);
1075 other_src.Negate = clean_negate(other_src);
1076 unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1077 other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1078
1079 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1080 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1081 return 0;
1082
1083 inst2->U.I.Opcode = inst->U.I.Opcode;
1084 inst2->U.I.SrcReg[0] = shared_src;
1085 inst2->U.I.SrcReg[1] = other_src;
1086
1087 /* TODO: we can do a bit better in the special case when one of the sources is none.
1088 * Convert to MAD otherwise.
1089 */
1090 } else {
1091 struct rc_src_register src0, src1, src2;
1092 if (is_mul) {
1093 src2 = mov->U.I.SrcReg[0];
1094 src0 = inst->U.I.SrcReg[0];
1095 src1 = inst->U.I.SrcReg[1];
1096 } else {
1097 src0 = mov->U.I.SrcReg[0];
1098 src1 = inst->U.I.SrcReg[0];
1099 src2 = inst->U.I.SrcReg[1];
1100 }
1101 /* The following login expects that the unused channels have empty negate bits. */
1102 src0.Negate = clean_negate(src0);
1103 src1.Negate = clean_negate(src1);
1104 src2.Negate = clean_negate(src2);
1105
1106 src0.Swizzle = fill_swizzle(src0.Swizzle,
1107 wmask, RC_SWIZZLE_ONE);
1108 src1.Swizzle = fill_swizzle(src1.Swizzle,
1109 wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1110 src2.Swizzle = fill_swizzle(src2.Swizzle,
1111 wmask, RC_SWIZZLE_ZERO);
1112 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1113 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1114 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1115 return 0;
1116
1117 inst2->U.I.Opcode = RC_OPCODE_MAD;
1118 inst2->U.I.SrcReg[0] = src0;
1119 inst2->U.I.SrcReg[1] = src1;
1120 inst2->U.I.SrcReg[2] = src2;
1121 }
1122 inst2->U.I.DstReg.WriteMask = wmask;
1123 /* finally delete the original instruction */
1124 rc_remove_instruction(inst1);
1125
1126 return 1;
1127 }
1128
1129 /**
1130 * This function will try to merge MOV and MAD instructions with the same
1131 * destination, making use of the constant swizzles. This only works
1132 * if there is a shared source or one of the sources is RC_FILE_NONE.
1133 *
1134 * For example:
1135 * MOV temp[0].x const[0].x
1136 * MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1137 *
1138 * becomes
1139 * MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1140 */
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1141 static bool merge_mov_mad(
1142 struct radeon_compiler * c,
1143 struct rc_instruction * inst1,
1144 struct rc_instruction * inst2)
1145 {
1146 struct rc_instruction * mov, * mad;
1147 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1148 mov = inst1;
1149 mad = inst2;
1150 } else {
1151 mov = inst2;
1152 mad = inst1;
1153 }
1154
1155 int shared_index = have_shared_source(mad, mov);
1156 unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1157 struct rc_src_register src[3];
1158 src[0] = mad->U.I.SrcReg[0];
1159 src[1] = mad->U.I.SrcReg[1];
1160 src[2] = mad->U.I.SrcReg[2];
1161
1162 /* Shared source is the one for multiplication. */
1163 if (shared_index == 0 || shared_index == 1) {
1164 src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1165 src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1166 src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
1167 mov->U.I.SrcReg[0].Swizzle);
1168 src[1 - shared_index].Swizzle = fill_swizzle(
1169 src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1170 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1171
1172 /* Shared source is the one for used for addition, or it is none. Additionally,
1173 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1174 * because than we have the highest change the swizzles will be legal.
1175 */
1176 } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1177 src[2].File == RC_FILE_NONE) {
1178 src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1179 src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1180 src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1181 src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1182 if (src[2].File == RC_FILE_NONE) {
1183 src[2].File = mov->U.I.SrcReg[0].File;
1184 src[2].Index = mov->U.I.SrcReg[0].Index;
1185 src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1186 src[2].Abs = mov->U.I.SrcReg[0].Abs;
1187 }
1188
1189 /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1190 * fill the other one with ones and the reg for addition with zeros.
1191 */
1192 } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1193 unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1194 src[none_src] = mov->U.I.SrcReg[0];
1195 src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1196 src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
1197 mad->U.I.SrcReg[none_src].Swizzle);
1198 src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1199 src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
1200 wmask, RC_SWIZZLE_ONE);
1201 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1202 } else {
1203 return false;
1204 }
1205
1206 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1207 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1208 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1209 return false;
1210
1211 inst2->U.I.Opcode = RC_OPCODE_MAD;
1212 inst2->U.I.SrcReg[0] = src[0];
1213 inst2->U.I.SrcReg[1] = src[1];
1214 inst2->U.I.SrcReg[2] = src[2];
1215 inst2->U.I.DstReg.WriteMask = wmask;
1216 rc_remove_instruction(inst1);
1217 return true;
1218 }
1219
/**
 * Tells whether the two instructions use the given pair of opcodes, in
 * either order.
 *
 * \return true when one instruction has opcode1 and the other has opcode2.
 */
static bool inst_combination(
	struct rc_instruction * inst1,
	struct rc_instruction * inst2,
	rc_opcode opcode1,
	rc_opcode opcode2)
{
	if (inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2)
		return true;

	/* Same pair, swapped roles. */
	return inst1->U.I.Opcode == opcode2 && inst2->U.I.Opcode == opcode1;
}
1229
1230 /**
1231 * Searches for instructions writing different channels of the same register that could
1232 * be merged together with the use of constant swizzles.
1233 *
1234 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1235 */
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1236 static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst)
1237 {
1238 unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1239 unsigned int orig_dst_file = inst->U.I.DstReg.File;
1240 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1241 const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1242
1243 struct rc_instruction * cur = inst;
1244 while (cur!= &c->Program.Instructions) {
1245 cur = cur->Next;
1246 const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1247
1248 /* Keep it simple for now and stop when encountering any
1249 * control flow.
1250 */
1251 if (opcode->IsFlowControl)
1252 return;
1253
1254 /* Stop when the original destination is overwritten */
1255 if (orig_dst_reg == cur->U.I.DstReg.Index &&
1256 orig_dst_file == cur->U.I.DstReg.File &&
1257 (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1258 return;
1259
1260 /* Stop the search when the original instruction destination
1261 * is used as a source for anything.
1262 */
1263 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1264 if (cur->U.I.SrcReg[i].File == orig_dst_file &&
1265 cur->U.I.SrcReg[i].Index == orig_dst_reg)
1266 return;
1267 }
1268
1269 /* Stop the search when some of the original sources are touched. */
1270 for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1271 if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1272 inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1273 return;
1274 }
1275
1276 if (cur->U.I.DstReg.File == orig_dst_file &&
1277 cur->U.I.DstReg.Index == orig_dst_reg &&
1278 cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1279 (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1280
1281 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1282 if (merge_movs(c, inst, cur))
1283 return;
1284 }
1285
1286 /* Skip the merge if one of the instructions writes just w channel
1287 * and we are compiling a fragment shader. We can pair-schedule it together
1288 * later anyway and it will also give the scheduler a bit more flexibility.
1289 * Only check this after merging MOVs as when we manage to merge two MOVs
1290 * into another MOV we can still copy propagate it away. So it is a win in
1291 * that case.
1292 */
1293 if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W ||
1294 inst->U.I.DstReg.WriteMask == RC_MASK_W))
1295 continue;
1296
1297 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1298 inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1299 if (merge_mov_add_mul(c, inst, cur))
1300 return;
1301 }
1302
1303 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1304 if (merge_mov_mad(c, inst, cur))
1305 return;
1306 }
1307 }
1308 }
1309 }
1310
1311 /**
1312 * Searches for duplicate ARLs/ARRs
1313 *
1314 * Only a very trivial case is now optimized where if a second one is detected which reads from
1315 * the same register as the first one and source is the same, just remove the second one.
1316 */
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1317 static void merge_A0_loads(
1318 struct radeon_compiler * c,
1319 struct rc_instruction * inst,
1320 bool is_ARL)
1321 {
1322 unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1323 unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1324 unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1325 int cf_depth = 0;
1326
1327 struct rc_instruction * cur = inst;
1328 while (cur != &c->Program.Instructions) {
1329 cur = cur->Next;
1330 const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1331
1332 /* Keep it simple for now and stop when encountering any
1333 * control flow besides simple ifs.
1334 */
1335 if (opcode->IsFlowControl) {
1336 switch (cur->U.I.Opcode) {
1337 case RC_OPCODE_IF:
1338 {
1339 cf_depth++;
1340 break;
1341 }
1342 case RC_OPCODE_ELSE:
1343 {
1344 if (cf_depth < 1)
1345 return;
1346 break;
1347 }
1348 case RC_OPCODE_ENDIF:
1349 {
1350 cf_depth--;
1351 break;
1352 }
1353 default:
1354 return;
1355 }
1356 }
1357
1358 /* Stop when the original source is overwritten */
1359 if (A0_src_reg == cur->U.I.DstReg.Index &&
1360 A0_src_file == cur->U.I.DstReg.File &&
1361 cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1362 return;
1363
1364 /* Wrong A0 load type. */
1365 if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1366 (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1367 return;
1368
1369 if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1370 if (A0_src_reg == cur->U.I.SrcReg[0].Index &&
1371 A0_src_file == cur->U.I.SrcReg[0].File &&
1372 A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1373 struct rc_instruction * next = cur->Next;
1374 rc_remove_instruction(cur);
1375 cur = next;
1376 } else {
1377 return;
1378 }
1379 }
1380 }
1381 }
1382
1383 /**
1384 * According to the GLSL spec, round is only 1.30 and up
1385 * so the only reason why we should ever see round is if it actually
1386 * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1387 * the ARR instead of lowering the round.
1388 */
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1389 static void transform_vertex_ROUND(struct radeon_compiler* c,
1390 struct rc_instruction* inst)
1391 {
1392 struct rc_reader_data readers;
1393 rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1394
1395 assert(readers.ReaderCount > 0);
1396 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1397 struct rc_instruction *reader = readers.Readers[i].Inst;
1398 if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1399 assert(!"Unable to convert ROUND+ARL to ARR\n");
1400 return;
1401 }
1402 }
1403
1404 /* Only ARL readers, convert all to ARR */
1405 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1406 readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1407 }
1408 /* Switch ROUND to MOV and let copy propagate sort it out later. */
1409 inst->U.I.Opcode = RC_OPCODE_MOV;
1410 }
1411
1412 /**
1413 * Apply various optimizations specific to the A0 adress register loads.
1414 */
optimize_A0_loads(struct radeon_compiler * c)1415 static void optimize_A0_loads(struct radeon_compiler * c) {
1416 struct rc_instruction * inst = c->Program.Instructions.Next;
1417
1418 while (inst != &c->Program.Instructions) {
1419 struct rc_instruction * cur = inst;
1420 inst = inst->Next;
1421 if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1422 merge_A0_loads(c, cur, true);
1423 } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1424 merge_A0_loads(c, cur, false);
1425 } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1426 transform_vertex_ROUND(c, cur);
1427 }
1428 }
1429 }
1430
rc_optimize(struct radeon_compiler * c,void * user)1431 void rc_optimize(struct radeon_compiler * c, void *user)
1432 {
1433 struct rc_instruction * inst = c->Program.Instructions.Next;
1434 while(inst != &c->Program.Instructions) {
1435 struct rc_instruction * cur = inst;
1436 inst = inst->Next;
1437 constant_folding(c, cur);
1438 }
1439
1440 /* Copy propagate simple movs away. */
1441 inst = c->Program.Instructions.Next;
1442 while(inst != &c->Program.Instructions) {
1443 struct rc_instruction * cur = inst;
1444 inst = inst->Next;
1445 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1446 copy_propagate(c, cur);
1447 }
1448 }
1449
1450 if (c->type == RC_VERTEX_PROGRAM) {
1451 optimize_A0_loads(c);
1452 }
1453
1454 /* Merge MOVs to same source in different channels using the constant
1455 * swizzle.
1456 */
1457 if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1458 inst = c->Program.Instructions.Next;
1459 while(inst != &c->Program.Instructions) {
1460 struct rc_instruction * cur = inst;
1461 inst = inst->Next;
1462 if (cur->U.I.Opcode == RC_OPCODE_MOV ||
1463 cur->U.I.Opcode == RC_OPCODE_ADD ||
1464 cur->U.I.Opcode == RC_OPCODE_MAD ||
1465 cur->U.I.Opcode == RC_OPCODE_MUL)
1466 merge_channels(c, cur);
1467 }
1468 }
1469
1470 /* Copy propagate few extra movs from the merge_channels pass. */
1471 inst = c->Program.Instructions.Next;
1472 while(inst != &c->Program.Instructions) {
1473 struct rc_instruction * cur = inst;
1474 inst = inst->Next;
1475 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1476 copy_propagate(c, cur);
1477 }
1478 }
1479
1480 if (c->type != RC_FRAGMENT_PROGRAM) {
1481 return;
1482 }
1483
1484 /* Presubtract operations. */
1485 inst = c->Program.Instructions.Next;
1486 while(inst != &c->Program.Instructions) {
1487 struct rc_instruction * cur = inst;
1488 inst = inst->Next;
1489 peephole(c, cur);
1490 }
1491
1492 /* Output modifiers. */
1493 inst = c->Program.Instructions.Next;
1494 struct rc_list * var_list = NULL;
1495 while(inst != &c->Program.Instructions) {
1496 struct rc_instruction * cur = inst;
1497 inst = inst->Next;
1498 if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1499 if (!var_list)
1500 var_list = rc_get_variables(c);
1501 if (peephole_mul_omod(c, cur, var_list))
1502 var_list = NULL;
1503 }
1504 }
1505 }
1506