• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Nicolai Haehnle.
3  * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4  *
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sublicense, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial
17  * portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  */
28 
29 #include "util/u_math.h"
30 
31 #include "radeon_dataflow.h"
32 
33 #include "radeon_compiler.h"
34 #include "radeon_compiler_util.h"
35 #include "radeon_list.h"
36 #include "radeon_swizzle.h"
37 #include "radeon_variable.h"
38 
39 struct src_clobbered_reads_cb_data {
40 	rc_register_file File;
41 	unsigned int Index;
42 	unsigned int Mask;
43 	struct rc_reader_data * ReaderData;
44 };
45 
/**
 * Callback used by presub_helper() to rewrite a single reader.  It receives
 * the instruction being replaced, the instruction that reads its result, and
 * the index of the reader's source register that has to be rewritten.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_writer,
				     struct rc_instruction *inst_reader,
				     unsigned int src_index);
49 
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)50 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
51 {
52 	struct rc_src_register combine;
53 	combine.File = inner.File;
54 	combine.Index = inner.Index;
55 	combine.RelAddr = inner.RelAddr;
56 	if (outer.Abs) {
57 		combine.Abs = 1;
58 		combine.Negate = outer.Negate;
59 	} else {
60 		combine.Abs = inner.Abs;
61 		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
62 		combine.Negate ^= outer.Negate;
63 	}
64 	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
65 	return combine;
66 }
67 
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)68 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
69 						struct rc_src_register * src)
70 {
71 	rc_register_file file = src->File;
72 	struct rc_reader_data * reader_data = data;
73 
74 	if(!rc_inst_can_use_presub(reader_data->C,
75 				inst,
76 				reader_data->Writer->U.I.PreSub.Opcode,
77 				rc_swizzle_to_writemask(src->Swizzle),
78 				src,
79 				&reader_data->Writer->U.I.PreSub.SrcReg[0],
80 				&reader_data->Writer->U.I.PreSub.SrcReg[1])) {
81 		reader_data->Abort = 1;
82 		return;
83 	}
84 
85 	/* XXX This could probably be handled better. */
86 	if (file == RC_FILE_ADDRESS) {
87 		reader_data->Abort = 1;
88 		return;
89 	}
90 
91 	/* R300/R400 is unhappy about propagating
92 	 *  0: MOV temp[1], -none.1111;
93 	 *  1: KIL temp[1];
94 	 * to
95 	 *  0: KIL -none.1111;
96 	 *
97 	 * R500 is fine with it.
98 	 */
99 	if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
100 		reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
101 		reader_data->Abort = 1;
102 		return;
103 	}
104 
105 	/* These instructions cannot read from the constants file.
106 	 * see radeonTransformTEX()
107 	 */
108 	if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
109 			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
110 			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
111 				(inst->U.I.Opcode == RC_OPCODE_TEX ||
112 				inst->U.I.Opcode == RC_OPCODE_TXB ||
113 				inst->U.I.Opcode == RC_OPCODE_TXP ||
114 				inst->U.I.Opcode == RC_OPCODE_TXD ||
115 				inst->U.I.Opcode == RC_OPCODE_TXL ||
116 				inst->U.I.Opcode == RC_OPCODE_KIL)){
117 		reader_data->Abort = 1;
118 		return;
119 	}
120 }
121 
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)122 static void src_clobbered_reads_cb(
123 	void * data,
124 	struct rc_instruction * inst,
125 	struct rc_src_register * src)
126 {
127 	struct src_clobbered_reads_cb_data * sc_data = data;
128 
129 	if (src->File == sc_data->File
130 	    && src->Index == sc_data->Index
131 	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
132 
133 		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
134 	}
135 
136 	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
137 		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
138 	}
139 }
140 
/**
 * Write callback: for the register write (file, index, mask), check every
 * source of the writer stored in the reader data and let
 * src_clobbered_reads_cb() flag any source that this write overlaps.
 */
static void is_src_clobbered_scan_write(
	void * data,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct rc_reader_data *reader_data = data;
	struct src_clobbered_reads_cb_data write_info = {
		.File = file,
		.Index = index,
		.Mask = mask,
		.ReaderData = reader_data,
	};

	rc_for_all_reads_src(reader_data->Writer,
			     src_clobbered_reads_cb, &write_info);
}
157 
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)158 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
159 {
160 	struct rc_reader_data reader_data;
161 	unsigned int i;
162 
163 	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
164 	    inst_mov->U.I.WriteALUResult)
165 		return;
166 
167 	/* Get a list of all the readers of this MOV instruction. */
168 	reader_data.ExitOnAbort = 1;
169 	rc_get_readers(c, inst_mov, &reader_data,
170 		       copy_propagate_scan_read, NULL,
171 		       is_src_clobbered_scan_write);
172 
173 	if (reader_data.Abort || reader_data.ReaderCount == 0)
174 		return;
175 
176 	/* We can propagate SaturateMode if all the readers are MOV instructions
177 	 * without a presubtract operation, source negation and absolute.
178 	 * In that case, we just move SaturateMode to all readers. */
179         if (inst_mov->U.I.SaturateMode) {
180 		for (i = 0; i < reader_data.ReaderCount; i++) {
181 			struct rc_instruction * inst = reader_data.Readers[i].Inst;
182 
183 			if (inst->U.I.Opcode != RC_OPCODE_MOV ||
184 			    inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
185 			    inst->U.I.SrcReg[0].Abs ||
186 			    inst->U.I.SrcReg[0].Negate) {
187 				return;
188 			}
189 		}
190 	}
191 
192 	/* Propagate the MOV instruction. */
193 	for (i = 0; i < reader_data.ReaderCount; i++) {
194 		struct rc_instruction * inst = reader_data.Readers[i].Inst;
195 		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
196 
197 		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
198 			inst->U.I.PreSub = inst_mov->U.I.PreSub;
199 		if (!inst->U.I.SaturateMode)
200 			inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
201 	}
202 
203 	/* Finally, remove the original MOV instruction */
204 	rc_remove_instruction(inst_mov);
205 }
206 
207 /**
208  * Check if a source register is actually always the same
209  * swizzle constant.
210  */
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)211 static int is_src_uniform_constant(struct rc_src_register src,
212 		rc_swizzle * pswz, unsigned int * pnegate)
213 {
214 	int have_used = 0;
215 
216 	if (src.File != RC_FILE_NONE) {
217 		*pswz = 0;
218 		return 0;
219 	}
220 
221 	for(unsigned int chan = 0; chan < 4; ++chan) {
222 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
223 		if (swz < 4) {
224 			*pswz = 0;
225 			return 0;
226 		}
227 		if (swz == RC_SWIZZLE_UNUSED)
228 			continue;
229 
230 		if (!have_used) {
231 			*pswz = swz;
232 			*pnegate = GET_BIT(src.Negate, chan);
233 			have_used = 1;
234 		} else {
235 			if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
236 				*pswz = 0;
237 				return 0;
238 			}
239 		}
240 	}
241 
242 	return 1;
243 }
244 
245 /**
246  * Replace 0.0, 1.0 and 0.5 immediate constants by their
247  * respective swizzles. Simplify instructions like ADD dst, src, 0;
248  */
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)249 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
250 {
251 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
252 	unsigned int i;
253 
254 	/* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
255 	for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
256 		struct rc_constant * constant;
257 		struct rc_src_register newsrc;
258 		int have_real_reference;
259 		unsigned int chan;
260 
261 		/* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
262 		for (chan = 0; chan < 4; ++chan)
263 			if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
264 				break;
265 		if (chan == 4) {
266 			inst->U.I.SrcReg[src].File = RC_FILE_NONE;
267 			continue;
268 		}
269 
270 		/* Convert immediates to swizzles. */
271 		if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
272 		    inst->U.I.SrcReg[src].RelAddr ||
273 		    inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
274 			continue;
275 
276 		constant =
277 			&c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
278 
279 		if (constant->Type != RC_CONSTANT_IMMEDIATE)
280 			continue;
281 
282 		newsrc = inst->U.I.SrcReg[src];
283 		have_real_reference = 0;
284 		for (chan = 0; chan < 4; ++chan) {
285 			unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
286 			unsigned int newswz;
287 			float imm;
288 			float baseimm;
289 
290 			if (swz >= 4)
291 				continue;
292 
293 			imm = constant->u.Immediate[swz];
294 			baseimm = imm;
295 			if (imm < 0.0)
296 				baseimm = -baseimm;
297 
298 			if (baseimm == 0.0) {
299 				newswz = RC_SWIZZLE_ZERO;
300 			} else if (baseimm == 1.0) {
301 				newswz = RC_SWIZZLE_ONE;
302 			} else if (baseimm == 0.5 && c->has_half_swizzles) {
303 				newswz = RC_SWIZZLE_HALF;
304 			} else {
305 				have_real_reference = 1;
306 				continue;
307 			}
308 
309 			SET_SWZ(newsrc.Swizzle, chan, newswz);
310 			if (imm < 0.0 && !newsrc.Abs)
311 				newsrc.Negate ^= 1 << chan;
312 		}
313 
314 		if (!have_real_reference) {
315 			newsrc.File = RC_FILE_NONE;
316 			newsrc.Index = 0;
317 		}
318 
319 		/* don't make the swizzle worse */
320 		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
321 			continue;
322 
323 		inst->U.I.SrcReg[src] = newsrc;
324 	}
325 
326 	/* In case this instruction has been converted, make sure all of the
327 	 * registers that are no longer used are empty. */
328 	opcode = rc_get_opcode_info(inst->U.I.Opcode);
329 	for(i = opcode->NumSrcRegs; i < 3; i++) {
330 		memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
331 	}
332 }
333 
334 /**
335  * If src and dst use the same register, this function returns a writemask that
336  * indicates which components are read by src.  Otherwise zero is returned.
337  */
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)338 static unsigned int src_reads_dst_mask(struct rc_src_register src,
339 						struct rc_dst_register dst)
340 {
341 	if (dst.File != src.File || dst.Index != src.Index) {
342 		return 0;
343 	}
344 	return rc_swizzle_to_writemask(src.Swizzle);
345 }
346 
347 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
348  * in any of its channels.  Return 0 otherwise. */
src_has_const_swz(struct rc_src_register src)349 static int src_has_const_swz(struct rc_src_register src) {
350 	int chan;
351 	for(chan = 0; chan < 4; chan++) {
352 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
353 		if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
354 						|| swz == RC_SWIZZLE_ONE) {
355 			return 1;
356 		}
357 	}
358 	return 0;
359 }
360 
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)361 static void presub_scan_read(
362 	void * data,
363 	struct rc_instruction * inst,
364 	struct rc_src_register * src)
365 {
366 	struct rc_reader_data * reader_data = data;
367 	rc_presubtract_op * presub_opcode = reader_data->CbData;
368 
369 	if (!rc_inst_can_use_presub(reader_data->C,
370 			inst,
371 			*presub_opcode,
372 			reader_data->Writer->U.I.DstReg.WriteMask,
373 			src,
374 			&reader_data->Writer->U.I.SrcReg[0],
375 			&reader_data->Writer->U.I.SrcReg[1])) {
376 		reader_data->Abort = 1;
377 		return;
378 	}
379 }
380 
/**
 * Collect the readers of `inst_add` and, if all of them can use the
 * presubtract operation `presub_opcode`, rewrite each reading source with
 * `presub_replace`.
 *
 * @return 1 if the readers were rewritten, 0 if nothing was changed (the
 *         scan aborted or there are no readers).
 */
static int presub_helper(
	struct radeon_compiler * c,
	struct rc_instruction * inst_add,
	rc_presubtract_op presub_opcode,
	rc_presub_replace_fn presub_replace)
{
	struct rc_reader_data reader_data;
	unsigned int i;

	/* presub_scan_read() reads the opcode back through CbData. */
	reader_data.CbData = &presub_opcode;
	reader_data.ExitOnAbort = 1;
	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
		       is_src_clobbered_scan_write);

	if (reader_data.Abort || reader_data.ReaderCount == 0)
		return 0;

	for (i = 0; i < reader_data.ReaderCount; i++) {
		const struct rc_reader *reader = &reader_data.Readers[i];
		struct rc_instruction *reader_inst = reader->Inst;
		const struct rc_opcode_info *info =
			rc_get_opcode_info(reader_inst->U.I.Opcode);
		unsigned int src_index;

		/* Find which source slot of the reader this entry refers to. */
		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
			if (reader->U.I.Src != &reader_inst->U.I.SrcReg[src_index])
				continue;
			presub_replace(inst_add, reader_inst, src_index);
		}
	}
	return 1;
}
412 
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)413 static void presub_replace_add(
414 	struct rc_instruction * inst_add,
415 	struct rc_instruction * inst_reader,
416 	unsigned int src_index)
417 {
418 	rc_presubtract_op presub_opcode;
419 
420 	unsigned int negates = 0;
421 	if (inst_add->U.I.SrcReg[0].Negate)
422 		negates++;
423 	if (inst_add->U.I.SrcReg[1].Negate)
424 		negates++;
425 	assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
426 
427 	if (negates == 1)
428 		presub_opcode = RC_PRESUB_SUB;
429 	else
430 		presub_opcode = RC_PRESUB_ADD;
431 
432 	if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
433 		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
434 		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
435 	} else {
436 		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
437 		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
438 	}
439 	/* If both sources are negative we can move the negate to the presub. */
440 	unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
441 	inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
442 	inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
443 	inst_reader->U.I.PreSub.Opcode = presub_opcode;
444 	inst_reader->U.I.SrcReg[src_index] =
445 			chain_srcregs(inst_reader->U.I.SrcReg[src_index],
446 					inst_reader->U.I.PreSub.SrcReg[0]);
447 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
448 	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
449 }
450 
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)451 static int is_presub_candidate(
452 	struct radeon_compiler * c,
453 	struct rc_instruction * inst)
454 {
455 	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
456 	unsigned int i;
457 	unsigned int is_constant[2] = {0, 0};
458 
459 	assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
460 
461 	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
462 			|| inst->U.I.SaturateMode
463 			|| inst->U.I.WriteALUResult
464 			|| inst->U.I.Omod) {
465 		return 0;
466 	}
467 
468 	/* If first two sources use a constant swizzle, then we can't convert it to
469 	 * a presubtract operation.  In fact for the ADD and SUB presubtract
470 	 * operations neither source can contain a constant swizzle.  This
471 	 * specific case is checked in peephole_add_presub_add() when
472 	 * we make sure the swizzles for both sources are equal, so we
473 	 * don't need to worry about it here. */
474 	for (i = 0; i < 2; i++) {
475 		int chan;
476 		for (chan = 0; chan < 4; chan++) {
477 			rc_swizzle swz =
478 				get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
479 			if (swz == RC_SWIZZLE_ONE
480 					|| swz == RC_SWIZZLE_ZERO
481 					|| swz == RC_SWIZZLE_HALF) {
482 				is_constant[i] = 1;
483 			}
484 		}
485 	}
486 	if (is_constant[0] && is_constant[1])
487 		return 0;
488 
489 	for(i = 0; i < info->NumSrcRegs; i++) {
490 		struct rc_src_register src = inst->U.I.SrcReg[i];
491 		if (src_reads_dst_mask(src, inst->U.I.DstReg))
492 			return 0;
493 
494 		src.File = RC_FILE_PRESUB;
495 		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
496 			return 0;
497 	}
498 	return 1;
499 }
500 
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)501 static int peephole_add_presub_add(
502 	struct radeon_compiler * c,
503 	struct rc_instruction * inst_add)
504 {
505 	unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
506         unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
507         unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
508 
509 	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
510 		return 0;
511 
512 	/* src0 and src1 can't have absolute values */
513 	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
514 	        return 0;
515 
516         /* if src0 is negative, at least all bits of dstmask have to be set */
517         if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
518 	        return 0;
519 
520         /* if src1 is negative, at least all bits of dstmask have to be set */
521         if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
522 	        return 0;
523 
524 	if (!is_presub_candidate(c, inst_add))
525 		return 0;
526 
527 	if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
528 		rc_remove_instruction(inst_add);
529 		return 1;
530 	}
531 	return 0;
532 }
533 
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)534 static void presub_replace_inv(
535 	struct rc_instruction * inst_add,
536 	struct rc_instruction * inst_reader,
537 	unsigned int src_index)
538 {
539 	/* We must be careful not to modify inst_add, since it
540 	 * is possible it will remain part of the program.*/
541 	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
542 	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
543 	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
544 	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
545 						inst_reader->U.I.PreSub.SrcReg[0]);
546 
547 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
548 	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
549 }
550 
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)551 static void presub_replace_bias(
552 	struct rc_instruction * inst_mad,
553 	struct rc_instruction * inst_reader,
554 	unsigned int src_index)
555 {
556 	/* We must be careful not to modify inst_mad, since it
557 	 * is possible it will remain part of the program.*/
558 	inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
559 	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
560 	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
561 	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
562 						inst_reader->U.I.PreSub.SrcReg[0]);
563 
564 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
565 	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
566 }
567 
568 /**
569  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
570  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
571  * of the add instruction must have the constant 1 swizzle.  This function
572  * does not check const registers to see if their value is 1.0, so it should
573  * be called after the constant_folding optimization.
574  * @return
575  * 	0 if the ADD instruction is still part of the program.
576  * 	1 if the ADD instruction is no longer part of the program.
577  */
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)578 static int peephole_add_presub_inv(
579 	struct radeon_compiler * c,
580 	struct rc_instruction * inst_add)
581 {
582 	unsigned int i, swz;
583 
584 	if (!is_presub_candidate(c, inst_add))
585 		return 0;
586 
587 	/* Check if src0 is 1. */
588 	/* XXX It would be nice to use is_src_uniform_constant here, but that
589 	 * function only works if the register's file is RC_FILE_NONE */
590 	for(i = 0; i < 4; i++ ) {
591 		if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
592 			continue;
593 
594 		swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
595 		if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
596 			return 0;
597 	}
598 
599 	/* Check src1. */
600 	if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
601 						inst_add->U.I.DstReg.WriteMask
602 		|| inst_add->U.I.SrcReg[1].Abs
603 		|| src_has_const_swz(inst_add->U.I.SrcReg[1])) {
604 
605 		return 0;
606 	}
607 
608 	if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
609 		rc_remove_instruction(inst_add);
610 		return 1;
611 	}
612 	return 0;
613 }
614 
615 /**
616  * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
617  * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The first source
618  * of the add instruction must have the constant 1 swizzle.  This function
619  * does not check const registers to see if their value is 1.0, so it should
620  * be called after the constant_folding optimization.
621  * @return
622  * 	0 if the MAD instruction is still part of the program.
623  * 	1 if the MAD instruction is no longer part of the program.
624  */
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)625 static int peephole_mad_presub_bias(
626 	struct radeon_compiler * c,
627 	struct rc_instruction * inst_mad)
628 {
629 	unsigned int i, swz;
630 
631 	if (!is_presub_candidate(c, inst_mad))
632 		return 0;
633 
634 	/* Check if src2 is 1. */
635 	for(i = 0; i < 4; i++ ) {
636 		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
637 			continue;
638 
639 		swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
640 		if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
641 			return 0;
642 	}
643 
644 	/* Check if src1 is 2. */
645 	struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
646 	if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
647 		return 0;
648         struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
649 	if (constant->Type != RC_CONSTANT_IMMEDIATE)
650 		return 0;
651         for (i = 0; i < 4; i++) {
652 		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
653 			continue;
654 		swz = GET_SWZ(src1_reg.Swizzle, i);
655 		if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
656 			return 0;
657 	}
658 
659 	/* Check src0. */
660 	if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
661 						inst_mad->U.I.DstReg.WriteMask
662 		|| inst_mad->U.I.SrcReg[0].Abs
663 		|| src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
664 
665 		return 0;
666 	}
667 
668 	if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
669 		rc_remove_instruction(inst_mad);
670 		return 1;
671 	}
672 	return 0;
673 }
674 
/**
 * State shared by the omod_filter_*_cb() callbacks: the destination register
 * being tracked and a flag that is set once an instruction reads or writes
 * it.
 */
struct peephole_mul_cb_data {
	struct rc_dst_register *Writer; /* destination register being tracked */
	unsigned int Clobbered;         /* set to 1 on a conflicting access */
};
679 
/**
 * Read callback: flags the tracked destination register as clobbered when
 * the read (file, index, mask) touches it.
 */
static void omod_filter_reader_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *cb = userdata;
	const struct rc_dst_register *dst = cb->Writer;

	if (!rc_src_reads_dst_mask(file, mask, index,
				   dst->File, dst->Index, dst->WriteMask))
		return;

	cb->Clobbered = 1;
}
694 
/**
 * Write callback: flags the tracked destination register as clobbered when
 * the write (file, index, mask) hits the same register and shares at least
 * one channel with it.
 */
static void omod_filter_writer_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *cb = userdata;
	const struct rc_dst_register *dst = cb->Writer;

	if (file != dst->File || index != dst->Index)
		return;
	if (!(mask & dst->WriteMask))
		return;

	cb->Clobbered = 1;
}
708 
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)709 static int peephole_mul_omod(
710 	struct radeon_compiler * c,
711 	struct rc_instruction * inst_mul,
712 	struct rc_list * var_list)
713 {
714 	unsigned int chan = 0, swz, i;
715 	int const_index = -1;
716 	int temp_index = -1;
717 	float const_value;
718 	rc_omod_op omod_op = RC_OMOD_DISABLE;
719 	struct rc_list * writer_list;
720 	struct rc_variable * var;
721 	struct peephole_mul_cb_data cb_data;
722 	unsigned writemask_sum;
723 
724 	for (i = 0; i < 2; i++) {
725 		unsigned int j;
726 		if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
727 			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY) {
728 			return 0;
729 		}
730 		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
731 			if (temp_index != -1) {
732 				/* The instruction has two temp sources */
733 				return 0;
734 			} else {
735 				temp_index = i;
736 				continue;
737 			}
738 		}
739 		/* If we get this far Src[i] must be a constant src */
740 		if (inst_mul->U.I.SrcReg[i].Negate) {
741 			return 0;
742 		}
743 		/* The constant src needs to read from the same swizzle */
744 		swz = RC_SWIZZLE_UNUSED;
745 		chan = 0;
746 		for (j = 0; j < 4; j++) {
747 			unsigned int j_swz =
748 				GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
749 			if (j_swz == RC_SWIZZLE_UNUSED) {
750 				continue;
751 			}
752 			if (swz == RC_SWIZZLE_UNUSED) {
753 				swz = j_swz;
754 				chan = j;
755 			} else if (j_swz != swz) {
756 				return 0;
757 			}
758 		}
759 
760 		if (const_index != -1) {
761 			/* The instruction has two constant sources */
762 			return 0;
763 		} else {
764 			const_index = i;
765 		}
766 	}
767 
768 	if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
769 				inst_mul->U.I.SrcReg[const_index].Index)) {
770 		return 0;
771 	}
772 	const_value = rc_get_constant_value(c,
773 			inst_mul->U.I.SrcReg[const_index].Index,
774 			inst_mul->U.I.SrcReg[const_index].Swizzle,
775 			inst_mul->U.I.SrcReg[const_index].Negate,
776 			chan);
777 
778 	if (const_value == 2.0f) {
779 		omod_op = RC_OMOD_MUL_2;
780 	} else if (const_value == 4.0f) {
781 		omod_op = RC_OMOD_MUL_4;
782 	} else if (const_value == 8.0f) {
783 		omod_op = RC_OMOD_MUL_8;
784 	} else if (const_value == (1.0f / 2.0f)) {
785 		omod_op = RC_OMOD_DIV_2;
786 	} else if (const_value == (1.0f / 4.0f)) {
787 		omod_op = RC_OMOD_DIV_4;
788 	} else if (const_value == (1.0f / 8.0f)) {
789 		omod_op = RC_OMOD_DIV_8;
790 	} else {
791 		return 0;
792 	}
793 
794 	writer_list = rc_variable_list_get_writers_one_reader(var_list,
795 		RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
796 
797 	if (!writer_list) {
798 		return 0;
799 	}
800 
801 	cb_data.Clobbered = 0;
802 	cb_data.Writer = &inst_mul->U.I.DstReg;
803 	for (var = writer_list->Item; var; var = var->Friend) {
804 		struct rc_instruction * inst;
805 		const struct rc_opcode_info * info = rc_get_opcode_info(
806 				var->Inst->U.I.Opcode);
807 		if (info->HasTexture) {
808 			return 0;
809 		}
810 		if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
811 			return 0;
812 		}
813 		for (inst = inst_mul->Prev; inst != var->Inst;
814 							inst = inst->Prev) {
815 			rc_for_all_reads_mask(inst, omod_filter_reader_cb,
816 								&cb_data);
817 			rc_for_all_writes_mask(inst, omod_filter_writer_cb,
818 								&cb_data);
819 			if (cb_data.Clobbered) {
820 				break;
821 			}
822 		}
823 	}
824 
825 	if (cb_data.Clobbered) {
826 		return 0;
827 	}
828 
829 	writemask_sum = rc_variable_writemask_sum(writer_list->Item);
830 
831 	/* rc_normal_rewrite_writemask can't expand a previous writemask to store
832 	 * more channels replicated.
833 	 */
834 	if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
835 		return 0;
836 
837 	/* Rewrite the instructions */
838 	for (var = writer_list->Item; var; var = var->Friend) {
839 		struct rc_variable * writer = var;
840 		unsigned conversion_swizzle = rc_make_conversion_swizzle(
841 					writemask_sum,
842 					inst_mul->U.I.DstReg.WriteMask);
843 		writer->Inst->U.I.Omod = omod_op;
844 		writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
845 		writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
846 		rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
847 		writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
848 	}
849 
850 	rc_remove_instruction(inst_mul);
851 
852 	return 1;
853 }
854 
855 /**
856  * @return
857  * 	0 if inst is still part of the program.
858  * 	1 if inst is no longer part of the program.
859  */
peephole(struct radeon_compiler * c,struct rc_instruction * inst)860 static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
861 {
862 	if (!c->has_presub)
863 		return 0;
864 
865 	switch(inst->U.I.Opcode) {
866 	case RC_OPCODE_ADD:
867 	{
868 		if (peephole_add_presub_inv(c, inst))
869 			return 1;
870 		if (peephole_add_presub_add(c, inst))
871 			return 1;
872 		break;
873 	}
874 	case RC_OPCODE_MAD:
875 	{
876 		if (peephole_mad_presub_bias(c, inst))
877 			return 1;
878 		break;
879 	}
880 	default:
881 		break;
882 	}
883 	return 0;
884 }
885 
merge_swizzles(unsigned int swz1,unsigned int swz2)886 static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2)
887 {
888 	unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
889 	for (unsigned int chan = 0; chan < 4; chan++) {
890 		unsigned int swz = GET_SWZ(swz1, chan);
891 		if (swz != RC_SWIZZLE_UNUSED) {
892 			SET_SWZ(new_swz, chan, swz);
893 			continue;
894 		}
895 		swz = GET_SWZ(swz2, chan);
896 		SET_SWZ(new_swz, chan, swz);
897 	}
898 	return new_swz;
899 }
900 
901 /* Sets negate to 0 for unused channels. */
clean_negate(struct rc_src_register src)902 static unsigned int clean_negate(struct rc_src_register src)
903 {
904 	unsigned int new_negate = 0;
905 	for (unsigned int chan = 0; chan < 4; chan++) {
906 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
907 		if (swz != RC_SWIZZLE_UNUSED)
908 			new_negate |= src.Negate & (1 << chan);
909 	}
910 	return new_negate;
911 }
912 
/* Union of the negate masks of both sources, ignoring unused channels. */
static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
	const unsigned int negate1 = clean_negate(src1);
	const unsigned int negate2 = clean_negate(src2);

	return negate1 | negate2;
}
917 
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)918 static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
919 {
920 	for (unsigned int chan = 0; chan < 4; chan++) {
921 		unsigned int swz = GET_SWZ(orig_swz, chan);
922 		if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
923 			SET_SWZ(orig_swz, chan, const_swz);
924 		}
925 	}
926 	return orig_swz;
927 }
928 
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)929 static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2)
930 {
931 	int shared_src = -1;
932 	const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
933 	const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
934 	for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
935 		for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
936 			if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
937 				inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
938 				inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
939 				inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
940 				shared_src = i;
941 		}
942 	}
943 	return shared_src;
944 }
945 
946 /**
947  * Merges two MOVs writing different channels of the same destination register
948  * with the use of the constant swizzles.
949  */
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)950 static bool merge_movs(
951 	struct radeon_compiler * c,
952 	struct rc_instruction * inst,
953 	struct rc_instruction * cur)
954 {
955 	/* We can merge two MOVs into MOV if one of them is from inline constant,
956 	 * i.e., constant swizzles and RC_FILE_NONE).
957 	 *
958 	 * For example
959 	 *   MOV temp[0].x none.1___
960 	 *   MOV temp[0].y input[0]._x__
961 	 *
962 	 * becomes
963 	 *   MOV temp[0].xy input[0].1x__
964 	 */
965 	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
966 	if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
967 		inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
968 		struct rc_src_register src;
969 		if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
970 			src = inst->U.I.SrcReg[0];
971 		else
972 			src = cur->U.I.SrcReg[0];
973 		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
974 						inst->U.I.SrcReg[0].Swizzle);
975 		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
976 		if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
977 			cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
978 			cur->U.I.SrcReg[0] = src;
979 			rc_remove_instruction(inst);
980 			return true;
981 		}
982 	}
983 
984 	/* Handle the trivial case where the MOVs share a source.
985 	 *
986 	 * For example
987 	 *   MOV temp[0].x const[0].x
988 	 *   MOV temp[0].y const[0].z
989 	 *
990 	 * becomes
991 	 *   MOV temp[0].xy const[0].xz
992 	 */
993 	if (have_shared_source(inst, cur) == 0) {
994 		struct rc_src_register src = cur->U.I.SrcReg[0];
995 		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
996 		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
997 						inst->U.I.SrcReg[0].Swizzle);
998 
999                 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
1000                         cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
1001                         cur->U.I.SrcReg[0] = src;
1002                         rc_remove_instruction(inst);
1003                         return true;
1004                 }
1005 	}
1006 
1007 	/* Otherwise, we can convert the MOVs into ADD.
1008 	 *
1009 	 * For example
1010 	 *   MOV temp[0].x const[0].x
1011 	 *   MOV temp[0].y input[0].y
1012 	 *
1013 	 * becomes
1014 	 *   ADD temp[0].xy const[0].x0 input[0].0y
1015 	 */
1016 	unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
1017 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
1018 	struct rc_src_register src1 = cur->U.I.SrcReg[0];
1019 
1020 	src0.Swizzle = fill_swizzle(src0.Swizzle,
1021 				wmask, RC_SWIZZLE_ZERO);
1022 	src1.Swizzle = fill_swizzle(src1.Swizzle,
1023 				wmask, RC_SWIZZLE_ZERO);
1024 	if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
1025 		!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
1026 		return false;
1027 
1028 	cur->U.I.DstReg.WriteMask = wmask;
1029 	cur->U.I.Opcode = RC_OPCODE_ADD;
1030 	cur->U.I.SrcReg[0] = src0;
1031 	cur->U.I.SrcReg[1] = src1;
1032 
1033 	/* finally delete the original mov */
1034 	rc_remove_instruction(inst);
1035 	return true;
1036 }
1037 
1038 /**
1039  * This function will try to merge MOV and ADD/MUL instructions with the same
1040  * destination, making use of the constant swizzles.
1041  *
1042  * For example:
1043  *   MOV temp[0].x const[0].x
1044  *   MUL temp[0].yz const[1].yz const[2].yz
1045  *
1046  * becomes
1047  *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1048  */
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1049 static int merge_mov_add_mul(
1050 	struct radeon_compiler * c,
1051 	struct rc_instruction * inst1,
1052 	struct rc_instruction * inst2)
1053 {
1054 	struct rc_instruction * inst, * mov;
1055 	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1056 		mov = inst1;
1057 		inst = inst2;
1058 	} else {
1059 		mov = inst2;
1060 		inst = inst1;
1061 	}
1062 
1063 	const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1064 	int shared_index = have_shared_source(inst, mov);
1065 	unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1066 
1067 	/* If there is a shared source, just merge the swizzles and be done with it. */
1068 	if (shared_index != -1) {
1069 		struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1070 		struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1071 
1072 		shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1073 		shared_src.Swizzle = merge_swizzles(shared_src.Swizzle,
1074 					mov->U.I.SrcReg[0].Swizzle);
1075 		other_src.Negate = clean_negate(other_src);
1076 		unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1077 		other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1078 
1079 		if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1080 			!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1081 			return 0;
1082 
1083 		inst2->U.I.Opcode = inst->U.I.Opcode;
1084 		inst2->U.I.SrcReg[0] = shared_src;
1085 		inst2->U.I.SrcReg[1] = other_src;
1086 
1087 	/* TODO: we can do a bit better in the special case when one of the sources is none.
1088 	 * Convert to MAD otherwise.
1089 	 */
1090 	} else {
1091 		struct rc_src_register src0, src1, src2;
1092 		if (is_mul) {
1093 			src2 = mov->U.I.SrcReg[0];
1094 			src0 = inst->U.I.SrcReg[0];
1095 			src1 = inst->U.I.SrcReg[1];
1096 		} else {
1097 			src0 = mov->U.I.SrcReg[0];
1098 			src1 = inst->U.I.SrcReg[0];
1099 			src2 = inst->U.I.SrcReg[1];
1100 		}
1101 		/* The following login expects that the unused channels have empty negate bits. */
1102 		src0.Negate = clean_negate(src0);
1103 		src1.Negate = clean_negate(src1);
1104 		src2.Negate = clean_negate(src2);
1105 
1106 		src0.Swizzle = fill_swizzle(src0.Swizzle,
1107 					wmask, RC_SWIZZLE_ONE);
1108 		src1.Swizzle = fill_swizzle(src1.Swizzle,
1109 					wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1110 		src2.Swizzle = fill_swizzle(src2.Swizzle,
1111 					wmask, RC_SWIZZLE_ZERO);
1112 		if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1113 			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1114 			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1115 			return 0;
1116 
1117 		inst2->U.I.Opcode = RC_OPCODE_MAD;
1118 		inst2->U.I.SrcReg[0] = src0;
1119 		inst2->U.I.SrcReg[1] = src1;
1120 		inst2->U.I.SrcReg[2] = src2;
1121 	}
1122 	inst2->U.I.DstReg.WriteMask = wmask;
1123 	/* finally delete the original instruction */
1124 	rc_remove_instruction(inst1);
1125 
1126 	return 1;
1127 }
1128 
1129 /**
1130  * This function will try to merge MOV and MAD instructions with the same
1131  * destination, making use of the constant swizzles. This only works
1132  * if there is a shared source or one of the sources is RC_FILE_NONE.
1133  *
1134  * For example:
1135  *   MOV temp[0].x const[0].x
1136  *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1137  *
1138  * becomes
1139  *   MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1140  */
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1141 static bool merge_mov_mad(
1142 	struct radeon_compiler * c,
1143 	struct rc_instruction * inst1,
1144 	struct rc_instruction * inst2)
1145 {
1146 	struct rc_instruction * mov, * mad;
1147 	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1148 		mov = inst1;
1149 		mad = inst2;
1150 	} else {
1151 		mov = inst2;
1152 		mad = inst1;
1153 	}
1154 
1155 	int shared_index = have_shared_source(mad, mov);
1156 	unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1157 	struct rc_src_register src[3];
1158 	src[0] = mad->U.I.SrcReg[0];
1159 	src[1] = mad->U.I.SrcReg[1];
1160 	src[2] = mad->U.I.SrcReg[2];
1161 
1162 	/* Shared source is the one for multiplication. */
1163 	if (shared_index == 0 || shared_index == 1) {
1164 		src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1165 		src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1166 		src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
1167 				mov->U.I.SrcReg[0].Swizzle);
1168 		src[1 - shared_index].Swizzle = fill_swizzle(
1169 				src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1170 		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1171 
1172 	/* Shared source is the one for used for addition, or it is none. Additionally,
1173 	 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1174 	 * because than we have the highest change the swizzles will be legal.
1175 	 */
1176 	} else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1177 			src[2].File == RC_FILE_NONE) {
1178 		src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1179 		src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1180 		src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1181 		src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1182 		if (src[2].File == RC_FILE_NONE) {
1183 			src[2].File = mov->U.I.SrcReg[0].File;
1184 			src[2].Index = mov->U.I.SrcReg[0].Index;
1185 			src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1186 			src[2].Abs = mov->U.I.SrcReg[0].Abs;
1187 		}
1188 
1189 	/* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1190 	 * fill the other one with ones and the reg for addition with zeros.
1191 	 */
1192 	} else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1193 		unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1194 		src[none_src] = mov->U.I.SrcReg[0];
1195 		src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1196 		src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
1197 				mad->U.I.SrcReg[none_src].Swizzle);
1198 		src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1199 		src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
1200 				wmask, RC_SWIZZLE_ONE);
1201 		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1202 	} else {
1203 		return false;
1204 	}
1205 
1206 	if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1207 		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1208 		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1209 		return false;
1210 
1211 	inst2->U.I.Opcode = RC_OPCODE_MAD;
1212 	inst2->U.I.SrcReg[0] = src[0];
1213 	inst2->U.I.SrcReg[1] = src[1];
1214 	inst2->U.I.SrcReg[2] = src[2];
1215 	inst2->U.I.DstReg.WriteMask = wmask;
1216 	rc_remove_instruction(inst1);
1217 	return true;
1218 }
1219 
/**
 * Returns true when the two instructions have the opcodes \p opcode1 and
 * \p opcode2, in either order.
 */
static bool inst_combination(
	struct rc_instruction * inst1,
	struct rc_instruction * inst2,
	rc_opcode opcode1,
	rc_opcode opcode2)
{
	const unsigned int first = inst1->U.I.Opcode;
	const unsigned int second = inst2->U.I.Opcode;

	if (first == opcode1 && second == opcode2)
		return true;
	return second == opcode1 && first == opcode2;
}
1229 
1230 /**
1231  * Searches for instructions writing different channels of the same register that could
1232  * be merged together with the use of constant swizzles.
1233  *
1234  * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1235  */
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1236 static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst)
1237 {
1238 	unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1239 	unsigned int orig_dst_file = inst->U.I.DstReg.File;
1240 	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1241 	const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1242 
1243 	struct rc_instruction * cur = inst;
1244 	while (cur!= &c->Program.Instructions) {
1245 		cur = cur->Next;
1246 		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1247 
1248 		/* Keep it simple for now and stop when encountering any
1249 		 * control flow.
1250 		 */
1251 		if (opcode->IsFlowControl)
1252 			return;
1253 
1254 		/* Stop when the original destination is overwritten */
1255 		if (orig_dst_reg == cur->U.I.DstReg.Index &&
1256 			orig_dst_file == cur->U.I.DstReg.File &&
1257 			(orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1258 			return;
1259 
1260 		/* Stop the search when the original instruction destination
1261 		 * is used as a source for anything.
1262 		 */
1263 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1264 			if (cur->U.I.SrcReg[i].File == orig_dst_file &&
1265 				cur->U.I.SrcReg[i].Index == orig_dst_reg)
1266 				return;
1267 		}
1268 
1269 		/* Stop the search when some of the original sources are touched. */
1270 		for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1271 			if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1272 				inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1273 				return;
1274 		}
1275 
1276 		if (cur->U.I.DstReg.File == orig_dst_file &&
1277 			cur->U.I.DstReg.Index == orig_dst_reg &&
1278 			cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1279 			(cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1280 
1281 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1282 				if (merge_movs(c, inst, cur))
1283 					return;
1284 			}
1285 
1286 			/* Skip the merge if one of the instructions writes just w channel
1287 			 * and we are compiling a fragment shader. We can pair-schedule it together
1288 			 * later anyway and it will also give the scheduler a bit more flexibility.
1289 			 * Only check this after merging MOVs as when we manage to merge two MOVs
1290 			 * into another MOV we can still copy propagate it away. So it is a win in
1291 			 * that case.
1292 			 */
1293 			if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W ||
1294 				inst->U.I.DstReg.WriteMask == RC_MASK_W))
1295 				continue;
1296 
1297 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1298 				inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1299 				if (merge_mov_add_mul(c, inst, cur))
1300 					return;
1301 			}
1302 
1303 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1304 				if (merge_mov_mad(c, inst, cur))
1305 					return;
1306 			}
1307 		}
1308 	}
1309 }
1310 
1311 /**
1312  * Searches for duplicate ARLs/ARRs
1313  *
1314  * Only a very trivial case is now optimized where if a second one is detected which reads from
1315  * the same register as the first one and source is the same, just remove the second one.
1316  */
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1317 static void merge_A0_loads(
1318 	struct radeon_compiler * c,
1319 	struct rc_instruction * inst,
1320 	bool is_ARL)
1321 {
1322 	unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1323 	unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1324 	unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1325 	int cf_depth = 0;
1326 
1327 	struct rc_instruction * cur = inst;
1328 	while (cur != &c->Program.Instructions) {
1329 		cur = cur->Next;
1330 		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1331 
1332 		/* Keep it simple for now and stop when encountering any
1333 		 * control flow besides simple ifs.
1334 		 */
1335 		if (opcode->IsFlowControl) {
1336 			switch (cur->U.I.Opcode) {
1337 			case RC_OPCODE_IF:
1338 			{
1339 				cf_depth++;
1340 				break;
1341 			}
1342 			case RC_OPCODE_ELSE:
1343 			{
1344 				if (cf_depth < 1)
1345 					return;
1346 				break;
1347 			}
1348 			case RC_OPCODE_ENDIF:
1349 			{
1350                                 cf_depth--;
1351                                 break;
1352 			}
1353 			default:
1354 				return;
1355 			}
1356 		}
1357 
1358 		/* Stop when the original source is overwritten */
1359 		if (A0_src_reg == cur->U.I.DstReg.Index &&
1360 			A0_src_file == cur->U.I.DstReg.File &&
1361 			cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1362 			return;
1363 
1364 		/* Wrong A0 load type. */
1365 		if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1366 		    (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1367 			return;
1368 
1369 		if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1370 			if (A0_src_reg == cur->U.I.SrcReg[0].Index &&
1371 			    A0_src_file == cur->U.I.SrcReg[0].File &&
1372 			    A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1373 				struct rc_instruction * next = cur->Next;
1374 				rc_remove_instruction(cur);
1375 				cur = next;
1376 			} else {
1377 				return;
1378 			}
1379 		}
1380 	}
1381 }
1382 
1383 /**
1384  * According to the GLSL spec, round is only 1.30 and up
1385  * so the only reason why we should ever see round is if it actually
1386  * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1387  * the ARR instead of lowering the round.
1388  */
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1389 static void transform_vertex_ROUND(struct radeon_compiler* c,
1390 	struct rc_instruction* inst)
1391 {
1392 	struct rc_reader_data readers;
1393 	rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1394 
1395 	assert(readers.ReaderCount > 0);
1396 	for (unsigned i = 0; i < readers.ReaderCount; i++) {
1397 		struct rc_instruction *reader = readers.Readers[i].Inst;
1398 		if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1399 			assert(!"Unable to convert ROUND+ARL to ARR\n");
1400 			return;
1401 		}
1402 	}
1403 
1404 	/* Only ARL readers, convert all to ARR */
1405 	for (unsigned i = 0; i < readers.ReaderCount; i++) {
1406 		readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1407 	}
1408 	/* Switch ROUND to MOV and let copy propagate sort it out later. */
1409 	inst->U.I.Opcode = RC_OPCODE_MOV;
1410 }
1411 
1412 /**
1413  * Apply various optimizations specific to the A0 adress register loads.
1414  */
optimize_A0_loads(struct radeon_compiler * c)1415 static void optimize_A0_loads(struct radeon_compiler * c) {
1416 	struct rc_instruction * inst = c->Program.Instructions.Next;
1417 
1418 	while (inst != &c->Program.Instructions) {
1419 		struct rc_instruction * cur = inst;
1420 		inst = inst->Next;
1421 		if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1422 			merge_A0_loads(c, cur, true);
1423 		} else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1424 			merge_A0_loads(c, cur, false);
1425 		} else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1426 			transform_vertex_ROUND(c, cur);
1427 		}
1428 	}
1429 }
1430 
rc_optimize(struct radeon_compiler * c,void * user)1431 void rc_optimize(struct radeon_compiler * c, void *user)
1432 {
1433 	struct rc_instruction * inst = c->Program.Instructions.Next;
1434 	while(inst != &c->Program.Instructions) {
1435 		struct rc_instruction * cur = inst;
1436 		inst = inst->Next;
1437 		constant_folding(c, cur);
1438 	}
1439 
1440 	/* Copy propagate simple movs away. */
1441 	inst = c->Program.Instructions.Next;
1442 	while(inst != &c->Program.Instructions) {
1443 		struct rc_instruction * cur = inst;
1444 		inst = inst->Next;
1445 		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1446 			copy_propagate(c, cur);
1447 		}
1448 	}
1449 
1450 	if (c->type == RC_VERTEX_PROGRAM) {
1451 		optimize_A0_loads(c);
1452 	}
1453 
1454 	/* Merge MOVs to same source in different channels using the constant
1455 	 * swizzle.
1456 	 */
1457 	if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1458 		inst = c->Program.Instructions.Next;
1459 		while(inst != &c->Program.Instructions) {
1460 			struct rc_instruction * cur = inst;
1461 			inst = inst->Next;
1462 			if (cur->U.I.Opcode == RC_OPCODE_MOV ||
1463 				cur->U.I.Opcode == RC_OPCODE_ADD ||
1464 				cur->U.I.Opcode == RC_OPCODE_MAD ||
1465 				cur->U.I.Opcode == RC_OPCODE_MUL)
1466 				merge_channels(c, cur);
1467 		}
1468 	}
1469 
1470 	/* Copy propagate few extra movs from the merge_channels pass. */
1471 	inst = c->Program.Instructions.Next;
1472 	while(inst != &c->Program.Instructions) {
1473 		struct rc_instruction * cur = inst;
1474 		inst = inst->Next;
1475 		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1476 			copy_propagate(c, cur);
1477 		}
1478 	}
1479 
1480 	if (c->type != RC_FRAGMENT_PROGRAM) {
1481 		return;
1482 	}
1483 
1484 	/* Presubtract operations. */
1485 	inst = c->Program.Instructions.Next;
1486 	while(inst != &c->Program.Instructions) {
1487 		struct rc_instruction * cur = inst;
1488 		inst = inst->Next;
1489 		peephole(c, cur);
1490 	}
1491 
1492 	/* Output modifiers. */
1493 	inst = c->Program.Instructions.Next;
1494 	struct rc_list * var_list = NULL;
1495 	while(inst != &c->Program.Instructions) {
1496 		struct rc_instruction * cur = inst;
1497 		inst = inst->Next;
1498 		if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1499 			if (!var_list)
1500 				var_list = rc_get_variables(c);
1501 			if (peephole_mul_omod(c, cur, var_list))
1502 				var_list = NULL;
1503 		}
1504 	}
1505 }
1506