1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 
23 #include "radeon_compiler.h"
24 
25 #include <stdio.h>
26 
27 #include "r300_reg.h"
28 
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program.h"
32 #include "radeon_program_alu.h"
33 #include "radeon_swizzle.h"
34 #include "radeon_emulate_branches.h"
35 #include "radeon_emulate_loops.h"
36 #include "radeon_remove_constants.h"
37 
/*
 * Build a source operand that reuses the register index, register class and
 * relative-addressing bit of the already-setup, valid source operand `x` of
 * the current instruction, but selects swizzle `y` on all four components
 * and applies no negation.  Used to obtain a constant ZERO or ONE source.
 *
 * The expansion refers to `vp` and `vpi`, so both must be in scope wherever
 * the macro is used.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) \
	 | (vpi->SrcReg[x].RelAddr << 4))
50 
51 
t_dst_mask(unsigned int mask)52 static unsigned long t_dst_mask(unsigned int mask)
53 {
54 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
55 	return mask & RC_MASK_XYZW;
56 }
57 
/**
 * Map a compiler register file onto the hardware destination register class.
 *
 * Output and address registers have dedicated classes; everything else is
 * emitted as a temporary.  A file that is none of the three known ones is
 * reported on stderr before falling back to the temporary class.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;

	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_DST_REG_TEMPORARY;
}
72 
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)73 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
74 				 struct rc_dst_register *dst)
75 {
76 	if (dst->File == RC_FILE_OUTPUT)
77 		return vp->outputs[dst->Index];
78 
79 	return dst->Index;
80 }
81 
/**
 * Map a compiler register file onto the hardware source register class.
 *
 * Inputs and constants have dedicated classes.  RC_FILE_NONE and
 * RC_FILE_TEMPORARY both map to the temporary class; any other file is
 * reported on stderr and then also treated as a temporary.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;

	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_SRC_REG_TEMPORARY;
}
97 
t_src_conflict(struct rc_src_register a,struct rc_src_register b)98 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
99 {
100 	unsigned long aclass = t_src_class(a.File);
101 	unsigned long bclass = t_src_class(b.File);
102 
103 	if (aclass != bclass)
104 		return 0;
105 	if (aclass == PVS_SRC_REG_TEMPORARY)
106 		return 0;
107 
108 	if (a.RelAddr || b.RelAddr)
109 		return 1;
110 	if (a.Index != b.Index)
111 		return 1;
112 
113 	return 0;
114 }
115 
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is in fact a NOP: the Mesa RC_SWIZZLE_* values are all identical to
 * VSF_IN_COMPONENT_*, so the value is only widened to the return type.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	return (unsigned long)swizzle;
}
121 
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)122 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
123 				 struct rc_src_register *src)
124 {
125 	if (src->File == RC_FILE_INPUT) {
126 		assert(vp->inputs[src->Index] != -1);
127 		return vp->inputs[src->Index];
128 	} else {
129 		if (src->Index < 0) {
130 			fprintf(stderr,
131 				"negative offsets for indirect addressing do not work.\n");
132 			return 0;
133 		}
134 		return src->Index;
135 	}
136 }
137 
138 /* these two functions should probably be merged... */
139 
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)140 static unsigned long t_src(struct r300_vertex_program_code *vp,
141 			   struct rc_src_register *src)
142 {
143 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
144 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
145 	 */
146 	return PVS_SRC_OPERAND(t_src_index(vp, src),
147 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
148 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
149 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
150 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
151 			       t_src_class(src->File),
152 			       src->Negate) |
153 	       (src->RelAddr << 4) | (src->Abs << 3);
154 }
155 
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)156 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
157 				  struct rc_src_register *src)
158 {
159 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
160 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
161 	 */
162 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
163 
164 	return PVS_SRC_OPERAND(t_src_index(vp, src),
165 			       t_swizzle(swz),
166 			       t_swizzle(swz),
167 			       t_swizzle(swz),
168 			       t_swizzle(swz),
169 			       t_src_class(src->File),
170 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
171 	       (src->RelAddr << 4) | (src->Abs << 3);
172 }
173 
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)174 static int valid_dst(struct r300_vertex_program_code *vp,
175 			   struct rc_dst_register *dst)
176 {
177 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
178 		return 0;
179 	} else if (dst->File == RC_FILE_ADDRESS) {
180 		assert(dst->Index == 0);
181 	}
182 
183 	return 1;
184 }
185 
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)186 static void ei_vector1(struct r300_vertex_program_code *vp,
187 				unsigned int hw_opcode,
188 				struct rc_sub_instruction *vpi,
189 				unsigned int * inst)
190 {
191 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
192 				     0,
193 				     0,
194 				     t_dst_index(vp, &vpi->DstReg),
195 				     t_dst_mask(vpi->DstReg.WriteMask),
196 				     t_dst_class(vpi->DstReg.File),
197                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
198 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
199 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
200 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
201 }
202 
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)203 static void ei_vector2(struct r300_vertex_program_code *vp,
204 				unsigned int hw_opcode,
205 				struct rc_sub_instruction *vpi,
206 				unsigned int * inst)
207 {
208 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
209 				     0,
210 				     0,
211 				     t_dst_index(vp, &vpi->DstReg),
212 				     t_dst_mask(vpi->DstReg.WriteMask),
213 				     t_dst_class(vpi->DstReg.File),
214                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
215 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
216 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
217 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
218 }
219 
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)220 static void ei_math1(struct r300_vertex_program_code *vp,
221 				unsigned int hw_opcode,
222 				struct rc_sub_instruction *vpi,
223 				unsigned int * inst)
224 {
225 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
226 				     1,
227 				     0,
228 				     t_dst_index(vp, &vpi->DstReg),
229 				     t_dst_mask(vpi->DstReg.WriteMask),
230 				     t_dst_class(vpi->DstReg.File),
231                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
232 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
233 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
234 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
235 }
236 
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)237 static void ei_lit(struct r300_vertex_program_code *vp,
238 				      struct rc_sub_instruction *vpi,
239 				      unsigned int * inst)
240 {
241 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
242 
243 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
244 				     1,
245 				     0,
246 				     t_dst_index(vp, &vpi->DstReg),
247 				     t_dst_mask(vpi->DstReg.WriteMask),
248 				     t_dst_class(vpi->DstReg.File),
249                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
250 	/* NOTE: Users swizzling might not work. */
251 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
252 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
253 				  PVS_SRC_SELECT_FORCE_0,	// Z
254 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
255 				  t_src_class(vpi->SrcReg[0].File),
256 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
257 	    (vpi->SrcReg[0].RelAddr << 4);
258 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
259 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
260 				  PVS_SRC_SELECT_FORCE_0,	// Z
261 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
262 				  t_src_class(vpi->SrcReg[0].File),
263 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
264 	    (vpi->SrcReg[0].RelAddr << 4);
265 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
266 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
267 				  PVS_SRC_SELECT_FORCE_0,	// Z
268 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
269 				  t_src_class(vpi->SrcReg[0].File),
270 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
271 	    (vpi->SrcReg[0].RelAddr << 4);
272 }
273 
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)274 static void ei_mad(struct r300_vertex_program_code *vp,
275 				      struct rc_sub_instruction *vpi,
276 				      unsigned int * inst)
277 {
278 	unsigned int i;
279 	/* Remarks about hardware limitations of MAD
280 	 * (please preserve this comment, as this information is _NOT_
281 	 * in the documentation provided by AMD).
282 	 *
283 	 * As described in the documentation, MAD with three unique temporary
284 	 * source registers requires the use of the macro version.
285 	 *
286 	 * However (and this is not mentioned in the documentation), apparently
287 	 * the macro version is _NOT_ a full superset of the normal version.
288 	 * In particular, the macro version does not always work when relative
289 	 * addressing is used in the source operands.
290 	 *
291 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
292 	 * assembly shader path when using medium quality animations
293 	 * (i.e. animations with matrix blending instead of quaternion blending).
294 	 *
295 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
296 	 * test for this issue - for some reason, it is possible to have vertex
297 	 * programs whose prefix is *exactly* the same as the prefix of the
298 	 * offending program in Sauerbraten up to the offending instruction
299 	 * without causing any trouble.
300 	 *
301 	 * Bottom line: Only use the macro version only when really necessary;
302 	 * according to AMD docs, this should improve performance by one clock
303 	 * as a nice side bonus.
304 	 */
305 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
306 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
307 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
308 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
309 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
310 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
311 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
312 				0,
313 				1,
314 				t_dst_index(vp, &vpi->DstReg),
315 				t_dst_mask(vpi->DstReg.WriteMask),
316 				t_dst_class(vpi->DstReg.File),
317                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
318 	} else {
319 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
320 				0,
321 				0,
322 				t_dst_index(vp, &vpi->DstReg),
323 				t_dst_mask(vpi->DstReg.WriteMask),
324 				t_dst_class(vpi->DstReg.File),
325                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
326 
327 		/* Arguments with constant swizzles still count as a unique
328 		 * temporary, so we should make sure these arguments share a
329 		 * register index with one of the other arguments. */
330 		for (i = 0; i < 3; i++) {
331 			unsigned int j;
332 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
333 				continue;
334 
335 			for (j = 0; j < 3; j++) {
336 				if (i != j) {
337 					vpi->SrcReg[i].Index =
338 						vpi->SrcReg[j].Index;
339 					break;
340 				}
341 			}
342 		}
343 	}
344 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
345 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
346 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
347 }
348 
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)349 static void ei_pow(struct r300_vertex_program_code *vp,
350 				      struct rc_sub_instruction *vpi,
351 				      unsigned int * inst)
352 {
353 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
354 				     1,
355 				     0,
356 				     t_dst_index(vp, &vpi->DstReg),
357 				     t_dst_mask(vpi->DstReg.WriteMask),
358 				     t_dst_class(vpi->DstReg.File),
359                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
360 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
361 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
362 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
363 }
364 
translate_vertex_program(struct radeon_compiler * c,void * user)365 static void translate_vertex_program(struct radeon_compiler *c, void *user)
366 {
367 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
368 	struct rc_instruction *rci;
369 
370 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
371 	unsigned loop_depth = 0;
372 
373 	compiler->code->pos_end = 0;	/* Not supported yet */
374 	compiler->code->length = 0;
375 	compiler->code->num_temporaries = 0;
376 
377 	compiler->SetHwInputOutput(compiler);
378 
379 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
380 		struct rc_sub_instruction *vpi = &rci->U.I;
381 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
382 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
383 
384 		/* Skip instructions writing to non-existing destination */
385 		if (!valid_dst(compiler->code, &vpi->DstReg))
386 			continue;
387 
388 		if (info->HasDstReg) {
389 			/* Neither is Saturate. */
390 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
391 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
392 					 "modifier (yet).\n");
393 			}
394 		}
395 
396 		if (compiler->code->length >= c->max_alu_insts * 4) {
397 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
398 			return;
399 		}
400 
401 		assert(compiler->Base.is_r500 ||
402 		       (vpi->Opcode != RC_OPCODE_SEQ &&
403 			vpi->Opcode != RC_OPCODE_SNE));
404 
405 		switch (vpi->Opcode) {
406 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
407 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
408 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
409 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
410 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
411 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
412 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
413 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
414 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
415 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
416 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
417 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
418 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
419 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
420 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
421 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
422 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
423 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
424 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
425 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
426 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
427 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
428 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
429 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
430 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
431 		case RC_OPCODE_BGNLOOP:
432 		{
433 			if ((!compiler->Base.is_r500
434 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
435 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
436 				rc_error(&compiler->Base,
437 						"Loops are nested too deep.");
438 				return;
439 			}
440 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
441 			break;
442 		}
443 		case RC_OPCODE_ENDLOOP:
444 		{
445 			unsigned int act_addr;
446 			unsigned int last_addr;
447 			unsigned int ret_addr;
448 
449 			ret_addr = loops[--loop_depth];
450 			act_addr = ret_addr - 1;
451 			last_addr = (compiler->code->length / 4) - 1;
452 
453 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
454 				rc_error(&compiler->Base,
455 					"Too many flow control instructions.");
456 				return;
457 			}
458 			if (compiler->Base.is_r500) {
459 				compiler->code->fc_op_addrs.r500
460 					[compiler->code->num_fc_ops].lw =
461 					R500_PVS_FC_ACT_ADRS(act_addr)
462 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
463 					;
464 				compiler->code->fc_op_addrs.r500
465 					[compiler->code->num_fc_ops].uw =
466 					R500_PVS_FC_LAST_INST(last_addr)
467 					| R500_PVS_FC_RTN_INST(ret_addr)
468 					;
469 			} else {
470 				compiler->code->fc_op_addrs.r300
471 					[compiler->code->num_fc_ops] =
472 					R300_PVS_FC_ACT_ADRS(act_addr)
473 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
474 					| R300_PVS_FC_LAST_INST(last_addr)
475 					| R300_PVS_FC_RTN_INST(ret_addr)
476 					;
477 			}
478 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
479 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
480 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
481 				;
482 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
483 						compiler->code->num_fc_ops);
484 			compiler->code->num_fc_ops++;
485 
486 			break;
487 		}
488 
489 		case RC_ME_PRED_SET_CLR:
490 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
491 			break;
492 
493 		case RC_ME_PRED_SET_INV:
494 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
495 			break;
496 
497 		case RC_ME_PRED_SET_POP:
498 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
499 			break;
500 
501 		case RC_ME_PRED_SET_RESTORE:
502 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
503 			break;
504 
505 		case RC_ME_PRED_SEQ:
506 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
507 			break;
508 
509 		case RC_ME_PRED_SNEQ:
510 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
511 			break;
512 
513 		case RC_VE_PRED_SNEQ_PUSH:
514 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
515 								vpi, inst);
516 			break;
517 
518 		default:
519 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
520 			return;
521 		}
522 
523 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
524 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
525 						<< PVS_DST_PRED_ENABLE_SHIFT);
526 			if (vpi->DstReg.Pred == RC_PRED_SET) {
527 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
528 						<< PVS_DST_PRED_SENSE_SHIFT);
529 			}
530 		}
531 
532 		/* Update the number of temporaries. */
533 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
534 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
535 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
536 
537 		for (unsigned i = 0; i < info->NumSrcRegs; i++)
538 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
539 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
540 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
541 
542 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
543 			rc_error(&compiler->Base, "Too many temporaries.\n");
544 			return;
545 		}
546 
547 		compiler->code->length += 4;
548 
549 		if (compiler->Base.Error)
550 			return;
551 	}
552 }
553 
/**
 * Bookkeeping for one original temporary register while
 * allocate_temporary_registers() maps it onto a hardware temporary.
 */
struct temporary_allocation {
	/** Set once a hardware temporary has been assigned. */
	unsigned int Allocated : 1;
	/** Index of the assigned hardware temporary. */
	unsigned int HwTemp : 15;
	/** Instruction after which the hardware temporary may be reused. */
	struct rc_instruction *LastRead;
};
559 
allocate_temporary_registers(struct radeon_compiler * c,void * user)560 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
561 {
562 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
563 	struct rc_instruction *inst;
564 	struct rc_instruction *end_loop = NULL;
565 	unsigned int num_orig_temps = 0;
566 	char hwtemps[RC_REGISTER_MAX_INDEX];
567 	struct temporary_allocation * ta;
568 	unsigned int i, j;
569 
570 	memset(hwtemps, 0, sizeof(hwtemps));
571 
572 	rc_recompute_ips(c);
573 
574 	/* Pass 1: Count original temporaries. */
575 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
576 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
577 
578 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
579 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
580 				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
581 					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
582 			}
583 		}
584 
585 		if (opcode->HasDstReg) {
586 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
587 				if (inst->U.I.DstReg.Index >= num_orig_temps)
588 					num_orig_temps = inst->U.I.DstReg.Index + 1;
589 			}
590 		}
591 	}
592 
593 	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
594 			sizeof(struct temporary_allocation) * num_orig_temps);
595 	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
596 
597 	/* Pass 2: Determine original temporary lifetimes */
598 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
599 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
600 		/* Instructions inside of loops need to use the ENDLOOP
601 		 * instruction as their LastRead. */
602 		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
603 			int endloops = 1;
604 			struct rc_instruction * ptr;
605 			for(ptr = inst->Next;
606 				ptr != &compiler->Base.Program.Instructions;
607 							ptr = ptr->Next){
608 				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
609 					endloops++;
610 				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
611 					endloops--;
612 					if (endloops <= 0) {
613 						end_loop = ptr;
614 						break;
615 					}
616 				}
617 			}
618 		}
619 
620 		if (inst == end_loop) {
621 			end_loop = NULL;
622 			continue;
623 		}
624 
625 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
626 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
627 				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
628 			}
629 		}
630 	}
631 
632 	/* Pass 3: Register allocation */
633 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
634 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
635 
636 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
637 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
638 				unsigned int orig = inst->U.I.SrcReg[i].Index;
639 				inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
640 
641 				if (ta[orig].Allocated && inst == ta[orig].LastRead)
642 					hwtemps[ta[orig].HwTemp] = 0;
643 			}
644 		}
645 
646 		if (opcode->HasDstReg) {
647 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
648 				unsigned int orig = inst->U.I.DstReg.Index;
649 
650 				if (!ta[orig].Allocated) {
651 					for(j = 0; j < c->max_temp_regs; ++j) {
652 						if (!hwtemps[j])
653 							break;
654 					}
655 					ta[orig].Allocated = 1;
656 					ta[orig].HwTemp = j;
657 					hwtemps[ta[orig].HwTemp] = 1;
658 				}
659 
660 				inst->U.I.DstReg.Index = ta[orig].HwTemp;
661 			}
662 		}
663 	}
664 }
665 
666 /**
667  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
668  * and the Saturate opcode modifier. Only Absolute is currently transformed.
669  */
transform_nonnative_modifiers(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)670 static int transform_nonnative_modifiers(
671 	struct radeon_compiler *c,
672 	struct rc_instruction *inst,
673 	void* unused)
674 {
675 	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
676 	unsigned i;
677 
678 	/* Transform ABS(a) to MAX(a, -a). */
679 	for (i = 0; i < opcode->NumSrcRegs; i++) {
680 		if (inst->U.I.SrcReg[i].Abs) {
681 			struct rc_instruction *new_inst;
682 			unsigned temp;
683 
684 			inst->U.I.SrcReg[i].Abs = 0;
685 
686 			temp = rc_find_free_temporary(c);
687 
688 			new_inst = rc_insert_new_instruction(c, inst->Prev);
689 			new_inst->U.I.Opcode = RC_OPCODE_MAX;
690 			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
691 			new_inst->U.I.DstReg.Index = temp;
692 			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
693 			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
694 			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
695 
696 			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
697 			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
698 			inst->U.I.SrcReg[i].Index = temp;
699 			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
700 		}
701 	}
702 	return 1;
703 }
704 
705 /**
706  * Vertex engine cannot read two inputs or two constants at the same time.
707  * Introduce intermediate MOVs to temporary registers to account for this.
708  */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)709 static int transform_source_conflicts(
710 	struct radeon_compiler *c,
711 	struct rc_instruction* inst,
712 	void* unused)
713 {
714 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
715 
716 	if (opcode->NumSrcRegs == 3) {
717 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
718 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
719 			int tmpreg = rc_find_free_temporary(c);
720 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
721 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
722 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
723 			inst_mov->U.I.DstReg.Index = tmpreg;
724 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
725 
726 			reset_srcreg(&inst->U.I.SrcReg[2]);
727 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
728 			inst->U.I.SrcReg[2].Index = tmpreg;
729 		}
730 	}
731 
732 	if (opcode->NumSrcRegs >= 2) {
733 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
734 			int tmpreg = rc_find_free_temporary(c);
735 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
736 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
737 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
738 			inst_mov->U.I.DstReg.Index = tmpreg;
739 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
740 
741 			reset_srcreg(&inst->U.I.SrcReg[1]);
742 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
743 			inst->U.I.SrcReg[1].Index = tmpreg;
744 		}
745 	}
746 
747 	return 1;
748 }
749 
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)750 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
751 {
752 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
753 	int i;
754 
755 	for(i = 0; i < 32; ++i) {
756 		if ((compiler->RequiredOutputs & (1 << i)) &&
757 		    !(compiler->Base.Program.OutputsWritten & (1 << i))) {
758 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
759 			inst->U.I.Opcode = RC_OPCODE_MOV;
760 
761 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
762 			inst->U.I.DstReg.Index = i;
763 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
764 
765 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
766 			inst->U.I.SrcReg[0].Index = 0;
767 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
768 
769 			compiler->Base.Program.OutputsWritten |= 1 << i;
770 		}
771 	}
772 }
773 
dataflow_outputs_mark_used(void * userdata,void * data,void (* callback)(void *,unsigned int,unsigned int))774 static void dataflow_outputs_mark_used(void * userdata, void * data,
775 		void (*callback)(void *, unsigned int, unsigned int))
776 {
777 	struct r300_vertex_program_compiler * c = userdata;
778 	int i;
779 
780 	for(i = 0; i < 32; ++i) {
781 		if (c->RequiredOutputs & (1 << i))
782 			callback(data, i, RC_MASK_XYZW);
783 	}
784 }
785 
/**
 * Swizzle capability callback: reports every swizzle as native.
 *
 * Both arguments are ignored.
 *
 * @return always 1.
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	/* Intentionally unused. */
	(void) reg;
	(void) opcode;

	return 1;
}
793 
transform_negative_addressing(struct r300_vertex_program_compiler * c,struct rc_instruction * arl,struct rc_instruction * end,int min_offset)794 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
795 					  struct rc_instruction *arl,
796 					  struct rc_instruction *end,
797 					  int min_offset)
798 {
799 	struct rc_instruction *inst, *add;
800 	unsigned const_swizzle;
801 
802 	/* Transform ARL/ARR */
803 	add = rc_insert_new_instruction(&c->Base, arl->Prev);
804 	add->U.I.Opcode = RC_OPCODE_ADD;
805 	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
806 	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
807 	add->U.I.DstReg.WriteMask = RC_MASK_X;
808 	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
809 	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
810 	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
811 								     min_offset, &const_swizzle);
812 	add->U.I.SrcReg[1].Swizzle = const_swizzle;
813 
814 	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
815 	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
816 	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
817 
818 	/* Rewrite offsets up to and excluding inst. */
819 	for (inst = arl->Next; inst != end; inst = inst->Next) {
820 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
821 
822 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
823 			if (inst->U.I.SrcReg[i].RelAddr)
824 				inst->U.I.SrcReg[i].Index -= min_offset;
825 	}
826 }
827 
rc_emulate_negative_addressing(struct radeon_compiler * compiler,void * user)828 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
829 {
830 	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
831 	struct rc_instruction *inst, *lastARL = NULL;
832 	int min_offset = 0;
833 
834 	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
835 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
836 
837 		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
838 			if (lastARL != NULL && min_offset < 0)
839 				transform_negative_addressing(c, lastARL, inst, min_offset);
840 
841 			lastARL = inst;
842 			min_offset = 0;
843 			continue;
844 		}
845 
846 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
847 			if (inst->U.I.SrcReg[i].RelAddr &&
848 			    inst->U.I.SrcReg[i].Index < 0) {
849 				/* ARL must precede any indirect addressing. */
850 				if (!lastARL) {
851 					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
852 					return;
853 				}
854 
855 				if (inst->U.I.SrcReg[i].Index < min_offset)
856 					min_offset = inst->U.I.SrcReg[i].Index;
857 			}
858 		}
859 	}
860 
861 	if (lastARL != NULL && min_offset < 0)
862 		transform_negative_addressing(c, lastARL, inst, min_offset);
863 }
864 
865 struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
866 	.IsNative = &swizzle_is_native,
867 	.Split = 0 /* should never be called */
868 };
869 
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)870 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
871 {
872 	int is_r500 = c->Base.is_r500;
873 	int opt = !c->Base.disable_optimizations;
874 
875 	/* Lists of instruction transformations. */
876 	struct radeon_program_transformation alu_rewrite_r500[] = {
877 		{ &r300_transform_vertex_alu, 0 },
878 		{ &r300_transform_trig_scale_vertex, 0 },
879 		{ 0, 0 }
880 	};
881 
882 	struct radeon_program_transformation alu_rewrite_r300[] = {
883 		{ &r300_transform_vertex_alu, 0 },
884 		{ &r300_transform_trig_simple, 0 },
885 		{ 0, 0 }
886 	};
887 
888 	/* Note: These passes have to be done seperately from ALU rewrite,
889 	 * otherwise non-native ALU instructions with source conflits
890 	 * or non-native modifiers will not be treated properly.
891 	 */
892 	struct radeon_program_transformation emulate_modifiers[] = {
893 		{ &transform_nonnative_modifiers, 0 },
894 		{ 0, 0 }
895 	};
896 
897 	struct radeon_program_transformation resolve_src_conflicts[] = {
898 		{ &transform_source_conflicts, 0 },
899 		{ 0, 0 }
900 	};
901 
902 	/* List of compiler passes. */
903 	struct radeon_compiler_pass vs_list[] = {
904 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
905 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
906 		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
907 		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
908 		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
909 		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
910 		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
911 		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
912 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
913 		/* This pass must be done after optimizations. */
914 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
915 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
916 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
917 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
918 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
919 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
920 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
921 		{NULL, 0, 0, NULL, NULL}
922 	};
923 
924 	c->Base.type = RC_VERTEX_PROGRAM;
925 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
926 
927 	rc_run_compiler(&c->Base, vs_list);
928 
929 	c->code->InputsRead = c->Base.Program.InputsRead;
930 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
931 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
932 }
933