• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 
23 #include "radeon_compiler.h"
24 
25 #include <stdbool.h>
26 #include <stdio.h>
27 
28 #include "r300_reg.h"
29 
30 #include "radeon_compiler_util.h"
31 #include "radeon_dataflow.h"
32 #include "radeon_program.h"
33 #include "radeon_program_alu.h"
34 #include "radeon_swizzle.h"
35 #include "radeon_emulate_branches.h"
36 #include "radeon_remove_constants.h"
37 
38 #include "util/compiler.h"
39 
/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * x: index into vpi->SrcReg[] of the source whose register index, register
 *    class and relative-addressing bit are reused.
 * y: swizzle selector that is replicated into all four components.
 *
 * The resulting operand uses RC_MASK_NONE as its negate mask.
 *
 * NOTE: this macro is not hygienic - it expands to references to variables
 * named `vp` and `vpi`, which must be in scope at the point of use.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[x].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
52 
53 
t_dst_mask(unsigned int mask)54 static unsigned long t_dst_mask(unsigned int mask)
55 {
56 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
57 	return mask & RC_MASK_XYZW;
58 }
59 
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Any file other than TEMPORARY, OUTPUT or ADDRESS is reported on stderr and
 * then treated as a temporary.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	switch (file) {
	case RC_FILE_TEMPORARY:
		return PVS_DST_REG_TEMPORARY;
	case RC_FILE_OUTPUT:
		return PVS_DST_REG_OUT;
	case RC_FILE_ADDRESS:
		return PVS_DST_REG_A0;
	default:
		break;
	}

	/* Unknown file: complain, then fall back to the temporary class. */
	fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
	return PVS_DST_REG_TEMPORARY;
}
74 
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)75 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
76 				 struct rc_dst_register *dst)
77 {
78 	if (dst->File == RC_FILE_OUTPUT)
79 		return vp->outputs[dst->Index];
80 
81 	return dst->Index;
82 }
83 
/**
 * Map a compiler register file to the hardware source register class.
 *
 * RC_FILE_NONE shares the temporary class. Any file that is not handled is
 * reported on stderr and then treated as a temporary as well.
 */
static unsigned long t_src_class(rc_register_file file)
{
	switch (file) {
	case RC_FILE_INPUT:
		return PVS_SRC_REG_INPUT;
	case RC_FILE_CONSTANT:
		return PVS_SRC_REG_CONSTANT;
	case RC_FILE_NONE:
	case RC_FILE_TEMPORARY:
		return PVS_SRC_REG_TEMPORARY;
	default:
		break;
	}

	/* Unknown file: complain, then fall back to the temporary class. */
	fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
	return PVS_SRC_REG_TEMPORARY;
}
99 
t_src_conflict(struct rc_src_register a,struct rc_src_register b)100 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
101 {
102 	unsigned long aclass = t_src_class(a.File);
103 	unsigned long bclass = t_src_class(b.File);
104 
105 	if (aclass != bclass)
106 		return 0;
107 	if (aclass == PVS_SRC_REG_TEMPORARY)
108 		return 0;
109 
110 	if (a.RelAddr || b.RelAddr)
111 		return 1;
112 	if (a.Index != b.Index)
113 		return 1;
114 
115 	return 0;
116 }
117 
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is in fact a NOP, as the Mesa RC_SWIZZLE_* values are all identical to
 * VSF_IN_COMPONENT_*; the value is returned unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
123 
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)124 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
125 				 struct rc_src_register *src)
126 {
127 	if (src->File == RC_FILE_INPUT) {
128 		assert(vp->inputs[src->Index] != -1);
129 		return vp->inputs[src->Index];
130 	} else {
131 		if (src->Index < 0) {
132 			fprintf(stderr,
133 				"negative offsets for indirect addressing do not work.\n");
134 			return 0;
135 		}
136 		return src->Index;
137 	}
138 }
139 
140 /* these two functions should probably be merged... */
141 
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)142 static unsigned long t_src(struct r300_vertex_program_code *vp,
143 			   struct rc_src_register *src)
144 {
145 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
147 	 */
148 	return PVS_SRC_OPERAND(t_src_index(vp, src),
149 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
150 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
151 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
152 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
153 			       t_src_class(src->File),
154 			       src->Negate) |
155 	       (src->RelAddr << 4) | (src->Abs << 3);
156 }
157 
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)158 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
159 				  struct rc_src_register *src)
160 {
161 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
162 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
163 	 */
164 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
165 
166 	return PVS_SRC_OPERAND(t_src_index(vp, src),
167 			       t_swizzle(swz),
168 			       t_swizzle(swz),
169 			       t_swizzle(swz),
170 			       t_swizzle(swz),
171 			       t_src_class(src->File),
172 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
173 	       (src->RelAddr << 4) | (src->Abs << 3);
174 }
175 
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)176 static int valid_dst(struct r300_vertex_program_code *vp,
177 			   struct rc_dst_register *dst)
178 {
179 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
180 		return 0;
181 	} else if (dst->File == RC_FILE_ADDRESS) {
182 		assert(dst->Index == 0);
183 	}
184 
185 	return 1;
186 }
187 
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)188 static void ei_vector1(struct r300_vertex_program_code *vp,
189 				unsigned int hw_opcode,
190 				struct rc_sub_instruction *vpi,
191 				unsigned int * inst)
192 {
193 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
194 				     0,
195 				     0,
196 				     t_dst_index(vp, &vpi->DstReg),
197 				     t_dst_mask(vpi->DstReg.WriteMask),
198 				     t_dst_class(vpi->DstReg.File),
199                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
200 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
201 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
202 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
203 }
204 
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)205 static void ei_vector2(struct r300_vertex_program_code *vp,
206 				unsigned int hw_opcode,
207 				struct rc_sub_instruction *vpi,
208 				unsigned int * inst)
209 {
210 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
211 				     0,
212 				     0,
213 				     t_dst_index(vp, &vpi->DstReg),
214 				     t_dst_mask(vpi->DstReg.WriteMask),
215 				     t_dst_class(vpi->DstReg.File),
216                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
217 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
218 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
219 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
220 }
221 
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)222 static void ei_math1(struct r300_vertex_program_code *vp,
223 				unsigned int hw_opcode,
224 				struct rc_sub_instruction *vpi,
225 				unsigned int * inst)
226 {
227 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
228 				     1,
229 				     0,
230 				     t_dst_index(vp, &vpi->DstReg),
231 				     t_dst_mask(vpi->DstReg.WriteMask),
232 				     t_dst_class(vpi->DstReg.File),
233                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
234 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
235 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
236 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
237 }
238 
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)239 static void ei_lit(struct r300_vertex_program_code *vp,
240 				      struct rc_sub_instruction *vpi,
241 				      unsigned int * inst)
242 {
243 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
244 
245 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
246 				     1,
247 				     0,
248 				     t_dst_index(vp, &vpi->DstReg),
249 				     t_dst_mask(vpi->DstReg.WriteMask),
250 				     t_dst_class(vpi->DstReg.File),
251                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
252 	/* NOTE: Users swizzling might not work. */
253 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
254 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
255 				  PVS_SRC_SELECT_FORCE_0,	// Z
256 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
257 				  t_src_class(vpi->SrcReg[0].File),
258 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
259 	    (vpi->SrcReg[0].RelAddr << 4);
260 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
261 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
262 				  PVS_SRC_SELECT_FORCE_0,	// Z
263 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
264 				  t_src_class(vpi->SrcReg[0].File),
265 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
266 	    (vpi->SrcReg[0].RelAddr << 4);
267 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
268 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
269 				  PVS_SRC_SELECT_FORCE_0,	// Z
270 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
271 				  t_src_class(vpi->SrcReg[0].File),
272 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
273 	    (vpi->SrcReg[0].RelAddr << 4);
274 }
275 
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)276 static void ei_mad(struct r300_vertex_program_code *vp,
277 				      struct rc_sub_instruction *vpi,
278 				      unsigned int * inst)
279 {
280 	unsigned int i;
281 	/* Remarks about hardware limitations of MAD
282 	 * (please preserve this comment, as this information is _NOT_
283 	 * in the documentation provided by AMD).
284 	 *
285 	 * As described in the documentation, MAD with three unique temporary
286 	 * source registers requires the use of the macro version.
287 	 *
288 	 * However (and this is not mentioned in the documentation), apparently
289 	 * the macro version is _NOT_ a full superset of the normal version.
290 	 * In particular, the macro version does not always work when relative
291 	 * addressing is used in the source operands.
292 	 *
293 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
294 	 * assembly shader path when using medium quality animations
295 	 * (i.e. animations with matrix blending instead of quaternion blending).
296 	 *
297 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
298 	 * test for this issue - for some reason, it is possible to have vertex
299 	 * programs whose prefix is *exactly* the same as the prefix of the
300 	 * offending program in Sauerbraten up to the offending instruction
301 	 * without causing any trouble.
302 	 *
303 	 * Bottom line: Only use the macro version only when really necessary;
304 	 * according to AMD docs, this should improve performance by one clock
305 	 * as a nice side bonus.
306 	 */
307 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
308 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
309 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
310 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
311 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
312 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
313 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
314 				0,
315 				1,
316 				t_dst_index(vp, &vpi->DstReg),
317 				t_dst_mask(vpi->DstReg.WriteMask),
318 				t_dst_class(vpi->DstReg.File),
319                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
320 	} else {
321 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
322 				0,
323 				0,
324 				t_dst_index(vp, &vpi->DstReg),
325 				t_dst_mask(vpi->DstReg.WriteMask),
326 				t_dst_class(vpi->DstReg.File),
327                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
328 
329 		/* Arguments with constant swizzles still count as a unique
330 		 * temporary, so we should make sure these arguments share a
331 		 * register index with one of the other arguments. */
332 		for (i = 0; i < 3; i++) {
333 			unsigned int j;
334 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
335 				continue;
336 
337 			for (j = 0; j < 3; j++) {
338 				if (i != j) {
339 					vpi->SrcReg[i].Index =
340 						vpi->SrcReg[j].Index;
341 					break;
342 				}
343 			}
344 		}
345 	}
346 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
347 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
348 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
349 }
350 
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)351 static void ei_pow(struct r300_vertex_program_code *vp,
352 				      struct rc_sub_instruction *vpi,
353 				      unsigned int * inst)
354 {
355 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
356 				     1,
357 				     0,
358 				     t_dst_index(vp, &vpi->DstReg),
359 				     t_dst_mask(vpi->DstReg.WriteMask),
360 				     t_dst_class(vpi->DstReg.File),
361                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
362 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
363 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
364 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
365 }
366 
translate_vertex_program(struct radeon_compiler * c,void * user)367 static void translate_vertex_program(struct radeon_compiler *c, void *user)
368 {
369 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
370 	struct rc_instruction *rci;
371 
372 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
373 	unsigned loop_depth = 0;
374 	bool last_input_read_at_loop_end = false;
375 	bool last_pos_write_at_loop_end = false;
376 
377 	compiler->code->pos_end = 0;	/* Not supported yet */
378 	compiler->code->length = 0;
379 	compiler->code->num_temporaries = 0;
380 	compiler->code->last_input_read = 0;
381 	compiler->code->last_pos_write = 0;
382 
383 	compiler->SetHwInputOutput(compiler);
384 
385 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
386 		struct rc_sub_instruction *vpi = &rci->U.I;
387 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
388 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
389 
390 		/* Skip instructions writing to non-existing destination */
391 		if (!valid_dst(compiler->code, &vpi->DstReg))
392 			continue;
393 
394 		if (info->HasDstReg) {
395 			/* Neither is Saturate. */
396 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
397 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
398 					 "modifier (yet).\n");
399 			}
400 		}
401 
402 		if (compiler->code->length >= c->max_alu_insts * 4) {
403 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
404 			return;
405 		}
406 
407 		assert(compiler->Base.is_r500 ||
408 		       (vpi->Opcode != RC_OPCODE_SEQ &&
409 			vpi->Opcode != RC_OPCODE_SNE));
410 
411 		switch (vpi->Opcode) {
412 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
413 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
414 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
415 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
416 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
417 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
418 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
419 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
420 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
421 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
422 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
423 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
424 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
425 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
426 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
427 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
428 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
429 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
430 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
431 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
432 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
433 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
434 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
435 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
436 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
437 		case RC_OPCODE_BGNLOOP:
438 		{
439 			if ((!compiler->Base.is_r500
440 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
441 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
442 				rc_error(&compiler->Base,
443 						"Loops are nested too deep.");
444 				return;
445 			}
446 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
447 			break;
448 		}
449 		case RC_OPCODE_ENDLOOP:
450 		{
451 			unsigned int act_addr;
452 			unsigned int last_addr;
453 			unsigned int ret_addr;
454 
455 			if (loop_depth == 1 && last_input_read_at_loop_end) {
456 				compiler->code->last_input_read = compiler->code->length / 4;
457 				last_input_read_at_loop_end = false;
458 			}
459 			if (loop_depth == 1 && last_pos_write_at_loop_end) {
460 				compiler->code->last_pos_write = compiler->code->length / 4;
461 				last_pos_write_at_loop_end = false;
462 			}
463 
464 			ret_addr = loops[--loop_depth];
465 			act_addr = ret_addr - 1;
466 			last_addr = (compiler->code->length / 4) - 1;
467 
468 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
469 				rc_error(&compiler->Base,
470 					"Too many flow control instructions.");
471 				return;
472 			}
473 			if (compiler->Base.is_r500) {
474 				compiler->code->fc_op_addrs.r500
475 					[compiler->code->num_fc_ops].lw =
476 					R500_PVS_FC_ACT_ADRS(act_addr)
477 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
478 					;
479 				compiler->code->fc_op_addrs.r500
480 					[compiler->code->num_fc_ops].uw =
481 					R500_PVS_FC_LAST_INST(last_addr)
482 					| R500_PVS_FC_RTN_INST(ret_addr)
483 					;
484 			} else {
485 				compiler->code->fc_op_addrs.r300
486 					[compiler->code->num_fc_ops] =
487 					R300_PVS_FC_ACT_ADRS(act_addr)
488 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
489 					| R300_PVS_FC_LAST_INST(last_addr)
490 					| R300_PVS_FC_RTN_INST(ret_addr)
491 					;
492 			}
493 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
494 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
495 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
496 				;
497 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
498 						compiler->code->num_fc_ops);
499 			compiler->code->num_fc_ops++;
500 
501 			break;
502 		}
503 
504 		case RC_ME_PRED_SET_CLR:
505 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
506 			break;
507 
508 		case RC_ME_PRED_SET_INV:
509 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
510 			break;
511 
512 		case RC_ME_PRED_SET_POP:
513 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
514 			break;
515 
516 		case RC_ME_PRED_SET_RESTORE:
517 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
518 			break;
519 
520 		case RC_ME_PRED_SEQ:
521 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
522 			break;
523 
524 		case RC_ME_PRED_SNEQ:
525 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
526 			break;
527 
528 		case RC_VE_PRED_SNEQ_PUSH:
529 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
530 								vpi, inst);
531 			break;
532 
533 		default:
534 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
535 			return;
536 		}
537 
538 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
539 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
540 						<< PVS_DST_PRED_ENABLE_SHIFT);
541 			if (vpi->DstReg.Pred == RC_PRED_SET) {
542 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
543 						<< PVS_DST_PRED_SENSE_SHIFT);
544 			}
545 		}
546 
547 		/* Update the number of temporaries. */
548 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
549 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
550 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
551 
552 		/* last instruction that writes position */
553 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
554 		    t_dst_index(compiler->code, &vpi->DstReg) == 0) {
555 			if (loop_depth == 0)
556 				compiler->code->last_pos_write = compiler->code->length / 4;
557 			else
558 				last_pos_write_at_loop_end = true;
559 		}
560 
561 		for (unsigned i = 0; i < info->NumSrcRegs; i++) {
562 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
563 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
564 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
565 			if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
566 				if (loop_depth == 0)
567 					compiler->code->last_input_read = compiler->code->length / 4;
568 				else
569 					last_input_read_at_loop_end = true;
570 			}
571 
572 		}
573 
574 
575 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
576 			rc_error(&compiler->Base, "Too many temporaries.\n");
577 			return;
578 		}
579 
580 		compiler->code->length += 4;
581 
582 		if (compiler->Base.Error)
583 			return;
584 	}
585 }
586 
/* Per-original-temporary bookkeeping used while assigning hardware
 * temporaries (see get_reg() and allocate_temporary_registers()). */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary; valid when Allocated is set. */
	unsigned int HwTemp:15;
	/* Instruction recorded as the last reader of this temporary; for reads
	 * inside a loop this is the loop's ENDLOOP. NULL if never read. */
	struct rc_instruction * LastRead;
};
592 
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)593 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
594                    unsigned int orig)
595 {
596     if (!ta[orig].Allocated) {
597         int j;
598         for (j = 0; j < c->max_temp_regs; ++j)
599         {
600             if (!hwtemps[j])
601                 break;
602         }
603         ta[orig].Allocated = 1;
604         ta[orig].HwTemp = j;
605         hwtemps[ta[orig].HwTemp] = true;
606     }
607 
608     return ta[orig].HwTemp;
609 }
610 
allocate_temporary_registers(struct radeon_compiler * c,void * user)611 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
612 {
613 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
614 	struct rc_instruction *inst;
615 	struct rc_instruction *end_loop = NULL;
616 	unsigned int num_orig_temps = 0;
617 	bool hwtemps[RC_REGISTER_MAX_INDEX];
618 	struct temporary_allocation * ta;
619 	unsigned int i;
620 
621 	memset(hwtemps, 0, sizeof(hwtemps));
622 
623 	rc_recompute_ips(c);
624 
625 	/* Pass 1: Count original temporaries. */
626 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
627 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
628 
629 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
630 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
631 				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
632 					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
633 			}
634 		}
635 
636 		if (opcode->HasDstReg) {
637 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
638 				if (inst->U.I.DstReg.Index >= num_orig_temps)
639 					num_orig_temps = inst->U.I.DstReg.Index + 1;
640 			}
641 		}
642 	}
643 
644 	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
645 			sizeof(struct temporary_allocation) * num_orig_temps);
646 	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
647 
648 	/* Pass 2: Determine original temporary lifetimes */
649 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
650 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
651 		/* Instructions inside of loops need to use the ENDLOOP
652 		 * instruction as their LastRead. */
653 		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP)
654 			end_loop = rc_match_bgnloop(inst);
655 
656 		if (inst == end_loop) {
657 			end_loop = NULL;
658 			continue;
659 		}
660 
661 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
662 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
663 				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
664 			}
665 		}
666 	}
667 
668 	/* Pass 3: Register allocation */
669 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
670 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
671 
672 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
673 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
674 				unsigned int orig = inst->U.I.SrcReg[i].Index;
675 				inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
676 
677 				if (ta[orig].Allocated && inst == ta[orig].LastRead)
678 					hwtemps[ta[orig].HwTemp] = false;
679 			}
680 		}
681 
682 		if (opcode->HasDstReg) {
683 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
684 				unsigned int orig = inst->U.I.DstReg.Index;
685 				inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
686 			}
687 		}
688 	}
689 }
690 
691 /**
692  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
693  * and the Saturate opcode modifier. Only Absolute is currently transformed.
694  */
transform_nonnative_modifiers(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)695 static int transform_nonnative_modifiers(
696 	struct radeon_compiler *c,
697 	struct rc_instruction *inst,
698 	void* unused)
699 {
700 	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
701 	unsigned i;
702 
703 	/* Transform ABS(a) to MAX(a, -a). */
704 	for (i = 0; i < opcode->NumSrcRegs; i++) {
705 		if (inst->U.I.SrcReg[i].Abs) {
706 			struct rc_instruction *new_inst;
707 			unsigned temp;
708 
709 			inst->U.I.SrcReg[i].Abs = 0;
710 
711 			temp = rc_find_free_temporary(c);
712 
713 			new_inst = rc_insert_new_instruction(c, inst->Prev);
714 			new_inst->U.I.Opcode = RC_OPCODE_MAX;
715 			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
716 			new_inst->U.I.DstReg.Index = temp;
717 			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
718 			new_inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
719 			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
720 			new_inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZW;
721 			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
722 
723 			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
724 			inst->U.I.SrcReg[i].Index = temp;
725 			inst->U.I.SrcReg[i].RelAddr = 0;
726 		}
727 	}
728 	return 1;
729 }
730 
731 /**
732  * Vertex engine cannot read two inputs or two constants at the same time.
733  * Introduce intermediate MOVs to temporary registers to account for this.
734  */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)735 static int transform_source_conflicts(
736 	struct radeon_compiler *c,
737 	struct rc_instruction* inst,
738 	void* unused)
739 {
740 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
741 
742 	if (opcode->NumSrcRegs == 3) {
743 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
744 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
745 			int tmpreg = rc_find_free_temporary(c);
746 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
747 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
748 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
749 			inst_mov->U.I.DstReg.Index = tmpreg;
750 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
751 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
752 			inst_mov->U.I.SrcReg[0].Negate = 0;
753 			inst_mov->U.I.SrcReg[0].Abs = 0;
754 
755 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
756 			inst->U.I.SrcReg[2].Index = tmpreg;
757 			inst->U.I.SrcReg[2].RelAddr = false;
758 		}
759 	}
760 
761 	if (opcode->NumSrcRegs >= 2) {
762 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
763 			int tmpreg = rc_find_free_temporary(c);
764 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
765 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
766 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
767 			inst_mov->U.I.DstReg.Index = tmpreg;
768 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
769 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
770 			inst_mov->U.I.SrcReg[0].Negate = 0;
771 			inst_mov->U.I.SrcReg[0].Abs = 0;
772 
773 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
774 			inst->U.I.SrcReg[1].Index = tmpreg;
775 			inst->U.I.SrcReg[1].RelAddr = false;
776 		}
777 	}
778 
779 	return 1;
780 }
781 
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)782 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
783 {
784 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
785 	int i;
786 
787 	for(i = 0; i < 32; ++i) {
788 		if ((compiler->RequiredOutputs & (1U << i)) &&
789 		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
790 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
791 			inst->U.I.Opcode = RC_OPCODE_MOV;
792 
793 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
794 			inst->U.I.DstReg.Index = i;
795 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
796 
797 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
798 			inst->U.I.SrcReg[0].Index = 0;
799 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
800 
801 			compiler->Base.Program.OutputsWritten |= 1U << i;
802 		}
803 	}
804 }
805 
/**
 * Swizzle capability query: every opcode/source combination is reported as
 * natively supported, so both arguments are ignored and 1 is returned.
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	/* Both parameters are intentionally unused. */
	(void) reg;
	(void) opcode;

	return 1;
}
813 
transform_negative_addressing(struct r300_vertex_program_compiler * c,struct rc_instruction * arl,struct rc_instruction * end,int min_offset)814 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
815 					  struct rc_instruction *arl,
816 					  struct rc_instruction *end,
817 					  int min_offset)
818 {
819 	struct rc_instruction *inst, *add;
820 	unsigned const_swizzle;
821 
822 	/* Transform ARL/ARR */
823 	add = rc_insert_new_instruction(&c->Base, arl->Prev);
824 	add->U.I.Opcode = RC_OPCODE_ADD;
825 	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
826 	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
827 	add->U.I.DstReg.WriteMask = RC_MASK_X;
828 	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
829 	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
830 	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
831 								     min_offset, &const_swizzle);
832 	add->U.I.SrcReg[1].Swizzle = const_swizzle;
833 
834 	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
835 	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
836 	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
837 
838 	/* Rewrite offsets up to and excluding inst. */
839 	for (inst = arl->Next; inst != end; inst = inst->Next) {
840 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
841 
842 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
843 			if (inst->U.I.SrcReg[i].RelAddr)
844 				inst->U.I.SrcReg[i].Index -= min_offset;
845 	}
846 }
847 
rc_emulate_negative_addressing(struct radeon_compiler * compiler,void * user)848 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
849 {
850 	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
851 	struct rc_instruction *inst, *lastARL = NULL;
852 	int min_offset = 0;
853 
854 	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
855 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
856 
857 		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
858 			if (lastARL != NULL && min_offset < 0)
859 				transform_negative_addressing(c, lastARL, inst, min_offset);
860 
861 			lastARL = inst;
862 			min_offset = 0;
863 			continue;
864 		}
865 
866 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
867 			if (inst->U.I.SrcReg[i].RelAddr &&
868 			    inst->U.I.SrcReg[i].Index < 0) {
869 				/* ARL must precede any indirect addressing. */
870 				if (!lastARL) {
871 					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
872 					return;
873 				}
874 
875 				if (inst->U.I.SrcReg[i].Index < min_offset)
876 					min_offset = inst->U.I.SrcReg[i].Index;
877 			}
878 		}
879 	}
880 
881 	if (lastARL != NULL && min_offset < 0)
882 		transform_negative_addressing(c, lastARL, inst, min_offset);
883 }
884 
885 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
886 	.IsNative = &swizzle_is_native,
887 	.Split = NULL /* should never be called */
888 };
889 
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)890 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
891 {
892 	int is_r500 = c->Base.is_r500;
893 	int opt = !c->Base.disable_optimizations;
894 
895 	/* Lists of instruction transformations. */
896 	struct radeon_program_transformation alu_rewrite_r500[] = {
897 		{ &r300_transform_vertex_alu, NULL },
898 		{ &r300_transform_trig_scale_vertex, NULL },
899 		{ NULL, NULL }
900 	};
901 
902 	struct radeon_program_transformation alu_rewrite_r300[] = {
903 		{ &r300_transform_vertex_alu, NULL },
904 		{ &r300_transform_trig_simple, NULL },
905 		{ NULL, NULL }
906 	};
907 
908 	/* Note: These passes have to be done seperately from ALU rewrite,
909 	 * otherwise non-native ALU instructions with source conflits
910 	 * or non-native modifiers will not be treated properly.
911 	 */
912 	struct radeon_program_transformation emulate_modifiers[] = {
913 		{ &transform_nonnative_modifiers, NULL },
914 		{ NULL, NULL }
915 	};
916 
917 	struct radeon_program_transformation resolve_src_conflicts[] = {
918 		{ &transform_source_conflicts, NULL },
919 		{ NULL, NULL }
920 	};
921 
922 	/* List of compiler passes. */
923 	struct radeon_compiler_pass vs_list[] = {
924 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
925 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
926 		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
927 		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
928 		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
929 		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
930 		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
931 		{"deadcode",			1, opt,		rc_dataflow_deadcode,		NULL},
932 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
933 		/* This pass must be done after optimizations. */
934 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
935 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
936 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
937 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
938 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
939 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
940 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
941 		{NULL, 0, 0, NULL, NULL}
942 	};
943 
944 	c->Base.type = RC_VERTEX_PROGRAM;
945 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
946 
947 	rc_run_compiler(&c->Base, vs_list);
948 
949 	c->code->InputsRead = c->Base.Program.InputsRead;
950 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
951 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
952 }
953