• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 
23 #include "radeon_compiler.h"
24 
25 #include <stdbool.h>
26 #include <stdio.h>
27 
28 #include "r300_reg.h"
29 
30 #include "radeon_compiler_util.h"
31 #include "radeon_dataflow.h"
32 #include "radeon_program.h"
33 #include "radeon_program_alu.h"
34 #include "radeon_swizzle.h"
35 #include "radeon_remove_constants.h"
36 #include "radeon_regalloc.h"
37 #include "radeon_list.h"
38 
39 #include "util/compiler.h"
40 
/*
 * Build a source operand that yields a constant ZERO or ONE.
 *
 * The register index, register class and RelAddr flag are borrowed from
 * source x of the current instruction (which must already be set up and
 * valid); swizzle y is then applied to all four channels with no negation.
 *
 * The expansion refers to `vp` and `vpi`, which must be in scope.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))

53 
54 
t_dst_mask(unsigned int mask)55 static unsigned long t_dst_mask(unsigned int mask)
56 {
57 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
58 	return mask & RC_MASK_XYZW;
59 }
60 
/*
 * Map a compiler register file to the hardware destination register class.
 *
 * Files other than TEMPORARY, OUTPUT and ADDRESS are reported on stderr and
 * then treated as temporaries.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;
	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_DST_REG_TEMPORARY;
}
75 
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)76 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
77 				 struct rc_dst_register *dst)
78 {
79 	if (dst->File == RC_FILE_OUTPUT)
80 		return vp->outputs[dst->Index];
81 
82 	return dst->Index;
83 }
84 
/*
 * Map a compiler register file to the hardware source register class.
 *
 * RC_FILE_NONE is handled like a temporary. Files other than NONE,
 * TEMPORARY, INPUT and CONSTANT are reported on stderr and then also
 * treated as temporaries.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;
	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_SRC_REG_TEMPORARY;
}
100 
t_src_conflict(struct rc_src_register a,struct rc_src_register b)101 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
102 {
103 	unsigned long aclass = t_src_class(a.File);
104 	unsigned long bclass = t_src_class(b.File);
105 
106 	if (aclass != bclass)
107 		return 0;
108 	if (aclass == PVS_SRC_REG_TEMPORARY)
109 		return 0;
110 
111 	if (a.RelAddr || b.RelAddr)
112 		return 1;
113 	if (a.Index != b.Index)
114 		return 1;
115 
116 	return 0;
117 }
118 
/*
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is a no-op: the Mesa RC_SWIZZLE_* values are identical to the
 * VSF_IN_COMPONENT_* values, so the selector is returned unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
124 
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)125 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
126 				 struct rc_src_register *src)
127 {
128 	if (src->File == RC_FILE_INPUT) {
129 		assert(vp->inputs[src->Index] != -1);
130 		return vp->inputs[src->Index];
131 	} else {
132 		if (src->Index < 0) {
133 			fprintf(stderr,
134 				"negative offsets for indirect addressing do not work.\n");
135 			return 0;
136 		}
137 		return src->Index;
138 	}
139 }
140 
141 /* these two functions should probably be merged... */
142 
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)143 static unsigned long t_src(struct r300_vertex_program_code *vp,
144 			   struct rc_src_register *src)
145 {
146 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148 	 */
149 	return PVS_SRC_OPERAND(t_src_index(vp, src),
150 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
151 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
152 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
153 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
154 			       t_src_class(src->File),
155 			       src->Negate) |
156 	       (src->RelAddr << 4) | (src->Abs << 3);
157 }
158 
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)159 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
160 				  struct rc_src_register *src)
161 {
162 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164 	 */
165 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
166 
167 	return PVS_SRC_OPERAND(t_src_index(vp, src),
168 			       t_swizzle(swz),
169 			       t_swizzle(swz),
170 			       t_swizzle(swz),
171 			       t_swizzle(swz),
172 			       t_src_class(src->File),
173 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
174 	       (src->RelAddr << 4) | (src->Abs << 3);
175 }
176 
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)177 static int valid_dst(struct r300_vertex_program_code *vp,
178 			   struct rc_dst_register *dst)
179 {
180 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
181 		return 0;
182 	} else if (dst->File == RC_FILE_ADDRESS) {
183 		assert(dst->Index == 0);
184 	}
185 
186 	return 1;
187 }
188 
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 static void ei_vector1(struct r300_vertex_program_code *vp,
190 				unsigned int hw_opcode,
191 				struct rc_sub_instruction *vpi,
192 				unsigned int * inst)
193 {
194 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195 				     0,
196 				     0,
197 				     t_dst_index(vp, &vpi->DstReg),
198 				     t_dst_mask(vpi->DstReg.WriteMask),
199 				     t_dst_class(vpi->DstReg.File),
200                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
202 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
204 }
205 
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)206 static void ei_vector2(struct r300_vertex_program_code *vp,
207 				unsigned int hw_opcode,
208 				struct rc_sub_instruction *vpi,
209 				unsigned int * inst)
210 {
211 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212 				     0,
213 				     0,
214 				     t_dst_index(vp, &vpi->DstReg),
215 				     t_dst_mask(vpi->DstReg.WriteMask),
216 				     t_dst_class(vpi->DstReg.File),
217                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
219 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
220 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
221 }
222 
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)223 static void ei_math1(struct r300_vertex_program_code *vp,
224 				unsigned int hw_opcode,
225 				struct rc_sub_instruction *vpi,
226 				unsigned int * inst)
227 {
228 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
229 				     1,
230 				     0,
231 				     t_dst_index(vp, &vpi->DstReg),
232 				     t_dst_mask(vpi->DstReg.WriteMask),
233 				     t_dst_class(vpi->DstReg.File),
234                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
235 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
236 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
237 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
238 }
239 
ei_cmp(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)240 static void ei_cmp(struct r300_vertex_program_code *vp,
241 				struct rc_sub_instruction *vpi,
242 				unsigned int * inst)
243 {
244 	inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
245 				     0,
246 				     0,
247 				     t_dst_index(vp, &vpi->DstReg),
248 				     t_dst_mask(vpi->DstReg.WriteMask),
249 				     t_dst_class(vpi->DstReg.File),
250                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
251 
252 	/* Arguments with constant swizzles still count as a unique
253 	 * temporary, so we should make sure these arguments share a
254 	 * register index with one of the other arguments. */
255 	for (unsigned i = 0; i < 3; i++) {
256 		unsigned j = (i + 1) % 3;
257 		if (vpi->SrcReg[i].File == RC_FILE_NONE &&
258 			(vpi->SrcReg[j].File == RC_FILE_NONE ||
259 			 vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
260 			vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
261 			break;
262 		}
263 	}
264 
265 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
266 	inst[2] = t_src(vp, &vpi->SrcReg[2]);
267 	inst[3] = t_src(vp, &vpi->SrcReg[1]);
268 }
269 
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)270 static void ei_lit(struct r300_vertex_program_code *vp,
271 				      struct rc_sub_instruction *vpi,
272 				      unsigned int * inst)
273 {
274 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
275 
276 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
277 				     1,
278 				     0,
279 				     t_dst_index(vp, &vpi->DstReg),
280 				     t_dst_mask(vpi->DstReg.WriteMask),
281 				     t_dst_class(vpi->DstReg.File),
282                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
283 	/* NOTE: Users swizzling might not work. */
284 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
285 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
286 				  PVS_SRC_SELECT_FORCE_0,	// Z
287 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
288 				  t_src_class(vpi->SrcReg[0].File),
289 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
290 	    (vpi->SrcReg[0].RelAddr << 4);
291 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
292 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
293 				  PVS_SRC_SELECT_FORCE_0,	// Z
294 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
295 				  t_src_class(vpi->SrcReg[0].File),
296 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
297 	    (vpi->SrcReg[0].RelAddr << 4);
298 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
299 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
300 				  PVS_SRC_SELECT_FORCE_0,	// Z
301 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
302 				  t_src_class(vpi->SrcReg[0].File),
303 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
304 	    (vpi->SrcReg[0].RelAddr << 4);
305 }
306 
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)307 static void ei_mad(struct r300_vertex_program_code *vp,
308 				      struct rc_sub_instruction *vpi,
309 				      unsigned int * inst)
310 {
311 	unsigned int i;
312 	/* Remarks about hardware limitations of MAD
313 	 * (please preserve this comment, as this information is _NOT_
314 	 * in the documentation provided by AMD).
315 	 *
316 	 * As described in the documentation, MAD with three unique temporary
317 	 * source registers requires the use of the macro version.
318 	 *
319 	 * However (and this is not mentioned in the documentation), apparently
320 	 * the macro version is _NOT_ a full superset of the normal version.
321 	 * In particular, the macro version does not always work when relative
322 	 * addressing is used in the source operands.
323 	 *
324 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
325 	 * assembly shader path when using medium quality animations
326 	 * (i.e. animations with matrix blending instead of quaternion blending).
327 	 *
328 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
329 	 * test for this issue - for some reason, it is possible to have vertex
330 	 * programs whose prefix is *exactly* the same as the prefix of the
331 	 * offending program in Sauerbraten up to the offending instruction
332 	 * without causing any trouble.
333 	 *
334 	 * Bottom line: Only use the macro version only when really necessary;
335 	 * according to AMD docs, this should improve performance by one clock
336 	 * as a nice side bonus.
337 	 */
338 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
339 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
340 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
341 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
342 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
343 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
344 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
345 				0,
346 				1,
347 				t_dst_index(vp, &vpi->DstReg),
348 				t_dst_mask(vpi->DstReg.WriteMask),
349 				t_dst_class(vpi->DstReg.File),
350                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
351 	} else {
352 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
353 				0,
354 				0,
355 				t_dst_index(vp, &vpi->DstReg),
356 				t_dst_mask(vpi->DstReg.WriteMask),
357 				t_dst_class(vpi->DstReg.File),
358                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
359 
360 		/* Arguments with constant swizzles still count as a unique
361 		 * temporary, so we should make sure these arguments share a
362 		 * register index with one of the other arguments. */
363 		for (i = 0; i < 3; i++) {
364 			unsigned int j;
365 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
366 				continue;
367 
368 			for (j = 0; j < 3; j++) {
369 				if (i != j) {
370 					vpi->SrcReg[i].Index =
371 						vpi->SrcReg[j].Index;
372 					break;
373 				}
374 			}
375 		}
376 	}
377 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
378 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
379 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
380 }
381 
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)382 static void ei_pow(struct r300_vertex_program_code *vp,
383 				      struct rc_sub_instruction *vpi,
384 				      unsigned int * inst)
385 {
386 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
387 				     1,
388 				     0,
389 				     t_dst_index(vp, &vpi->DstReg),
390 				     t_dst_mask(vpi->DstReg.WriteMask),
391 				     t_dst_class(vpi->DstReg.File),
392                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
393 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
394 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
395 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
396 }
397 
translate_vertex_program(struct radeon_compiler * c,void * user)398 static void translate_vertex_program(struct radeon_compiler *c, void *user)
399 {
400 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
401 	struct rc_instruction *rci;
402 
403 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
404 	unsigned loop_depth = 0;
405 	bool last_input_read_at_loop_end = false;
406 	bool last_pos_write_at_loop_end = false;
407 
408 	compiler->code->pos_end = 0;	/* Not supported yet */
409 	compiler->code->length = 0;
410 	compiler->code->num_temporaries = 0;
411 	compiler->code->last_input_read = 0;
412 	compiler->code->last_pos_write = 0;
413 
414 	compiler->SetHwInputOutput(compiler);
415 
416 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
417 		struct rc_sub_instruction *vpi = &rci->U.I;
418 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
419 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
420 
421 		/* Skip instructions writing to non-existing destination */
422 		if (!valid_dst(compiler->code, &vpi->DstReg))
423 			continue;
424 
425 		if (info->HasDstReg) {
426 			/* Neither is Saturate. */
427 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
428 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
429 					 "modifier (yet).\n");
430 			}
431 		}
432 
433 		if (compiler->code->length >= c->max_alu_insts * 4) {
434 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
435 			return;
436 		}
437 
438 		assert(compiler->Base.is_r500 ||
439 		       (vpi->Opcode != RC_OPCODE_SEQ &&
440 			vpi->Opcode != RC_OPCODE_SNE));
441 
442 		switch (vpi->Opcode) {
443 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
444 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
445 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
446 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
447 		case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
448 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
449 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
450 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
451 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
452 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
453 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
454 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
455 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
456 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
457 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
458 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
459 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
460 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
461 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
462 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
463 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
464 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
465 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
466 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
467 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
468 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
469 		case RC_OPCODE_BGNLOOP:
470 		{
471 			if ((!compiler->Base.is_r500
472 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
473 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
474 				rc_error(&compiler->Base,
475 						"Loops are nested too deep.");
476 				return;
477 			}
478 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
479 			break;
480 		}
481 		case RC_OPCODE_ENDLOOP:
482 		{
483 			unsigned int act_addr;
484 			unsigned int last_addr;
485 			unsigned int ret_addr;
486 
487 			if (loop_depth == 1 && last_input_read_at_loop_end) {
488 				compiler->code->last_input_read = compiler->code->length / 4;
489 				last_input_read_at_loop_end = false;
490 			}
491 			if (loop_depth == 1 && last_pos_write_at_loop_end) {
492 				compiler->code->last_pos_write = compiler->code->length / 4;
493 				last_pos_write_at_loop_end = false;
494 			}
495 
496 			ret_addr = loops[--loop_depth];
497 			act_addr = ret_addr - 1;
498 			last_addr = (compiler->code->length / 4) - 1;
499 
500 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
501 				rc_error(&compiler->Base,
502 					"Too many flow control instructions.");
503 				return;
504 			}
505 			/* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
506 			 * we reduce it to half to avoid occasional hangs on RV516
507 			 * and downclocked RV530.
508 			 */
509 			if (compiler->Base.is_r500) {
510 				compiler->code->fc_op_addrs.r500
511 					[compiler->code->num_fc_ops].lw =
512 					R500_PVS_FC_ACT_ADRS(act_addr)
513 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080)
514 					;
515 				compiler->code->fc_op_addrs.r500
516 					[compiler->code->num_fc_ops].uw =
517 					R500_PVS_FC_LAST_INST(last_addr)
518 					| R500_PVS_FC_RTN_INST(ret_addr)
519 					;
520 			} else {
521 				compiler->code->fc_op_addrs.r300
522 					[compiler->code->num_fc_ops] =
523 					R300_PVS_FC_ACT_ADRS(act_addr)
524 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
525 					| R300_PVS_FC_LAST_INST(last_addr)
526 					| R300_PVS_FC_RTN_INST(ret_addr)
527 					;
528 			}
529 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
530 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
531 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
532 				;
533 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
534 						compiler->code->num_fc_ops);
535 			compiler->code->num_fc_ops++;
536 
537 			break;
538 		}
539 
540 		case RC_ME_PRED_SET_CLR:
541 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
542 			break;
543 
544 		case RC_ME_PRED_SET_INV:
545 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
546 			break;
547 
548 		case RC_ME_PRED_SET_POP:
549 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
550 			break;
551 
552 		case RC_ME_PRED_SET_RESTORE:
553 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
554 			break;
555 
556 		case RC_ME_PRED_SEQ:
557 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
558 			break;
559 
560 		case RC_ME_PRED_SNEQ:
561 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
562 			break;
563 
564 		case RC_VE_PRED_SNEQ_PUSH:
565 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
566 								vpi, inst);
567 			break;
568 
569 		default:
570 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
571 			return;
572 		}
573 
574 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
575 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
576 						<< PVS_DST_PRED_ENABLE_SHIFT);
577 			if (vpi->DstReg.Pred == RC_PRED_SET) {
578 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
579 						<< PVS_DST_PRED_SENSE_SHIFT);
580 			}
581 		}
582 
583 		/* Update the number of temporaries. */
584 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
585 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
586 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
587 
588 		/* last instruction that writes position */
589 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
590 		    t_dst_index(compiler->code, &vpi->DstReg) == 0) {
591 			if (loop_depth == 0)
592 				compiler->code->last_pos_write = compiler->code->length / 4;
593 			else
594 				last_pos_write_at_loop_end = true;
595 		}
596 
597 		for (unsigned i = 0; i < info->NumSrcRegs; i++) {
598 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
599 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
600 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
601 			if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
602 				if (loop_depth == 0)
603 					compiler->code->last_input_read = compiler->code->length / 4;
604 				else
605 					last_input_read_at_loop_end = true;
606 			}
607 
608 		}
609 
610 
611 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
612 			rc_error(&compiler->Base, "Too many temporaries.\n");
613 			return;
614 		}
615 
616 		compiler->code->length += 4;
617 
618 		if (compiler->Base.Error)
619 			return;
620 	}
621 }
622 
/* Per-temporary bookkeeping used by get_reg(). */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated : 1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp : 15;
	/* NOTE(review): presumably the last instruction reading this
	 * temporary; not referenced by the code visible here - confirm. */
	struct rc_instruction *LastRead;
};
628 
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)629 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
630                    unsigned int orig)
631 {
632     if (!ta[orig].Allocated) {
633         int j;
634         for (j = 0; j < c->max_temp_regs; ++j)
635         {
636             if (!hwtemps[j])
637                 break;
638         }
639         ta[orig].Allocated = 1;
640         ta[orig].HwTemp = j;
641         hwtemps[ta[orig].HwTemp] = true;
642     }
643 
644     return ta[orig].HwTemp;
645 }
646 
allocate_temporary_registers(struct radeon_compiler * c,void * user)647 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
648 {
649 	unsigned int node_count, node_index;
650 	struct ra_class ** node_classes;
651 	struct rc_list * var_ptr;
652 	struct rc_list * variables;
653 	struct ra_graph * graph;
654 	const struct rc_regalloc_state *ra_state = c->regalloc_state;
655 
656 	rc_recompute_ips(c);
657 
658 	/* Get list of program variables */
659 	variables = rc_get_variables(c);
660 	node_count = rc_list_count(variables);
661 	node_classes = memory_pool_malloc(&c->Pool,
662 			node_count * sizeof(struct ra_class *));
663 
664 	for (var_ptr = variables, node_index = 0; var_ptr;
665 					var_ptr = var_ptr->Next, node_index++) {
666 		unsigned int class_index = 0;
667 		int index;
668 		/* Compute the live intervals */
669 		rc_variable_compute_live_intervals(var_ptr->Item);
670 		unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
671 		index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
672 		if (index > -1) {
673 			class_index = c->regalloc_state->class_list[index].ID;
674 		} else {
675 			rc_error(c,
676 				"Could not find class for index=%u mask=%u\n",
677 				((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
678 		}
679 		node_classes[node_index] = ra_state->classes[class_index];
680 	}
681 
682 	graph = ra_alloc_interference_graph(ra_state->regs, node_count);
683 
684 	for (node_index = 0; node_index < node_count; node_index++) {
685 		ra_set_node_class(graph, node_index, node_classes[node_index]);
686 	}
687 
688 	rc_build_interference_graph(graph, variables);
689 
690 	if (!ra_allocate(graph)) {
691 		rc_error(c, "Ran out of hardware temporaries\n");
692                 ralloc_free(graph);
693 		return;
694 	}
695 
696 	/* Rewrite the registers */
697 	for (var_ptr = variables, node_index = 0; var_ptr;
698 				var_ptr = var_ptr->Next, node_index++) {
699 		int reg = ra_get_node_reg(graph, node_index);
700 		unsigned int writemask = reg_get_writemask(reg);
701 		unsigned int index = reg_get_index(reg);
702 		struct rc_variable * var = var_ptr->Item;
703 
704 		rc_variable_change_dst(var, index, writemask);
705 	}
706 
707 	ralloc_free(graph);
708 }
709 
710 /**
711  * Vertex engine cannot read two inputs or two constants at the same time.
712  * Introduce intermediate MOVs to temporary registers to account for this.
713  */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)714 static int transform_source_conflicts(
715 	struct radeon_compiler *c,
716 	struct rc_instruction* inst,
717 	void* unused)
718 {
719 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
720 
721 	if (opcode->NumSrcRegs == 3) {
722 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
723 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
724 			int tmpreg = rc_find_free_temporary(c);
725 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
726 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
727 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
728 			inst_mov->U.I.DstReg.Index = tmpreg;
729 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
730 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
731 			inst_mov->U.I.SrcReg[0].Negate = 0;
732 			inst_mov->U.I.SrcReg[0].Abs = 0;
733 
734 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
735 			inst->U.I.SrcReg[2].Index = tmpreg;
736 			inst->U.I.SrcReg[2].RelAddr = false;
737 		}
738 	}
739 
740 	if (opcode->NumSrcRegs >= 2) {
741 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
742 			int tmpreg = rc_find_free_temporary(c);
743 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
744 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
745 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
746 			inst_mov->U.I.DstReg.Index = tmpreg;
747 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
748 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
749 			inst_mov->U.I.SrcReg[0].Negate = 0;
750 			inst_mov->U.I.SrcReg[0].Abs = 0;
751 
752 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
753 			inst->U.I.SrcReg[1].Index = tmpreg;
754 			inst->U.I.SrcReg[1].RelAddr = false;
755 		}
756 	}
757 
758 	return 1;
759 }
760 
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)761 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
762 {
763 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
764 	int i;
765 
766 	for(i = 0; i < 32; ++i) {
767 		if ((compiler->RequiredOutputs & (1U << i)) &&
768 		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
769 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
770 			inst->U.I.Opcode = RC_OPCODE_MOV;
771 
772 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
773 			inst->U.I.DstReg.Index = i;
774 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
775 
776 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
777 			inst->U.I.SrcReg[0].Index = 0;
778 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
779 
780 			compiler->Base.Program.OutputsWritten |= 1U << i;
781 		}
782 	}
783 }
784 
/*
 * Swizzle capability callback: reports every swizzle as native, whatever
 * the opcode or source register. Both arguments are ignored.
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	const int is_native = 1;

	(void) reg;
	(void) opcode;

	return is_native;
}
792 
793 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
794 	.IsNative = &swizzle_is_native,
795 	.Split = NULL /* should never be called */
796 };
797 
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)798 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
799 {
800 	int is_r500 = c->Base.is_r500;
801 	int opt = !c->Base.disable_optimizations;
802 
803 	/* Lists of instruction transformations. */
804 	struct radeon_program_transformation alu_rewrite[] = {
805 		{ &r300_transform_vertex_alu, NULL },
806 		{ NULL, NULL }
807 	};
808 
809 	struct radeon_program_transformation resolve_src_conflicts[] = {
810 		{ &transform_source_conflicts, NULL },
811 		{ NULL, NULL }
812 	};
813 
814 	/* List of compiler passes. */
815 	struct radeon_compiler_pass vs_list[] = {
816 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
817 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
818 		{"native rewrite",		1, 1,		rc_local_transform,		alu_rewrite},
819 		{"unused channels",		1, opt,		rc_mark_unused_channels,	NULL},
820 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
821 		/* This pass must be done after optimizations. */
822 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
823 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
824 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
825 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
826 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
827 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
828 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
829 		{NULL, 0, 0, NULL, NULL}
830 	};
831 
832 	c->Base.type = RC_VERTEX_PROGRAM;
833 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
834 
835 	rc_run_compiler(&c->Base, vs_list);
836 
837 	c->code->InputsRead = c->Base.Program.InputsRead;
838 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
839 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
840 }
841