1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdbool.h>
26 #include <stdio.h>
27
28 #include "r300_reg.h"
29
30 #include "radeon_compiler_util.h"
31 #include "radeon_dataflow.h"
32 #include "radeon_program.h"
33 #include "radeon_program_alu.h"
34 #include "radeon_swizzle.h"
35 #include "radeon_emulate_branches.h"
36 #include "radeon_emulate_loops.h"
37 #include "radeon_remove_constants.h"
38
39 #include "util/compiler.h"
40
/*
 * Take an already-setup and valid source, then swizzle it appropriately to
 * obtain a constant source (e.g. ZERO or ONE).
 *
 * x: index into vpi->SrcReg[] of the source whose register index, register
 *    class and RelAddr bit are reused.
 * y: swizzle selector (e.g. RC_SWIZZLE_ZERO) applied to all four components.
 *
 * The expansion refers to `vp` and `vpi`, which must be in scope where the
 * macro is used. The negate field is always RC_MASK_NONE.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[x].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
53
54
t_dst_mask(unsigned int mask)55 static unsigned long t_dst_mask(unsigned int mask)
56 {
57 /* RC_MASK_* is equivalent to VSF_FLAG_* */
58 return mask & RC_MASK_XYZW;
59 }
60
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Files other than TEMPORARY, OUTPUT and ADDRESS are reported on stderr and
 * then treated like a temporary.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;

	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_DST_REG_TEMPORARY;
}
75
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)76 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
77 struct rc_dst_register *dst)
78 {
79 if (dst->File == RC_FILE_OUTPUT)
80 return vp->outputs[dst->Index];
81
82 return dst->Index;
83 }
84
/**
 * Map a compiler register file to the hardware source register class.
 *
 * RC_FILE_NONE and RC_FILE_TEMPORARY both map to the temporary class.
 * Any other unhandled file is reported on stderr and also treated like a
 * temporary.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;

	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_SRC_REG_TEMPORARY;
}
100
t_src_conflict(struct rc_src_register a,struct rc_src_register b)101 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
102 {
103 unsigned long aclass = t_src_class(a.File);
104 unsigned long bclass = t_src_class(b.File);
105
106 if (aclass != bclass)
107 return 0;
108 if (aclass == PVS_SRC_REG_TEMPORARY)
109 return 0;
110
111 if (a.RelAddr || b.RelAddr)
112 return 1;
113 if (a.Index != b.Index)
114 return 1;
115
116 return 0;
117 }
118
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is a no-op: the RC_SWIZZLE_* values are all identical to the
 * VSF_IN_COMPONENT_* values, so the selector is returned unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
124
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)125 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
126 struct rc_src_register *src)
127 {
128 if (src->File == RC_FILE_INPUT) {
129 assert(vp->inputs[src->Index] != -1);
130 return vp->inputs[src->Index];
131 } else {
132 if (src->Index < 0) {
133 fprintf(stderr,
134 "negative offsets for indirect addressing do not work.\n");
135 return 0;
136 }
137 return src->Index;
138 }
139 }
140
141 /* these two functions should probably be merged... */
142
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)143 static unsigned long t_src(struct r300_vertex_program_code *vp,
144 struct rc_src_register *src)
145 {
146 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148 */
149 return PVS_SRC_OPERAND(t_src_index(vp, src),
150 t_swizzle(GET_SWZ(src->Swizzle, 0)),
151 t_swizzle(GET_SWZ(src->Swizzle, 1)),
152 t_swizzle(GET_SWZ(src->Swizzle, 2)),
153 t_swizzle(GET_SWZ(src->Swizzle, 3)),
154 t_src_class(src->File),
155 src->Negate) |
156 (src->RelAddr << 4) | (src->Abs << 3);
157 }
158
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)159 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
160 struct rc_src_register *src)
161 {
162 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164 */
165 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
166
167 return PVS_SRC_OPERAND(t_src_index(vp, src),
168 t_swizzle(swz),
169 t_swizzle(swz),
170 t_swizzle(swz),
171 t_swizzle(swz),
172 t_src_class(src->File),
173 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
174 (src->RelAddr << 4) | (src->Abs << 3);
175 }
176
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)177 static int valid_dst(struct r300_vertex_program_code *vp,
178 struct rc_dst_register *dst)
179 {
180 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
181 return 0;
182 } else if (dst->File == RC_FILE_ADDRESS) {
183 assert(dst->Index == 0);
184 }
185
186 return 1;
187 }
188
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 static void ei_vector1(struct r300_vertex_program_code *vp,
190 unsigned int hw_opcode,
191 struct rc_sub_instruction *vpi,
192 unsigned int * inst)
193 {
194 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195 0,
196 0,
197 t_dst_index(vp, &vpi->DstReg),
198 t_dst_mask(vpi->DstReg.WriteMask),
199 t_dst_class(vpi->DstReg.File),
200 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201 inst[1] = t_src(vp, &vpi->SrcReg[0]);
202 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
204 }
205
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)206 static void ei_vector2(struct r300_vertex_program_code *vp,
207 unsigned int hw_opcode,
208 struct rc_sub_instruction *vpi,
209 unsigned int * inst)
210 {
211 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212 0,
213 0,
214 t_dst_index(vp, &vpi->DstReg),
215 t_dst_mask(vpi->DstReg.WriteMask),
216 t_dst_class(vpi->DstReg.File),
217 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218 inst[1] = t_src(vp, &vpi->SrcReg[0]);
219 inst[2] = t_src(vp, &vpi->SrcReg[1]);
220 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
221 }
222
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)223 static void ei_math1(struct r300_vertex_program_code *vp,
224 unsigned int hw_opcode,
225 struct rc_sub_instruction *vpi,
226 unsigned int * inst)
227 {
228 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
229 1,
230 0,
231 t_dst_index(vp, &vpi->DstReg),
232 t_dst_mask(vpi->DstReg.WriteMask),
233 t_dst_class(vpi->DstReg.File),
234 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
235 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
236 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
237 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
238 }
239
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)240 static void ei_lit(struct r300_vertex_program_code *vp,
241 struct rc_sub_instruction *vpi,
242 unsigned int * inst)
243 {
244 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
245
246 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
247 1,
248 0,
249 t_dst_index(vp, &vpi->DstReg),
250 t_dst_mask(vpi->DstReg.WriteMask),
251 t_dst_class(vpi->DstReg.File),
252 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
253 /* NOTE: Users swizzling might not work. */
254 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
255 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
256 PVS_SRC_SELECT_FORCE_0, // Z
257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
258 t_src_class(vpi->SrcReg[0].File),
259 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260 (vpi->SrcReg[0].RelAddr << 4);
261 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
262 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
263 PVS_SRC_SELECT_FORCE_0, // Z
264 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
265 t_src_class(vpi->SrcReg[0].File),
266 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
267 (vpi->SrcReg[0].RelAddr << 4);
268 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
269 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
270 PVS_SRC_SELECT_FORCE_0, // Z
271 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
272 t_src_class(vpi->SrcReg[0].File),
273 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
274 (vpi->SrcReg[0].RelAddr << 4);
275 }
276
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)277 static void ei_mad(struct r300_vertex_program_code *vp,
278 struct rc_sub_instruction *vpi,
279 unsigned int * inst)
280 {
281 unsigned int i;
282 /* Remarks about hardware limitations of MAD
283 * (please preserve this comment, as this information is _NOT_
284 * in the documentation provided by AMD).
285 *
286 * As described in the documentation, MAD with three unique temporary
287 * source registers requires the use of the macro version.
288 *
289 * However (and this is not mentioned in the documentation), apparently
290 * the macro version is _NOT_ a full superset of the normal version.
291 * In particular, the macro version does not always work when relative
292 * addressing is used in the source operands.
293 *
294 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
295 * assembly shader path when using medium quality animations
296 * (i.e. animations with matrix blending instead of quaternion blending).
297 *
298 * Unfortunately, I (nha) have been unable to extract a Piglit regression
299 * test for this issue - for some reason, it is possible to have vertex
300 * programs whose prefix is *exactly* the same as the prefix of the
301 * offending program in Sauerbraten up to the offending instruction
302 * without causing any trouble.
303 *
304 * Bottom line: Only use the macro version only when really necessary;
305 * according to AMD docs, this should improve performance by one clock
306 * as a nice side bonus.
307 */
308 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
309 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
310 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
311 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
312 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
313 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
314 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
315 0,
316 1,
317 t_dst_index(vp, &vpi->DstReg),
318 t_dst_mask(vpi->DstReg.WriteMask),
319 t_dst_class(vpi->DstReg.File),
320 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
321 } else {
322 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
323 0,
324 0,
325 t_dst_index(vp, &vpi->DstReg),
326 t_dst_mask(vpi->DstReg.WriteMask),
327 t_dst_class(vpi->DstReg.File),
328 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
329
330 /* Arguments with constant swizzles still count as a unique
331 * temporary, so we should make sure these arguments share a
332 * register index with one of the other arguments. */
333 for (i = 0; i < 3; i++) {
334 unsigned int j;
335 if (vpi->SrcReg[i].File != RC_FILE_NONE)
336 continue;
337
338 for (j = 0; j < 3; j++) {
339 if (i != j) {
340 vpi->SrcReg[i].Index =
341 vpi->SrcReg[j].Index;
342 break;
343 }
344 }
345 }
346 }
347 inst[1] = t_src(vp, &vpi->SrcReg[0]);
348 inst[2] = t_src(vp, &vpi->SrcReg[1]);
349 inst[3] = t_src(vp, &vpi->SrcReg[2]);
350 }
351
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)352 static void ei_pow(struct r300_vertex_program_code *vp,
353 struct rc_sub_instruction *vpi,
354 unsigned int * inst)
355 {
356 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
357 1,
358 0,
359 t_dst_index(vp, &vpi->DstReg),
360 t_dst_mask(vpi->DstReg.WriteMask),
361 t_dst_class(vpi->DstReg.File),
362 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
363 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
364 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
365 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
366 }
367
translate_vertex_program(struct radeon_compiler * c,void * user)368 static void translate_vertex_program(struct radeon_compiler *c, void *user)
369 {
370 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
371 struct rc_instruction *rci;
372
373 unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
374 unsigned loop_depth = 0;
375
376 compiler->code->pos_end = 0; /* Not supported yet */
377 compiler->code->length = 0;
378 compiler->code->num_temporaries = 0;
379
380 compiler->SetHwInputOutput(compiler);
381
382 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
383 struct rc_sub_instruction *vpi = &rci->U.I;
384 unsigned int *inst = compiler->code->body.d + compiler->code->length;
385 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
386
387 /* Skip instructions writing to non-existing destination */
388 if (!valid_dst(compiler->code, &vpi->DstReg))
389 continue;
390
391 if (info->HasDstReg) {
392 /* Neither is Saturate. */
393 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
394 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
395 "modifier (yet).\n");
396 }
397 }
398
399 if (compiler->code->length >= c->max_alu_insts * 4) {
400 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
401 return;
402 }
403
404 assert(compiler->Base.is_r500 ||
405 (vpi->Opcode != RC_OPCODE_SEQ &&
406 vpi->Opcode != RC_OPCODE_SNE));
407
408 switch (vpi->Opcode) {
409 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
410 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
411 case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
412 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
413 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
414 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
415 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
416 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
417 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
418 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
419 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
420 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
421 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
422 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
423 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
424 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
425 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
426 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
427 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
428 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
429 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
430 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
431 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
432 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
433 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
434 case RC_OPCODE_BGNLOOP:
435 {
436 if ((!compiler->Base.is_r500
437 && loop_depth >= R300_VS_MAX_LOOP_DEPTH)
438 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
439 rc_error(&compiler->Base,
440 "Loops are nested too deep.");
441 return;
442 }
443 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
444 break;
445 }
446 case RC_OPCODE_ENDLOOP:
447 {
448 unsigned int act_addr;
449 unsigned int last_addr;
450 unsigned int ret_addr;
451
452 ret_addr = loops[--loop_depth];
453 act_addr = ret_addr - 1;
454 last_addr = (compiler->code->length / 4) - 1;
455
456 if (loop_depth >= R300_VS_MAX_FC_OPS) {
457 rc_error(&compiler->Base,
458 "Too many flow control instructions.");
459 return;
460 }
461 if (compiler->Base.is_r500) {
462 compiler->code->fc_op_addrs.r500
463 [compiler->code->num_fc_ops].lw =
464 R500_PVS_FC_ACT_ADRS(act_addr)
465 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
466 ;
467 compiler->code->fc_op_addrs.r500
468 [compiler->code->num_fc_ops].uw =
469 R500_PVS_FC_LAST_INST(last_addr)
470 | R500_PVS_FC_RTN_INST(ret_addr)
471 ;
472 } else {
473 compiler->code->fc_op_addrs.r300
474 [compiler->code->num_fc_ops] =
475 R300_PVS_FC_ACT_ADRS(act_addr)
476 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
477 | R300_PVS_FC_LAST_INST(last_addr)
478 | R300_PVS_FC_RTN_INST(ret_addr)
479 ;
480 }
481 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
482 R300_PVS_FC_LOOP_INIT_VAL(0x0)
483 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
484 ;
485 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
486 compiler->code->num_fc_ops);
487 compiler->code->num_fc_ops++;
488
489 break;
490 }
491
492 case RC_ME_PRED_SET_CLR:
493 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
494 break;
495
496 case RC_ME_PRED_SET_INV:
497 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
498 break;
499
500 case RC_ME_PRED_SET_POP:
501 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
502 break;
503
504 case RC_ME_PRED_SET_RESTORE:
505 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
506 break;
507
508 case RC_ME_PRED_SEQ:
509 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
510 break;
511
512 case RC_ME_PRED_SNEQ:
513 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
514 break;
515
516 case RC_VE_PRED_SNEQ_PUSH:
517 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
518 vpi, inst);
519 break;
520
521 default:
522 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
523 return;
524 }
525
526 if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
527 inst[0] |= (PVS_DST_PRED_ENABLE_MASK
528 << PVS_DST_PRED_ENABLE_SHIFT);
529 if (vpi->DstReg.Pred == RC_PRED_SET) {
530 inst[0] |= (PVS_DST_PRED_SENSE_MASK
531 << PVS_DST_PRED_SENSE_SHIFT);
532 }
533 }
534
535 /* Update the number of temporaries. */
536 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
537 vpi->DstReg.Index >= compiler->code->num_temporaries)
538 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
539
540 for (unsigned i = 0; i < info->NumSrcRegs; i++)
541 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
542 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
543 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
544
545 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
546 rc_error(&compiler->Base, "Too many temporaries.\n");
547 return;
548 }
549
550 compiler->code->length += 4;
551
552 if (compiler->Base.Error)
553 return;
554 }
555 }
556
/**
 * Per-original-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
	/* Set to 1 by get_reg() once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary; meaningful once Allocated. */
	unsigned int HwTemp:15;
	/* Instruction after which the temporary's hardware register is released
	 * (the last reader, or the ENDLOOP of the enclosing loop); stays NULL
	 * when the temporary is never read. */
	struct rc_instruction * LastRead;
};
562
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)563 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
564 unsigned int orig)
565 {
566 if (!ta[orig].Allocated) {
567 int j;
568 for (j = 0; j < c->max_temp_regs; ++j)
569 {
570 if (!hwtemps[j])
571 break;
572 }
573 ta[orig].Allocated = 1;
574 ta[orig].HwTemp = j;
575 hwtemps[ta[orig].HwTemp] = true;
576 }
577
578 return ta[orig].HwTemp;
579 }
580
allocate_temporary_registers(struct radeon_compiler * c,void * user)581 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
582 {
583 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
584 struct rc_instruction *inst;
585 struct rc_instruction *end_loop = NULL;
586 unsigned int num_orig_temps = 0;
587 bool hwtemps[RC_REGISTER_MAX_INDEX];
588 struct temporary_allocation * ta;
589 unsigned int i;
590
591 memset(hwtemps, 0, sizeof(hwtemps));
592
593 rc_recompute_ips(c);
594
595 /* Pass 1: Count original temporaries. */
596 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
597 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
598
599 for (i = 0; i < opcode->NumSrcRegs; ++i) {
600 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
601 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
602 num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
603 }
604 }
605
606 if (opcode->HasDstReg) {
607 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
608 if (inst->U.I.DstReg.Index >= num_orig_temps)
609 num_orig_temps = inst->U.I.DstReg.Index + 1;
610 }
611 }
612 }
613
614 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
615 sizeof(struct temporary_allocation) * num_orig_temps);
616 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
617
618 /* Pass 2: Determine original temporary lifetimes */
619 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
620 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
621 /* Instructions inside of loops need to use the ENDLOOP
622 * instruction as their LastRead. */
623 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
624 int endloops = 1;
625 struct rc_instruction * ptr;
626 for(ptr = inst->Next;
627 ptr != &compiler->Base.Program.Instructions;
628 ptr = ptr->Next){
629 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
630 endloops++;
631 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
632 endloops--;
633 if (endloops <= 0) {
634 end_loop = ptr;
635 break;
636 }
637 }
638 }
639 }
640
641 if (inst == end_loop) {
642 end_loop = NULL;
643 continue;
644 }
645
646 for (i = 0; i < opcode->NumSrcRegs; ++i) {
647 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
648 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
649 }
650 }
651 }
652
653 /* Pass 3: Register allocation */
654 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
655 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
656
657 for (i = 0; i < opcode->NumSrcRegs; ++i) {
658 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
659 unsigned int orig = inst->U.I.SrcReg[i].Index;
660 inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
661
662 if (ta[orig].Allocated && inst == ta[orig].LastRead)
663 hwtemps[ta[orig].HwTemp] = false;
664 }
665 }
666
667 if (opcode->HasDstReg) {
668 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
669 unsigned int orig = inst->U.I.DstReg.Index;
670 inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
671 }
672 }
673 }
674 }
675
676 /**
677 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
678 * and the Saturate opcode modifier. Only Absolute is currently transformed.
679 */
transform_nonnative_modifiers(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)680 static int transform_nonnative_modifiers(
681 struct radeon_compiler *c,
682 struct rc_instruction *inst,
683 void* unused)
684 {
685 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
686 unsigned i;
687
688 /* Transform ABS(a) to MAX(a, -a). */
689 for (i = 0; i < opcode->NumSrcRegs; i++) {
690 if (inst->U.I.SrcReg[i].Abs) {
691 struct rc_instruction *new_inst;
692 unsigned temp;
693
694 inst->U.I.SrcReg[i].Abs = 0;
695
696 temp = rc_find_free_temporary(c);
697
698 new_inst = rc_insert_new_instruction(c, inst->Prev);
699 new_inst->U.I.Opcode = RC_OPCODE_MAX;
700 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
701 new_inst->U.I.DstReg.Index = temp;
702 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
703 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
704 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
705
706 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
707 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
708 inst->U.I.SrcReg[i].Index = temp;
709 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
710 }
711 }
712 return 1;
713 }
714
715 /**
716 * Vertex engine cannot read two inputs or two constants at the same time.
717 * Introduce intermediate MOVs to temporary registers to account for this.
718 */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)719 static int transform_source_conflicts(
720 struct radeon_compiler *c,
721 struct rc_instruction* inst,
722 void* unused)
723 {
724 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
725
726 if (opcode->NumSrcRegs == 3) {
727 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
728 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
729 int tmpreg = rc_find_free_temporary(c);
730 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
731 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
732 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
733 inst_mov->U.I.DstReg.Index = tmpreg;
734 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
735
736 reset_srcreg(&inst->U.I.SrcReg[2]);
737 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
738 inst->U.I.SrcReg[2].Index = tmpreg;
739 }
740 }
741
742 if (opcode->NumSrcRegs >= 2) {
743 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
744 int tmpreg = rc_find_free_temporary(c);
745 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
746 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
747 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
748 inst_mov->U.I.DstReg.Index = tmpreg;
749 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
750
751 reset_srcreg(&inst->U.I.SrcReg[1]);
752 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
753 inst->U.I.SrcReg[1].Index = tmpreg;
754 }
755 }
756
757 return 1;
758 }
759
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)760 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
761 {
762 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
763 int i;
764
765 for(i = 0; i < 32; ++i) {
766 if ((compiler->RequiredOutputs & (1U << i)) &&
767 !(compiler->Base.Program.OutputsWritten & (1U << i))) {
768 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
769 inst->U.I.Opcode = RC_OPCODE_MOV;
770
771 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
772 inst->U.I.DstReg.Index = i;
773 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
774
775 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
776 inst->U.I.SrcReg[0].Index = 0;
777 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
778
779 compiler->Base.Program.OutputsWritten |= 1U << i;
780 }
781 }
782 }
783
dataflow_outputs_mark_used(void * userdata,void * data,void (* callback)(void *,unsigned int,unsigned int))784 static void dataflow_outputs_mark_used(void * userdata, void * data,
785 void (*callback)(void *, unsigned int, unsigned int))
786 {
787 struct r300_vertex_program_compiler * c = userdata;
788 int i;
789
790 for(i = 0; i < 32; ++i) {
791 if (c->RequiredOutputs & (1U << i))
792 callback(data, i, RC_MASK_XYZW);
793 }
794 }
795
/**
 * SwizzleCaps.IsNative hook for vertex programs: every opcode/swizzle
 * combination is reported as native, so both arguments are ignored and the
 * result is always 1.
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	(void) reg;
	(void) opcode;

	return 1;
}
803
transform_negative_addressing(struct r300_vertex_program_compiler * c,struct rc_instruction * arl,struct rc_instruction * end,int min_offset)804 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
805 struct rc_instruction *arl,
806 struct rc_instruction *end,
807 int min_offset)
808 {
809 struct rc_instruction *inst, *add;
810 unsigned const_swizzle;
811
812 /* Transform ARL/ARR */
813 add = rc_insert_new_instruction(&c->Base, arl->Prev);
814 add->U.I.Opcode = RC_OPCODE_ADD;
815 add->U.I.DstReg.File = RC_FILE_TEMPORARY;
816 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
817 add->U.I.DstReg.WriteMask = RC_MASK_X;
818 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
819 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
820 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
821 min_offset, &const_swizzle);
822 add->U.I.SrcReg[1].Swizzle = const_swizzle;
823
824 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
825 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
826 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
827
828 /* Rewrite offsets up to and excluding inst. */
829 for (inst = arl->Next; inst != end; inst = inst->Next) {
830 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
831
832 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
833 if (inst->U.I.SrcReg[i].RelAddr)
834 inst->U.I.SrcReg[i].Index -= min_offset;
835 }
836 }
837
rc_emulate_negative_addressing(struct radeon_compiler * compiler,void * user)838 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
839 {
840 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
841 struct rc_instruction *inst, *lastARL = NULL;
842 int min_offset = 0;
843
844 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
845 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
846
847 if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
848 if (lastARL != NULL && min_offset < 0)
849 transform_negative_addressing(c, lastARL, inst, min_offset);
850
851 lastARL = inst;
852 min_offset = 0;
853 continue;
854 }
855
856 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
857 if (inst->U.I.SrcReg[i].RelAddr &&
858 inst->U.I.SrcReg[i].Index < 0) {
859 /* ARL must precede any indirect addressing. */
860 if (!lastARL) {
861 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
862 return;
863 }
864
865 if (inst->U.I.SrcReg[i].Index < min_offset)
866 min_offset = inst->U.I.SrcReg[i].Index;
867 }
868 }
869 }
870
871 if (lastARL != NULL && min_offset < 0)
872 transform_negative_addressing(c, lastARL, inst, min_offset);
873 }
874
/**
 * Swizzle capabilities installed for vertex programs by
 * r3xx_compile_vertex_program(). IsNative accepts every swizzle
 * (see swizzle_is_native), so no Split callback is provided.
 */
const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
	.IsNative = &swizzle_is_native,
	.Split = 0 /* should never be called */
};
879
/**
 * Compile a vertex program for R3xx-R5xx hardware.
 *
 * Builds the list of compiler passes (some enabled only for R500, only for
 * pre-R500, or only when optimizations are enabled), runs them through
 * rc_run_compiler(), and finally copies InputsRead, OutputsWritten and the
 * constant list from the program into c->code.
 */
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
{
	int is_r500 = c->Base.is_r500;
	int opt = !c->Base.disable_optimizations;

	/* Lists of instruction transformations. */
	struct radeon_program_transformation alu_rewrite_r500[] = {
		{ &r300_transform_vertex_alu, 0 },
		{ &r300_transform_trig_scale_vertex, 0 },
		{ 0, 0 }
	};

	struct radeon_program_transformation alu_rewrite_r300[] = {
		{ &r300_transform_vertex_alu, 0 },
		{ &r300_transform_trig_simple, 0 },
		{ 0, 0 }
	};

	/* Note: These passes have to be done separately from ALU rewrite,
	 * otherwise non-native ALU instructions with source conflicts
	 * or non-native modifiers will not be treated properly.
	 */
	struct radeon_program_transformation emulate_modifiers[] = {
		{ &transform_nonnative_modifiers, 0 },
		{ 0, 0 }
	};

	struct radeon_program_transformation resolve_src_conflicts[] = {
		{ &transform_source_conflicts, 0 },
		{ 0, 0 }
	};

	/* List of compiler passes, run in the order listed; an entry is
	 * skipped when its PREDICATE is 0. */
	struct radeon_compiler_pass vs_list[] = {
		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
		/* This pass must be done after optimizations. */
		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
		{NULL, 0, 0, NULL, NULL}
	};

	c->Base.type = RC_VERTEX_PROGRAM;
	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;

	rc_run_compiler(&c->Base, vs_list);

	/* Publish the results of the compilation in the code structure. */
	c->code->InputsRead = c->Base.Program.InputsRead;
	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
}
943