/*
 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE. */

#include "radeon_compiler.h"

#include <stdbool.h>
#include <stdio.h>

#include "r300_reg.h"

#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_program.h"
#include "radeon_program_alu.h"
#include "radeon_swizzle.h"
#include "radeon_remove_constants.h"
#include "radeon_regalloc.h"
#include "radeon_list.h"

#include "util/compiler.h"

/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 */
#define __CONST(x, y) \
    (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
                     t_swizzle(y), \
                     t_swizzle(y), \
                     t_swizzle(y), \
                     t_swizzle(y), \
                     t_src_class(vpi->SrcReg[x].File), \
                     RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))

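/*
 * The t_* helpers below translate the register compiler's rc_* register
 * files, masks and swizzles into the PVS hardware encodings consumed by
 * PVS_OP_DST_OPERAND() and PVS_SRC_OPERAND().
 */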
static unsigned long t_dst_mask(unsigned int mask)
{
    /* RC_MASK_* is equivalent to VSF_FLAG_* */
    return mask & RC_MASK_XYZW;
}

static unsigned long t_dst_class(rc_register_file file)
{
    switch (file) {
    default:
        fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
        FALLTHROUGH;
    case RC_FILE_TEMPORARY:
        return PVS_DST_REG_TEMPORARY;
    case RC_FILE_OUTPUT:
        return PVS_DST_REG_OUT;
    case RC_FILE_ADDRESS:
        return PVS_DST_REG_A0;
    }
}

static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
                                 struct rc_dst_register *dst)
{
    if (dst->File == RC_FILE_OUTPUT)
        return vp->outputs[dst->Index];

    return dst->Index;
}

static unsigned long t_src_class(rc_register_file file)
{
    switch (file) {
    default:
        fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
        FALLTHROUGH;
    case RC_FILE_NONE:
    case RC_FILE_TEMPORARY:
        return PVS_SRC_REG_TEMPORARY;
    case RC_FILE_INPUT:
        return PVS_SRC_REG_INPUT;
    case RC_FILE_CONSTANT:
        return PVS_SRC_REG_CONSTANT;
    }
}

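/*
 * Report whether two sources cannot be fetched in the same instruction:
 * they belong to the same non-temporary register class (input or constant)
 * and either uses relative addressing or they address different indices.
 */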
static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
{
    unsigned long aclass = t_src_class(a.File);
    unsigned long bclass = t_src_class(b.File);

    if (aclass != bclass)
        return 0;
    if (aclass == PVS_SRC_REG_TEMPORARY)
        return 0;

    if (a.RelAddr || b.RelAddr)
        return 1;
    if (a.Index != b.Index)
        return 1;

    return 0;
}

static inline unsigned long t_swizzle(unsigned int swizzle)
{
    /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
    return swizzle;
}

static unsigned long t_src_index(struct r300_vertex_program_code *vp,
                                 struct rc_src_register *src)
{
    if (src->File == RC_FILE_INPUT) {
        assert(vp->inputs[src->Index] != -1);
        return vp->inputs[src->Index];
    } else {
        if (src->Index < 0) {
            fprintf(stderr,
                    "negative offsets for indirect addressing do not work.\n");
            return 0;
        }
        return src->Index;
    }
}

/* these two functions should probably be merged... */

static unsigned long t_src(struct r300_vertex_program_code *vp,
                           struct rc_src_register *src)
{
    /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
     * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
     */
    return PVS_SRC_OPERAND(t_src_index(vp, src),
                           t_swizzle(GET_SWZ(src->Swizzle, 0)),
                           t_swizzle(GET_SWZ(src->Swizzle, 1)),
                           t_swizzle(GET_SWZ(src->Swizzle, 2)),
                           t_swizzle(GET_SWZ(src->Swizzle, 3)),
                           t_src_class(src->File),
                           src->Negate) |
           (src->RelAddr << 4) | (src->Abs << 3);
}

static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
                                  struct rc_src_register *src)
{
    /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
     * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
     */
    unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);

    return PVS_SRC_OPERAND(t_src_index(vp, src),
                           t_swizzle(swz),
                           t_swizzle(swz),
                           t_swizzle(swz),
                           t_swizzle(swz),
                           t_src_class(src->File),
                           src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
           (src->RelAddr << 4) | (src->Abs << 3);
}

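/*
 * A destination is invalid when it writes to an output that has no
 * hardware output assigned; translate_vertex_program() skips instructions
 * with invalid destinations.
 */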
static int valid_dst(struct r300_vertex_program_code *vp,
                     struct rc_dst_register *dst)
{
    if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
        return 0;
    } else if (dst->File == RC_FILE_ADDRESS) {
        assert(dst->Index == 0);
    }

    return 1;
}

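/*
 * Each ei_* emitter writes one hardware instruction as four dwords:
 * inst[0] carries the opcode and destination, inst[1..3] carry the three
 * source operands. Source slots the opcode does not use are filled with a
 * constant-zero operand via __CONST().
 */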
static void ei_vector1(struct r300_vertex_program_code *vp,
                       unsigned int hw_opcode,
                       struct rc_sub_instruction *vpi,
                       unsigned int *inst)
{
    inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                 0,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    inst[1] = t_src(vp, &vpi->SrcReg[0]);
    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}

static void ei_vector2(struct r300_vertex_program_code *vp,
                       unsigned int hw_opcode,
                       struct rc_sub_instruction *vpi,
                       unsigned int *inst)
{
    inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                 0,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    inst[1] = t_src(vp, &vpi->SrcReg[0]);
    inst[2] = t_src(vp, &vpi->SrcReg[1]);
    inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
}

static void ei_math1(struct r300_vertex_program_code *vp,
                     unsigned int hw_opcode,
                     struct rc_sub_instruction *vpi,
                     unsigned int *inst)
{
    inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                 1,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}

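/*
 * CMP selects SrcReg[1] where SrcReg[0] is negative and SrcReg[2]
 * otherwise. VE_COND_MUX_GTE tests for >= 0, so the last two operands
 * are emitted in swapped order.
 */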
static void ei_cmp(struct r300_vertex_program_code *vp,
                   struct rc_sub_instruction *vpi,
                   unsigned int *inst)
{
    inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
                                 0,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);

    /* Arguments with constant swizzles still count as a unique
     * temporary, so we should make sure these arguments share a
     * register index with one of the other arguments. */
    for (unsigned i = 0; i < 3; i++) {
        unsigned j = (i + 1) % 3;
        if (vpi->SrcReg[i].File == RC_FILE_NONE &&
            (vpi->SrcReg[j].File == RC_FILE_NONE ||
             vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
            vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
            break;
        }
    }

    inst[1] = t_src(vp, &vpi->SrcReg[0]);
    inst[2] = t_src(vp, &vpi->SrcReg[2]);
    inst[3] = t_src(vp, &vpi->SrcReg[1]);
}

static void ei_lit(struct r300_vertex_program_code *vp,
                   struct rc_sub_instruction *vpi,
                   unsigned int *inst)
{
    //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}

    inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
                                 1,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    /* NOTE: User swizzling might not work. */
    inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                              PVS_SRC_SELECT_FORCE_0, // Z
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                              t_src_class(vpi->SrcReg[0].File),
                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
              (vpi->SrcReg[0].RelAddr << 4);
    inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                              PVS_SRC_SELECT_FORCE_0, // Z
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                              t_src_class(vpi->SrcReg[0].File),
                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
              (vpi->SrcReg[0].RelAddr << 4);
    inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                              PVS_SRC_SELECT_FORCE_0, // Z
                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                              t_src_class(vpi->SrcReg[0].File),
                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
              (vpi->SrcReg[0].RelAddr << 4);
}

static void ei_mad(struct r300_vertex_program_code *vp,
                   struct rc_sub_instruction *vpi,
                   unsigned int *inst)
{
    unsigned int i;
    /* Remarks about hardware limitations of MAD
     * (please preserve this comment, as this information is _NOT_
     * in the documentation provided by AMD).
     *
     * As described in the documentation, MAD with three unique temporary
     * source registers requires the use of the macro version.
     *
     * However (and this is not mentioned in the documentation), apparently
     * the macro version is _NOT_ a full superset of the normal version.
     * In particular, the macro version does not always work when relative
     * addressing is used in the source operands.
     *
     * This limitation caused incorrect rendering in Sauerbraten's OpenGL
     * assembly shader path when using medium quality animations
     * (i.e. animations with matrix blending instead of quaternion blending).
     *
     * Unfortunately, I (nha) have been unable to extract a Piglit regression
     * test for this issue - for some reason, it is possible to have vertex
     * programs whose prefix is *exactly* the same as the prefix of the
     * offending program in Sauerbraten up to the offending instruction
     * without causing any trouble.
     *
     * Bottom line: Only use the macro version when really necessary;
     * according to AMD docs, this should improve performance by one clock
     * as a nice side bonus.
     */
    if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
        vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
        vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
        vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
        vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
        vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
        inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
                                     0,
                                     1,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File),
                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    } else {
        inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
                                     0,
                                     0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File),
                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);

        /* Arguments with constant swizzles still count as a unique
         * temporary, so we should make sure these arguments share a
         * register index with one of the other arguments. */
        for (i = 0; i < 3; i++) {
            unsigned int j;
            if (vpi->SrcReg[i].File != RC_FILE_NONE)
                continue;

            for (j = 0; j < 3; j++) {
                if (i != j) {
                    vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
                    break;
                }
            }
        }
    }
    inst[1] = t_src(vp, &vpi->SrcReg[0]);
    inst[2] = t_src(vp, &vpi->SrcReg[1]);
    inst[3] = t_src(vp, &vpi->SrcReg[2]);
}

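/*
 * POW is a math-engine operation: the base goes into the first source
 * slot and the exponent into the third, with a constant zero in between.
 */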
static void ei_pow(struct r300_vertex_program_code *vp,
                   struct rc_sub_instruction *vpi,
                   unsigned int *inst)
{
    inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
                                 1,
                                 0,
                                 t_dst_index(vp, &vpi->DstReg),
                                 t_dst_mask(vpi->DstReg.WriteMask),
                                 t_dst_class(vpi->DstReg.File),
                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
}

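/*
 * Main code generation pass: walk the instruction list, emit four dwords
 * per instruction into code->body, and track the number of temporaries,
 * the last input read, the last position write and the loop flow-control
 * records along the way.
 */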
static void translate_vertex_program(struct radeon_compiler *c, void *user)
{
    struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
    struct rc_instruction *rci;

    unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
    unsigned loop_depth = 0;
    bool last_input_read_at_loop_end = false;
    bool last_pos_write_at_loop_end = false;

    compiler->code->pos_end = 0; /* Not supported yet */
    compiler->code->length = 0;
    compiler->code->num_temporaries = 0;
    compiler->code->last_input_read = 0;
    compiler->code->last_pos_write = 0;

    compiler->SetHwInputOutput(compiler);

    for (rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
        struct rc_sub_instruction *vpi = &rci->U.I;
        unsigned int *inst = compiler->code->body.d + compiler->code->length;
        const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);

        /* Skip instructions writing to non-existing destination */
        if (!valid_dst(compiler->code, &vpi->DstReg))
            continue;

        if (info->HasDstReg) {
            /* Saturate is not supported on anything but r500 (yet). */
            if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
                rc_error(&compiler->Base, "Vertex program does not support the Saturate "
                         "modifier (yet).\n");
            }
        }

        if (compiler->code->length >= c->max_alu_insts * 4) {
            rc_error(&compiler->Base, "Vertex program has too many instructions\n");
            return;
        }

        assert(compiler->Base.is_r500 ||
               (vpi->Opcode != RC_OPCODE_SEQ &&
                vpi->Opcode != RC_OPCODE_SNE));

        switch (vpi->Opcode) {
        case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
        case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
        case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
        case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
        case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
        case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
        case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
        case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
        case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
        case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
        case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
        case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
        case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
        case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
        case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
        case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
        case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
        case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
        case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
        case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
        case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
        case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
        case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
        case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
        case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
        case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
        case RC_OPCODE_BGNLOOP:
        {
            if ((!compiler->Base.is_r500
                 && loop_depth >= R300_VS_MAX_LOOP_DEPTH)
                || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
                rc_error(&compiler->Base,
                         "Loops are nested too deep.");
                return;
            }
            loops[loop_depth++] = ((compiler->code->length) / 4) + 1;
            break;
        }
        case RC_OPCODE_ENDLOOP:
        {
            unsigned int act_addr;
            unsigned int last_addr;
            unsigned int ret_addr;

            if (loop_depth == 1 && last_input_read_at_loop_end) {
                compiler->code->last_input_read = compiler->code->length / 4;
                last_input_read_at_loop_end = false;
            }
            if (loop_depth == 1 && last_pos_write_at_loop_end) {
                compiler->code->last_pos_write = compiler->code->length / 4;
                last_pos_write_at_loop_end = false;
            }

            ret_addr = loops[--loop_depth];
            act_addr = ret_addr - 1;
            last_addr = (compiler->code->length / 4) - 1;

            if (loop_depth >= R300_VS_MAX_FC_OPS) {
                rc_error(&compiler->Base,
                         "Too many flow control instructions.");
                return;
            }
            /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
             * we reduce it to half to avoid occasional hangs on RV516
             * and downclocked RV530.
             */
            if (compiler->Base.is_r500) {
                compiler->code->fc_op_addrs.r500
                    [compiler->code->num_fc_ops].lw =
                        R500_PVS_FC_ACT_ADRS(act_addr)
                        | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080);
                compiler->code->fc_op_addrs.r500
                    [compiler->code->num_fc_ops].uw =
                        R500_PVS_FC_LAST_INST(last_addr)
                        | R500_PVS_FC_RTN_INST(ret_addr);
            } else {
                compiler->code->fc_op_addrs.r300
                    [compiler->code->num_fc_ops] =
                        R300_PVS_FC_ACT_ADRS(act_addr)
                        | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
                        | R300_PVS_FC_LAST_INST(last_addr)
                        | R300_PVS_FC_RTN_INST(ret_addr);
            }
            compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
                R300_PVS_FC_LOOP_INIT_VAL(0x0)
                | R300_PVS_FC_LOOP_STEP_VAL(0x1);
            compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
                compiler->code->num_fc_ops);
            compiler->code->num_fc_ops++;

            break;
        }

        case RC_ME_PRED_SET_CLR:
            ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
            break;

        case RC_ME_PRED_SET_INV:
            ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
            break;

        case RC_ME_PRED_SET_POP:
            ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
            break;

        case RC_ME_PRED_SET_RESTORE:
            ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
            break;

        case RC_ME_PRED_SEQ:
            ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
            break;

        case RC_ME_PRED_SNEQ:
            ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
            break;

        case RC_VE_PRED_SNEQ_PUSH:
            ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
                       vpi, inst);
            break;

        default:
            rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
            return;
        }

        if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
            inst[0] |= (PVS_DST_PRED_ENABLE_MASK
                        << PVS_DST_PRED_ENABLE_SHIFT);
            if (vpi->DstReg.Pred == RC_PRED_SET) {
                inst[0] |= (PVS_DST_PRED_SENSE_MASK
                            << PVS_DST_PRED_SENSE_SHIFT);
            }
        }

        /* Update the number of temporaries. */
        if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
            vpi->DstReg.Index >= compiler->code->num_temporaries)
            compiler->code->num_temporaries = vpi->DstReg.Index + 1;

        /* last instruction that writes position */
        if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
            t_dst_index(compiler->code, &vpi->DstReg) == 0) {
            if (loop_depth == 0)
                compiler->code->last_pos_write = compiler->code->length / 4;
            else
                last_pos_write_at_loop_end = true;
        }

        for (unsigned i = 0; i < info->NumSrcRegs; i++) {
            if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
                vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
                compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
            if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
                if (loop_depth == 0)
                    compiler->code->last_input_read = compiler->code->length / 4;
                else
                    last_input_read_at_loop_end = true;
            }
        }

        if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
            rc_error(&compiler->Base, "Too many temporaries.\n");
            return;
        }

        compiler->code->length += 4;

        if (compiler->Base.Error)
            return;
    }
}

struct temporary_allocation {
    unsigned int Allocated:1;
    unsigned int HwTemp:15;
    struct rc_instruction *LastRead;
};

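/*
 * Return the hardware temporary assigned to 'orig', grabbing the first
 * free hardware register on first use.
 */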
static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
                   unsigned int orig)
{
    if (!ta[orig].Allocated) {
        int j;
        for (j = 0; j < c->max_temp_regs; ++j) {
            if (!hwtemps[j])
                break;
        }
        ta[orig].Allocated = 1;
        ta[orig].HwTemp = j;
        hwtemps[ta[orig].HwTemp] = true;
    }

    return ta[orig].HwTemp;
}

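/*
 * Allocate hardware temporaries by graph coloring: compute live intervals
 * for each program variable, build an interference graph through the
 * shared ra_* infrastructure, and rewrite the destination registers with
 * whatever the allocator picked.
 */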
static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
{
    unsigned int node_count, node_index;
    struct ra_class **node_classes;
    struct rc_list *var_ptr;
    struct rc_list *variables;
    struct ra_graph *graph;
    const struct rc_regalloc_state *ra_state = c->regalloc_state;

    rc_recompute_ips(c);

    /* Get list of program variables */
    variables = rc_get_variables(c);
    node_count = rc_list_count(variables);
    node_classes = memory_pool_malloc(&c->Pool,
                                      node_count * sizeof(struct ra_class *));

    for (var_ptr = variables, node_index = 0; var_ptr;
         var_ptr = var_ptr->Next, node_index++) {
        unsigned int class_index = 0;
        int index;
        /* Compute the live intervals */
        rc_variable_compute_live_intervals(var_ptr->Item);
        unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
        index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
        if (index > -1) {
            class_index = c->regalloc_state->class_list[index].ID;
        } else {
            rc_error(c,
                     "Could not find class for index=%u mask=%u\n",
                     ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
        }
        node_classes[node_index] = ra_state->classes[class_index];
    }

    graph = ra_alloc_interference_graph(ra_state->regs, node_count);

    for (node_index = 0; node_index < node_count; node_index++) {
        ra_set_node_class(graph, node_index, node_classes[node_index]);
    }

    rc_build_interference_graph(graph, variables);

    if (!ra_allocate(graph)) {
        rc_error(c, "Ran out of hardware temporaries\n");
        ralloc_free(graph);
        return;
    }

    /* Rewrite the registers */
    for (var_ptr = variables, node_index = 0; var_ptr;
         var_ptr = var_ptr->Next, node_index++) {
        int reg = ra_get_node_reg(graph, node_index);
        unsigned int writemask = reg_get_writemask(reg);
        unsigned int index = reg_get_index(reg);
        struct rc_variable *var = var_ptr->Item;

        rc_variable_change_dst(var, index, writemask);
    }

    ralloc_free(graph);
}

/**
 * The vertex engine cannot read two inputs or two constants at the same time.
 * Introduce intermediate MOVs to temporary registers to account for this.
 */
static int transform_source_conflicts(struct radeon_compiler *c,
                                      struct rc_instruction *inst,
                                      void *unused)
{
    const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);

    if (opcode->NumSrcRegs == 3) {
        if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
            || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
            int tmpreg = rc_find_free_temporary(c);
            struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
            inst_mov->U.I.Opcode = RC_OPCODE_MOV;
            inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
            inst_mov->U.I.DstReg.Index = tmpreg;
            inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
            inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
            inst_mov->U.I.SrcReg[0].Negate = 0;
            inst_mov->U.I.SrcReg[0].Abs = 0;

            inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
            inst->U.I.SrcReg[2].Index = tmpreg;
            inst->U.I.SrcReg[2].RelAddr = false;
        }
    }

    if (opcode->NumSrcRegs >= 2) {
        if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
            int tmpreg = rc_find_free_temporary(c);
            struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
            inst_mov->U.I.Opcode = RC_OPCODE_MOV;
            inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
            inst_mov->U.I.DstReg.Index = tmpreg;
            inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
            inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
            inst_mov->U.I.SrcReg[0].Negate = 0;
            inst_mov->U.I.SrcReg[0].Abs = 0;

            inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
            inst->U.I.SrcReg[1].Index = tmpreg;
            inst->U.I.SrcReg[1].RelAddr = false;
        }
    }

    return 1;
}

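/*
 * Make sure every output listed in RequiredOutputs is actually written:
 * for each required but unwritten output, append a MOV from constant 0 so
 * that later passes and the hardware setup see a write to it.
 */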
static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
{
    struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
    int i;

    for (i = 0; i < 32; ++i) {
        if ((compiler->RequiredOutputs & (1U << i)) &&
            !(compiler->Base.Program.OutputsWritten & (1U << i))) {
            struct rc_instruction *inst =
                rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
            inst->U.I.Opcode = RC_OPCODE_MOV;

            inst->U.I.DstReg.File = RC_FILE_OUTPUT;
            inst->U.I.DstReg.Index = i;
            inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;

            inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
            inst->U.I.SrcReg[0].Index = 0;
            inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;

            compiler->Base.Program.OutputsWritten |= 1U << i;
        }
    }
}

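/*
 * The vertex engine can apply any swizzle directly on its source operands,
 * so every swizzle is reported as native and Split is never needed.
 */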
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
    (void) opcode;
    (void) reg;

    return 1;
}

const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
    .IsNative = &swizzle_is_native,
    .Split = NULL /* should never be called */
};

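/*
 * Compiler entry point: set up the instruction transformations and run the
 * vertex shader pass schedule defined in vs_list below.
 */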
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
{
    int is_r500 = c->Base.is_r500;
    int opt = !c->Base.disable_optimizations;

    /* Lists of instruction transformations. */
    struct radeon_program_transformation alu_rewrite[] = {
        { &r300_transform_vertex_alu, NULL },
        { NULL, NULL }
    };

    struct radeon_program_transformation resolve_src_conflicts[] = {
        { &transform_source_conflicts, NULL },
        { NULL, NULL }
    };

    /* List of compiler passes. */
    struct radeon_compiler_pass vs_list[] = {
        /* NAME                        DUMP PREDICATE FUNCTION                      PARAM */
        {"add artificial outputs",     0,   1,        rc_vs_add_artificial_outputs, NULL},
        {"native rewrite",             1,   1,        rc_local_transform,           alu_rewrite},
        {"unused channels",            1,   opt,      rc_mark_unused_channels,      NULL},
        {"dataflow optimize",          1,   opt,      rc_optimize,                  NULL},
        /* This pass must be done after optimizations. */
        {"source conflict resolve",    1,   1,        rc_local_transform,           resolve_src_conflicts},
        {"register allocation",        1,   opt,      allocate_temporary_registers, NULL},
        {"dead constants",             1,   1,        rc_remove_unused_constants,   &c->code->constants_remap_table},
        {"lower control flow opcodes", 1,   is_r500,  rc_vert_fc,                   NULL},
        {"final code validation",      0,   1,        rc_validate_final_shader,     NULL},
        {"machine code generation",    0,   1,        translate_vertex_program,     NULL},
        {"dump machine code",          0,   c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
        {NULL, 0, 0, NULL, NULL}
    };

    c->Base.type = RC_VERTEX_PROGRAM;
    c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;

    rc_run_compiler(&c->Base, vs_list);

    c->code->InputsRead = c->Base.Program.InputsRead;
    c->code->OutputsWritten = c->Base.Program.OutputsWritten;
    rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
}