1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "radeon_compiler.h"
7
8 #include <stdbool.h>
9 #include <stdio.h>
10
11 #include "r300_reg.h"
12
13 #include "radeon_compiler_util.h"
14 #include "radeon_dataflow.h"
15 #include "radeon_list.h"
16 #include "radeon_program.h"
17 #include "radeon_program_alu.h"
18 #include "radeon_regalloc.h"
19 #include "radeon_remove_constants.h"
20 #include "radeon_swizzle.h"
21
22 #include "util/compiler.h"
23
/*
 * Take an already-setup and valid source, then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * x is the index into vpi->SrcReg of the source to borrow the register index,
 * register class and RelAddr bit from; y is the swizzle selector (for example
 * RC_SWIZZLE_ZERO) that is replicated into all four components.
 * RC_MASK_NONE is passed in the position where t_src() passes the negate mask.
 *
 * The expansion refers to `vp` and `vpi`, so both must be in scope at the
 * point of use.
 */
#define __CONST(x, y)                                                                              \
   (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), t_swizzle(y), t_swizzle(y), t_swizzle(y),    \
                    t_swizzle(y), t_src_class(vpi->SrcReg[x].File), RC_MASK_NONE) |                \
    (vpi->SrcReg[x].RelAddr << 4))
32
33 static unsigned long
t_dst_mask(unsigned int mask)34 t_dst_mask(unsigned int mask)
35 {
36 /* RC_MASK_* is equivalent to VSF_FLAG_* */
37 return mask & RC_MASK_XYZW;
38 }
39
40 static unsigned long
t_dst_class(rc_register_file file)41 t_dst_class(rc_register_file file)
42 {
43 switch (file) {
44 default:
45 fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
46 FALLTHROUGH;
47 case RC_FILE_TEMPORARY:
48 return PVS_DST_REG_TEMPORARY;
49 case RC_FILE_OUTPUT:
50 return PVS_DST_REG_OUT;
51 case RC_FILE_ADDRESS:
52 return PVS_DST_REG_A0;
53 }
54 }
55
56 static unsigned long
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)57 t_dst_index(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
58 {
59 if (dst->File == RC_FILE_OUTPUT)
60 return vp->outputs[dst->Index];
61
62 return dst->Index;
63 }
64
65 static unsigned long
t_src_class(rc_register_file file)66 t_src_class(rc_register_file file)
67 {
68 switch (file) {
69 default:
70 fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
71 FALLTHROUGH;
72 case RC_FILE_NONE:
73 case RC_FILE_TEMPORARY:
74 return PVS_SRC_REG_TEMPORARY;
75 case RC_FILE_INPUT:
76 return PVS_SRC_REG_INPUT;
77 case RC_FILE_CONSTANT:
78 return PVS_SRC_REG_CONSTANT;
79 }
80 }
81
82 static int
t_src_conflict(struct rc_src_register a,struct rc_src_register b)83 t_src_conflict(struct rc_src_register a, struct rc_src_register b)
84 {
85 unsigned long aclass = t_src_class(a.File);
86 unsigned long bclass = t_src_class(b.File);
87
88 if (aclass != bclass)
89 return 0;
90 if (aclass == PVS_SRC_REG_TEMPORARY)
91 return 0;
92
93 if (a.RelAddr || b.RelAddr)
94 return 1;
95 if (a.Index != b.Index)
96 return 1;
97
98 return 0;
99 }
100
/*
 * Translate a swizzle selector to its hardware encoding.
 *
 * This is a NOP: the RC_SWIZZLE_* values are identical to VSF_IN_COMPONENT_*.
 */
static inline unsigned long
t_swizzle(unsigned int swizzle)
{
   const unsigned long hw_swizzle = swizzle;

   return hw_swizzle;
}
107
108 static unsigned long
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)109 t_src_index(struct r300_vertex_program_code *vp, struct rc_src_register *src)
110 {
111 if (src->File == RC_FILE_INPUT) {
112 assert(vp->inputs[src->Index] != -1);
113 return vp->inputs[src->Index];
114 } else {
115 if (src->Index < 0) {
116 fprintf(stderr, "negative offsets for indirect addressing do not work.\n");
117 return 0;
118 }
119 return src->Index;
120 }
121 }
122
123 /* these two functions should probably be merged... */
124
125 static unsigned long
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)126 t_src(struct r300_vertex_program_code *vp, struct rc_src_register *src)
127 {
128 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
129 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
130 */
131 return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(GET_SWZ(src->Swizzle, 0)),
132 t_swizzle(GET_SWZ(src->Swizzle, 1)), t_swizzle(GET_SWZ(src->Swizzle, 2)),
133 t_swizzle(GET_SWZ(src->Swizzle, 3)), t_src_class(src->File),
134 src->Negate) |
135 (src->RelAddr << 4) | (src->Abs << 3);
136 }
137
138 static unsigned long
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)139 t_src_scalar(struct r300_vertex_program_code *vp, struct rc_src_register *src)
140 {
141 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
142 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
143 */
144 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
145
146 return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(swz), t_swizzle(swz), t_swizzle(swz),
147 t_swizzle(swz), t_src_class(src->File),
148 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
149 (src->RelAddr << 4) | (src->Abs << 3);
150 }
151
152 static int
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)153 valid_dst(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
154 {
155 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
156 return 0;
157 } else if (dst->File == RC_FILE_ADDRESS) {
158 assert(dst->Index == 0);
159 }
160
161 return 1;
162 }
163
164 static void
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)165 ei_vector1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
166 struct rc_sub_instruction *vpi, unsigned int *inst)
167 {
168 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
169 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
170 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
171 inst[1] = t_src(vp, &vpi->SrcReg[0]);
172 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
173 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
174 }
175
176 static void
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)177 ei_vector2(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
178 struct rc_sub_instruction *vpi, unsigned int *inst)
179 {
180 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
181 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
182 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
183 inst[1] = t_src(vp, &vpi->SrcReg[0]);
184 inst[2] = t_src(vp, &vpi->SrcReg[1]);
185 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
186 }
187
188 static void
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 ei_math1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
190 struct rc_sub_instruction *vpi, unsigned int *inst)
191 {
192 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 1, 0, t_dst_index(vp, &vpi->DstReg),
193 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
194 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
195 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
196 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
197 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
198 }
199
200 static void
ei_math1_select(struct r300_vertex_program_code * vp,unsigned math_mode,unsigned hw_opcode_ieee,unsigned hw_opcode_dx,unsigned hw_opcode_ff,struct rc_sub_instruction * vpi,unsigned int * inst)201 ei_math1_select(struct r300_vertex_program_code *vp,
202 unsigned math_mode,
203 unsigned hw_opcode_ieee,
204 unsigned hw_opcode_dx,
205 unsigned hw_opcode_ff,
206 struct rc_sub_instruction *vpi,
207 unsigned int *inst)
208 {
209 unsigned hw_opcode;
210 switch (math_mode) {
211 case RC_MATH_IEEE: hw_opcode = hw_opcode_ieee; break;
212 case RC_MATH_DX: hw_opcode = hw_opcode_dx; break;
213 case RC_MATH_FF: hw_opcode = hw_opcode_ff; break;
214 default:
215 unreachable();
216 }
217 ei_math1(vp, hw_opcode, vpi, inst);
218 }
219
220 static void
ei_cmp(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)221 ei_cmp(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
222 {
223 inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE, 0, 0, t_dst_index(vp, &vpi->DstReg),
224 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
225 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
226
227 /* Arguments with constant swizzles still count as a unique
228 * temporary, so we should make sure these arguments share a
229 * register index with one of the other arguments. */
230 for (unsigned i = 0; i < 3; i++) {
231 unsigned j = (i + 1) % 3;
232 if (vpi->SrcReg[i].File == RC_FILE_NONE &&
233 (vpi->SrcReg[j].File == RC_FILE_NONE || vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
234 vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
235 break;
236 }
237 }
238
239 inst[1] = t_src(vp, &vpi->SrcReg[0]);
240 inst[2] = t_src(vp, &vpi->SrcReg[2]);
241 inst[3] = t_src(vp, &vpi->SrcReg[1]);
242 }
243
244 static void
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)245 ei_lit(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
246 {
247 // LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
248
249 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 1, 0, t_dst_index(vp, &vpi->DstReg),
250 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
251 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
252 /* NOTE: Users swizzling might not work. */
253 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
254 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
255 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
256 PVS_SRC_SELECT_FORCE_0, // Z
257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
258 t_src_class(vpi->SrcReg[0].File),
259 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260 (vpi->SrcReg[0].RelAddr << 4);
261 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
262 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
263 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
264 PVS_SRC_SELECT_FORCE_0, // Z
265 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
266 t_src_class(vpi->SrcReg[0].File),
267 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
268 (vpi->SrcReg[0].RelAddr << 4);
269 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
270 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
271 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
272 PVS_SRC_SELECT_FORCE_0, // Z
273 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
274 t_src_class(vpi->SrcReg[0].File),
275 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
276 (vpi->SrcReg[0].RelAddr << 4);
277 }
278
279 static void
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)280 ei_mad(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
281 {
282 unsigned int i;
283 /* Remarks about hardware limitations of MAD
284 * (please preserve this comment, as this information is _NOT_
285 * in the documentation provided by AMD).
286 *
287 * As described in the documentation, MAD with three unique temporary
288 * source registers requires the use of the macro version.
289 *
290 * However (and this is not mentioned in the documentation), apparently
291 * the macro version is _NOT_ a full superset of the normal version.
292 * In particular, the macro version does not always work when relative
293 * addressing is used in the source operands.
294 *
295 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
296 * assembly shader path when using medium quality animations
297 * (i.e. animations with matrix blending instead of quaternion blending).
298 *
299 * Unfortunately, I (nha) have been unable to extract a Piglit regression
300 * test for this issue - for some reason, it is possible to have vertex
301 * programs whose prefix is *exactly* the same as the prefix of the
302 * offending program in Sauerbraten up to the offending instruction
303 * without causing any trouble.
304 *
305 * Bottom line: Only use the macro version only when really necessary;
306 * according to AMD docs, this should improve performance by one clock
307 * as a nice side bonus.
308 */
309 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
310 vpi->SrcReg[2].File == RC_FILE_TEMPORARY && vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
311 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
312 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
313 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 0, 1, t_dst_index(vp, &vpi->DstReg),
314 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
315 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
316 } else {
317 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 0, 0, t_dst_index(vp, &vpi->DstReg),
318 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
319 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
320
321 /* Arguments with constant swizzles still count as a unique
322 * temporary, so we should make sure these arguments share a
323 * register index with one of the other arguments. */
324 for (i = 0; i < 3; i++) {
325 unsigned int j;
326 if (vpi->SrcReg[i].File != RC_FILE_NONE)
327 continue;
328
329 for (j = 0; j < 3; j++) {
330 if (i != j) {
331 vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
332 break;
333 }
334 }
335 }
336 }
337 inst[1] = t_src(vp, &vpi->SrcReg[0]);
338 inst[2] = t_src(vp, &vpi->SrcReg[1]);
339 inst[3] = t_src(vp, &vpi->SrcReg[2]);
340 }
341
342 static void
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)343 ei_pow(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
344 {
345 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 1, 0, t_dst_index(vp, &vpi->DstReg),
346 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
347 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
348 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
349 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
350 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
351 }
352
353 static void
translate_vertex_program(struct radeon_compiler * c,void * user)354 translate_vertex_program(struct radeon_compiler *c, void *user)
355 {
356 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
357 struct rc_instruction *rci;
358
359 unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
360 unsigned loop_depth = 0;
361 bool last_input_read_at_loop_end = false;
362 bool last_pos_write_at_loop_end = false;
363
364 compiler->code->pos_end = 0; /* Not supported yet */
365 compiler->code->length = 0;
366 compiler->code->num_temporaries = 0;
367 compiler->code->last_input_read = 0;
368 compiler->code->last_pos_write = 0;
369
370 compiler->SetHwInputOutput(compiler);
371
372 for (rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions;
373 rci = rci->Next) {
374 struct rc_sub_instruction *vpi = &rci->U.I;
375 unsigned int *inst = compiler->code->body.d + compiler->code->length;
376 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
377
378 /* Skip instructions writing to non-existing destination */
379 if (!valid_dst(compiler->code, &vpi->DstReg))
380 continue;
381
382 if (info->HasDstReg) {
383 /* Neither is Saturate. */
384 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
385 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
386 "modifier (yet).\n");
387 }
388 }
389
390 if (compiler->code->length >= c->max_alu_insts * 4) {
391 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
392 return;
393 }
394
395 assert(compiler->Base.is_r500 ||
396 (vpi->Opcode != RC_OPCODE_SEQ && vpi->Opcode != RC_OPCODE_SNE));
397
398 switch (vpi->Opcode) {
399 case RC_OPCODE_ADD:
400 ei_vector2(compiler->code, VE_ADD, vpi, inst);
401 break;
402 case RC_OPCODE_ARL:
403 ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst);
404 break;
405 case RC_OPCODE_ARR:
406 ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst);
407 break;
408 case RC_OPCODE_COS:
409 ei_math1(compiler->code, ME_COS, vpi, inst);
410 break;
411 case RC_OPCODE_CMP:
412 ei_cmp(compiler->code, vpi, inst);
413 break;
414 case RC_OPCODE_DP4:
415 ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst);
416 break;
417 case RC_OPCODE_DST:
418 ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst);
419 break;
420 case RC_OPCODE_EX2:
421 ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst);
422 break;
423 case RC_OPCODE_EXP:
424 ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst);
425 break;
426 case RC_OPCODE_FRC:
427 ei_vector1(compiler->code, VE_FRACTION, vpi, inst);
428 break;
429 case RC_OPCODE_LG2:
430 ei_math1_select(compiler->code, compiler->Base.math_rules, ME_LOG_BASE2_IEEE,
431 ME_LOG_BASE2_FULL_DX, ME_LOG_BASE2_FULL_DX, vpi, inst);
432 break;
433 case RC_OPCODE_LIT:
434 ei_lit(compiler->code, vpi, inst);
435 break;
436 case RC_OPCODE_LOG:
437 ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst);
438 break;
439 case RC_OPCODE_MAD:
440 ei_mad(compiler->code, vpi, inst);
441 break;
442 case RC_OPCODE_MAX:
443 ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst);
444 break;
445 case RC_OPCODE_MIN:
446 ei_vector2(compiler->code, VE_MINIMUM, vpi, inst);
447 break;
448 case RC_OPCODE_MOV:
449 ei_vector1(compiler->code, VE_ADD, vpi, inst);
450 break;
451 case RC_OPCODE_MUL:
452 ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst);
453 break;
454 case RC_OPCODE_POW:
455 ei_pow(compiler->code, vpi, inst);
456 break;
457 case RC_OPCODE_RCP:
458 ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_IEEE,
459 ME_RECIP_DX, ME_RECIP_FF, vpi, inst);
460 break;
461 case RC_OPCODE_RSQ:
462 ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_SQRT_IEEE,
463 ME_RECIP_SQRT_DX, ME_RECIP_SQRT_FF, vpi, inst);
464 break;
465 case RC_OPCODE_SEQ:
466 ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst);
467 break;
468 case RC_OPCODE_SGE:
469 ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst);
470 break;
471 case RC_OPCODE_SIN:
472 ei_math1(compiler->code, ME_SIN, vpi, inst);
473 break;
474 case RC_OPCODE_SLT:
475 ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst);
476 break;
477 case RC_OPCODE_SNE:
478 ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst);
479 break;
480 case RC_OPCODE_BGNLOOP: {
481 if ((!compiler->Base.is_r500 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) ||
482 loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
483 rc_error(&compiler->Base, "Loops are nested too deep.");
484 return;
485 }
486 loops[loop_depth++] = ((compiler->code->length) / 4) + 1;
487 break;
488 }
489 case RC_OPCODE_ENDLOOP: {
490 unsigned int act_addr;
491 unsigned int last_addr;
492 unsigned int ret_addr;
493
494 if (loop_depth == 1 && last_input_read_at_loop_end) {
495 compiler->code->last_input_read = compiler->code->length / 4;
496 last_input_read_at_loop_end = false;
497 }
498 if (loop_depth == 1 && last_pos_write_at_loop_end) {
499 compiler->code->last_pos_write = compiler->code->length / 4;
500 last_pos_write_at_loop_end = false;
501 }
502
503 ret_addr = loops[--loop_depth];
504 act_addr = ret_addr - 1;
505 last_addr = (compiler->code->length / 4) - 1;
506
507 if (loop_depth >= R300_VS_MAX_FC_OPS) {
508 rc_error(&compiler->Base, "Too many flow control instructions.");
509 return;
510 }
511 /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
512 * we reduce it to half to avoid occasional hangs on RV516
513 * and downclocked RV530.
514 */
515 if (compiler->Base.is_r500) {
516 compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].lw =
517 R500_PVS_FC_ACT_ADRS(act_addr) | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080);
518 compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].uw =
519 R500_PVS_FC_LAST_INST(last_addr) | R500_PVS_FC_RTN_INST(ret_addr);
520 } else {
521 compiler->code->fc_op_addrs.r300[compiler->code->num_fc_ops] =
522 R300_PVS_FC_ACT_ADRS(act_addr) | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) |
523 R300_PVS_FC_LAST_INST(last_addr) | R300_PVS_FC_RTN_INST(ret_addr);
524 }
525 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
526 R300_PVS_FC_LOOP_INIT_VAL(0x0) | R300_PVS_FC_LOOP_STEP_VAL(0x1);
527 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(compiler->code->num_fc_ops);
528 compiler->code->num_fc_ops++;
529
530 break;
531 }
532
533 case RC_ME_PRED_SET_CLR:
534 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
535 break;
536
537 case RC_ME_PRED_SET_INV:
538 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
539 break;
540
541 case RC_ME_PRED_SET_POP:
542 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
543 break;
544
545 case RC_ME_PRED_SET_RESTORE:
546 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
547 break;
548
549 case RC_ME_PRED_SEQ:
550 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
551 break;
552
553 case RC_ME_PRED_SNEQ:
554 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
555 break;
556
557 case RC_VE_PRED_SNEQ_PUSH:
558 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, vpi, inst);
559 break;
560
561 default:
562 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
563 return;
564 }
565
566 if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
567 inst[0] |= (PVS_DST_PRED_ENABLE_MASK << PVS_DST_PRED_ENABLE_SHIFT);
568 if (vpi->DstReg.Pred == RC_PRED_SET) {
569 inst[0] |= (PVS_DST_PRED_SENSE_MASK << PVS_DST_PRED_SENSE_SHIFT);
570 }
571 }
572
573 /* Update the number of temporaries. */
574 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
575 vpi->DstReg.Index >= compiler->code->num_temporaries)
576 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
577
578 /* last instruction that writes position */
579 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
580 t_dst_index(compiler->code, &vpi->DstReg) == 0) {
581 if (loop_depth == 0)
582 compiler->code->last_pos_write = compiler->code->length / 4;
583 else
584 last_pos_write_at_loop_end = true;
585 }
586
587 for (unsigned i = 0; i < info->NumSrcRegs; i++) {
588 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
589 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
590 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
591 if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
592 if (loop_depth == 0)
593 compiler->code->last_input_read = compiler->code->length / 4;
594 else
595 last_input_read_at_loop_end = true;
596 }
597 }
598
599 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
600 rc_error(&compiler->Base, "Too many temporaries.\n");
601 return;
602 }
603
604 compiler->code->length += 4;
605
606 if (compiler->Base.Error)
607 return;
608 }
609 }
610
/* Per-temporary bookkeeping consumed by get_reg(). */
struct temporary_allocation {
   unsigned int Allocated : 1; /* set by get_reg() once a hardware temporary was chosen */
   unsigned int HwTemp : 15;   /* hardware temporary index chosen by get_reg() */
   /* NOTE(review): not referenced by the code visible here; presumably the
    * last instruction that reads this temporary - confirm before relying on it. */
   struct rc_instruction *LastRead;
};
616
617 static int
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)618 get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
619 unsigned int orig)
620 {
621 if (!ta[orig].Allocated) {
622 int j;
623 for (j = 0; j < c->max_temp_regs; ++j) {
624 if (!hwtemps[j])
625 break;
626 }
627 ta[orig].Allocated = 1;
628 ta[orig].HwTemp = j;
629 hwtemps[ta[orig].HwTemp] = true;
630 }
631
632 return ta[orig].HwTemp;
633 }
634
635 static void
allocate_temporary_registers(struct radeon_compiler * c,void * user)636 allocate_temporary_registers(struct radeon_compiler *c, void *user)
637 {
638 unsigned int node_count, node_index;
639 struct ra_class **node_classes;
640 struct rc_list *var_ptr;
641 struct rc_list *variables;
642 struct ra_graph *graph;
643 const struct rc_regalloc_state *ra_state = c->regalloc_state;
644
645 rc_recompute_ips(c);
646
647 /* Get list of program variables */
648 variables = rc_get_variables(c);
649 node_count = rc_list_count(variables);
650 node_classes = memory_pool_malloc(&c->Pool, node_count * sizeof(struct ra_class *));
651
652 for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
653 unsigned int class_index = 0;
654 int index;
655 /* Compute the live intervals */
656 rc_variable_compute_live_intervals(var_ptr->Item);
657 unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
658 index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
659 if (index > -1) {
660 class_index = c->regalloc_state->class_list[index].ID;
661 } else {
662 rc_error(c, "Could not find class for index=%u mask=%u\n",
663 ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
664 }
665 node_classes[node_index] = ra_state->classes[class_index];
666 }
667
668 graph = ra_alloc_interference_graph(ra_state->regs, node_count);
669
670 for (node_index = 0; node_index < node_count; node_index++) {
671 ra_set_node_class(graph, node_index, node_classes[node_index]);
672 }
673
674 rc_build_interference_graph(graph, variables);
675
676 if (!ra_allocate(graph)) {
677 rc_error(c, "Ran out of hardware temporaries\n");
678 ralloc_free(graph);
679 return;
680 }
681
682 /* Rewrite the registers */
683 for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
684 int reg = ra_get_node_reg(graph, node_index);
685 unsigned int writemask = reg_get_writemask(reg);
686 unsigned int index = reg_get_index(reg);
687 struct rc_variable *var = var_ptr->Item;
688
689 rc_variable_change_dst(var, index, writemask);
690 }
691
692 ralloc_free(graph);
693 }
694
695 /**
696 * Vertex engine cannot read two inputs or two constants at the same time.
697 * Introduce intermediate MOVs to temporary registers to account for this.
698 */
699 static int
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)700 transform_source_conflicts(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
701 {
702 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
703
704 if (opcode->NumSrcRegs == 3) {
705 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) ||
706 t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
707 int tmpreg = rc_find_free_temporary(c);
708 struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
709 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
710 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
711 inst_mov->U.I.DstReg.Index = tmpreg;
712 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
713 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
714 inst_mov->U.I.SrcReg[0].Negate = 0;
715 inst_mov->U.I.SrcReg[0].Abs = 0;
716
717 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
718 inst->U.I.SrcReg[2].Index = tmpreg;
719 inst->U.I.SrcReg[2].RelAddr = false;
720 }
721 }
722
723 if (opcode->NumSrcRegs >= 2) {
724 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
725 int tmpreg = rc_find_free_temporary(c);
726 struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
727 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
728 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
729 inst_mov->U.I.DstReg.Index = tmpreg;
730 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
731 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
732 inst_mov->U.I.SrcReg[0].Negate = 0;
733 inst_mov->U.I.SrcReg[0].Abs = 0;
734
735 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
736 inst->U.I.SrcReg[1].Index = tmpreg;
737 inst->U.I.SrcReg[1].RelAddr = false;
738 }
739 }
740
741 return 1;
742 }
743
744 static void
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)745 rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
746 {
747 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
748 int i;
749
750 for (i = 0; i < 32; ++i) {
751 if ((compiler->RequiredOutputs & (1U << i)) &&
752 !(compiler->Base.Program.OutputsWritten & (1U << i))) {
753 struct rc_instruction *inst =
754 rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
755 inst->U.I.Opcode = RC_OPCODE_MOV;
756
757 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
758 inst->U.I.DstReg.Index = i;
759 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
760
761 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
762 inst->U.I.SrcReg[0].Index = 0;
763 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
764
765 compiler->Base.Program.OutputsWritten |= 1U << i;
766 }
767 }
768 }
769
770 static int
swizzle_is_native(rc_opcode opcode,struct rc_src_register reg)771 swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
772 {
773 (void)opcode;
774 (void)reg;
775
776 return 1;
777 }
778
/*
 * Swizzle capabilities of the vertex program back-end. IsNative accepts every
 * swizzle (see swizzle_is_native), which is why no Split callback is
 * provided.
 */
const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
   .IsNative = &swizzle_is_native, .Split = NULL /* should never be called */
};
782
/*
 * Compile a vertex program: run the vertex shader pass list on c->Base and
 * copy the resulting InputsRead / OutputsWritten masks and the constant list
 * into c->code.
 */
void
r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
{
   int is_r500 = c->Base.is_r500;
   int opt = !c->Base.disable_optimizations;
   bool debug = c->Base.Debug & RC_DBG_LOG;

   /* Lists of instruction transformations. */
   struct radeon_program_transformation alu_rewrite[] = {{&r300_transform_vertex_alu, NULL},
                                                         {NULL, NULL}};

   struct radeon_program_transformation resolve_src_conflicts[] = {
      {&transform_source_conflicts, NULL},
      {NULL, NULL}};

   /* List of compiler passes, terminated by an all-NULL/zero entry.
    * NOTE(review): PREDICATE presumably decides whether a pass runs at all, so
    * `opt` passes are skipped when optimizations are disabled, control flow
    * lowering only happens on R500 and the machine code dump only with
    * RC_DBG_LOG - confirm against rc_run_compiler(). */
   struct radeon_compiler_pass vs_list[] = {
      /* clang-format off */
      /* NAME                          DUMP PREDICATE FUNCTION                      PARAM */
      {"add artificial outputs",       0,   1,        rc_vs_add_artificial_outputs, NULL},
      {"native rewrite",               1,   1,        rc_local_transform,           alu_rewrite},
      {"unused channels",              1,   opt,      rc_mark_unused_channels,      NULL},
      {"dataflow optimize",            1,   opt,      rc_optimize,                  NULL},
      {"dead constants",               1,   1,        rc_remove_unused_constants,   &c->code->constants_remap_table},
      /* This pass must be done after optimizations. */
      {"source conflict resolve",      1,   1,        rc_local_transform,           resolve_src_conflicts},
      {"register allocation",          1,   opt,      allocate_temporary_registers, NULL},
      {"lower control flow opcodes",   1,   is_r500,  rc_vert_fc,                   NULL},
      {"final code validation",        0,   1,        rc_validate_final_shader,     NULL},
      {"machine code generation",      0,   1,        translate_vertex_program,     NULL},
      {"dump machine code",            0,   debug,    r300_vertex_program_dump,     NULL},
      {NULL, 0, 0, NULL, NULL}};
   /* clang-format on */

   c->Base.type = RC_VERTEX_PROGRAM;
   c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;

   rc_run_compiler(&c->Base, vs_list);

   /* Publish the results of compilation in the hardware code structure. */
   c->code->InputsRead = c->Base.Program.InputsRead;
   c->code->OutputsWritten = c->Base.Program.OutputsWritten;
   rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
}
826