• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "radeon_compiler.h"
7 
8 #include <stdbool.h>
9 #include <stdio.h>
10 
11 #include "r300_reg.h"
12 
13 #include "radeon_compiler_util.h"
14 #include "radeon_dataflow.h"
15 #include "radeon_list.h"
16 #include "radeon_program.h"
17 #include "radeon_program_alu.h"
18 #include "radeon_regalloc.h"
19 #include "radeon_remove_constants.h"
20 #include "radeon_swizzle.h"
21 
22 #include "util/compiler.h"
23 
/*
 * Build a source operand that yields a constant (ZERO or ONE) by reusing an
 * already valid source: the register index, register class and relative
 * addressing bit come from vpi->SrcReg[x], while all four swizzle selectors
 * are replaced by y and no channel is negated.
 *
 * Expects `vp` and `vpi` to be in scope at the expansion site.
 */
#define __CONST(x, y)                                                                              \
   (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),                                              \
                    t_swizzle(y), t_swizzle(y), t_swizzle(y), t_swizzle(y),                        \
                    t_src_class(vpi->SrcReg[x].File),                                              \
                    RC_MASK_NONE) |                                                                \
    (vpi->SrcReg[x].RelAddr << 4))
32 
33 static unsigned long
t_dst_mask(unsigned int mask)34 t_dst_mask(unsigned int mask)
35 {
36    /* RC_MASK_* is equivalent to VSF_FLAG_* */
37    return mask & RC_MASK_XYZW;
38 }
39 
40 static unsigned long
t_dst_class(rc_register_file file)41 t_dst_class(rc_register_file file)
42 {
43    switch (file) {
44    default:
45       fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
46       FALLTHROUGH;
47    case RC_FILE_TEMPORARY:
48       return PVS_DST_REG_TEMPORARY;
49    case RC_FILE_OUTPUT:
50       return PVS_DST_REG_OUT;
51    case RC_FILE_ADDRESS:
52       return PVS_DST_REG_A0;
53    }
54 }
55 
56 static unsigned long
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)57 t_dst_index(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
58 {
59    if (dst->File == RC_FILE_OUTPUT)
60       return vp->outputs[dst->Index];
61 
62    return dst->Index;
63 }
64 
65 static unsigned long
t_src_class(rc_register_file file)66 t_src_class(rc_register_file file)
67 {
68    switch (file) {
69    default:
70       fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
71       FALLTHROUGH;
72    case RC_FILE_NONE:
73    case RC_FILE_TEMPORARY:
74       return PVS_SRC_REG_TEMPORARY;
75    case RC_FILE_INPUT:
76       return PVS_SRC_REG_INPUT;
77    case RC_FILE_CONSTANT:
78       return PVS_SRC_REG_CONSTANT;
79    }
80 }
81 
82 static int
t_src_conflict(struct rc_src_register a,struct rc_src_register b)83 t_src_conflict(struct rc_src_register a, struct rc_src_register b)
84 {
85    unsigned long aclass = t_src_class(a.File);
86    unsigned long bclass = t_src_class(b.File);
87 
88    if (aclass != bclass)
89       return 0;
90    if (aclass == PVS_SRC_REG_TEMPORARY)
91       return 0;
92 
93    if (a.RelAddr || b.RelAddr)
94       return 1;
95    if (a.Index != b.Index)
96       return 1;
97 
98    return 0;
99 }
100 
/* Convert an RC swizzle selector into the hardware source component selector. */
static inline unsigned long
t_swizzle(unsigned int swizzle)
{
   /* Identity mapping: the Mesa RC_SWIZZLE_* values are all identical to VSF_IN_COMPONENT_*. */
   const unsigned long hw_select = swizzle;

   return hw_select;
}
107 
108 static unsigned long
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)109 t_src_index(struct r300_vertex_program_code *vp, struct rc_src_register *src)
110 {
111    if (src->File == RC_FILE_INPUT) {
112       assert(vp->inputs[src->Index] != -1);
113       return vp->inputs[src->Index];
114    } else {
115       if (src->Index < 0) {
116          fprintf(stderr, "negative offsets for indirect addressing do not work.\n");
117          return 0;
118       }
119       return src->Index;
120    }
121 }
122 
123 /* these two functions should probably be merged... */
124 
125 static unsigned long
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)126 t_src(struct r300_vertex_program_code *vp, struct rc_src_register *src)
127 {
128    /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
129     * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
130     */
131    return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(GET_SWZ(src->Swizzle, 0)),
132                           t_swizzle(GET_SWZ(src->Swizzle, 1)), t_swizzle(GET_SWZ(src->Swizzle, 2)),
133                           t_swizzle(GET_SWZ(src->Swizzle, 3)), t_src_class(src->File),
134                           src->Negate) |
135           (src->RelAddr << 4) | (src->Abs << 3);
136 }
137 
138 static unsigned long
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)139 t_src_scalar(struct r300_vertex_program_code *vp, struct rc_src_register *src)
140 {
141    /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
142     * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
143     */
144    unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
145 
146    return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(swz), t_swizzle(swz), t_swizzle(swz),
147                           t_swizzle(swz), t_src_class(src->File),
148                           src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
149           (src->RelAddr << 4) | (src->Abs << 3);
150 }
151 
152 static int
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)153 valid_dst(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
154 {
155    if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
156       return 0;
157    } else if (dst->File == RC_FILE_ADDRESS) {
158       assert(dst->Index == 0);
159    }
160 
161    return 1;
162 }
163 
164 static void
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)165 ei_vector1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
166            struct rc_sub_instruction *vpi, unsigned int *inst)
167 {
168    inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
169                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
170                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
171    inst[1] = t_src(vp, &vpi->SrcReg[0]);
172    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
173    inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
174 }
175 
176 static void
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)177 ei_vector2(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
178            struct rc_sub_instruction *vpi, unsigned int *inst)
179 {
180    inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
181                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
182                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
183    inst[1] = t_src(vp, &vpi->SrcReg[0]);
184    inst[2] = t_src(vp, &vpi->SrcReg[1]);
185    inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
186 }
187 
188 static void
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 ei_math1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
190          struct rc_sub_instruction *vpi, unsigned int *inst)
191 {
192    inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 1, 0, t_dst_index(vp, &vpi->DstReg),
193                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
194                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
195    inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
196    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
197    inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
198 }
199 
200 static void
ei_math1_select(struct r300_vertex_program_code * vp,unsigned math_mode,unsigned hw_opcode_ieee,unsigned hw_opcode_dx,unsigned hw_opcode_ff,struct rc_sub_instruction * vpi,unsigned int * inst)201 ei_math1_select(struct r300_vertex_program_code *vp,
202                 unsigned math_mode,
203                 unsigned hw_opcode_ieee,
204                 unsigned hw_opcode_dx,
205                 unsigned hw_opcode_ff,
206                 struct rc_sub_instruction *vpi,
207                 unsigned int *inst)
208 {
209    unsigned hw_opcode;
210    switch (math_mode) {
211    case RC_MATH_IEEE: hw_opcode = hw_opcode_ieee; break;
212    case RC_MATH_DX: hw_opcode = hw_opcode_dx; break;
213    case RC_MATH_FF: hw_opcode = hw_opcode_ff; break;
214    default:
215       unreachable();
216    }
217    ei_math1(vp, hw_opcode, vpi, inst);
218 }
219 
220 static void
ei_cmp(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)221 ei_cmp(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
222 {
223    inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE, 0, 0, t_dst_index(vp, &vpi->DstReg),
224                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
225                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
226 
227    /* Arguments with constant swizzles still count as a unique
228     * temporary, so we should make sure these arguments share a
229     * register index with one of the other arguments. */
230    for (unsigned i = 0; i < 3; i++) {
231       unsigned j = (i + 1) % 3;
232       if (vpi->SrcReg[i].File == RC_FILE_NONE &&
233           (vpi->SrcReg[j].File == RC_FILE_NONE || vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
234          vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
235          break;
236       }
237    }
238 
239    inst[1] = t_src(vp, &vpi->SrcReg[0]);
240    inst[2] = t_src(vp, &vpi->SrcReg[2]);
241    inst[3] = t_src(vp, &vpi->SrcReg[1]);
242 }
243 
244 static void
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)245 ei_lit(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
246 {
247    // LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
248 
249    inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 1, 0, t_dst_index(vp, &vpi->DstReg),
250                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
251                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
252    /* NOTE: Users swizzling might not work. */
253    inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
254                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
255                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
256                              PVS_SRC_SELECT_FORCE_0,                        // Z
257                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
258                              t_src_class(vpi->SrcReg[0].File),
259                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260              (vpi->SrcReg[0].RelAddr << 4);
261    inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
262                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
263                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
264                              PVS_SRC_SELECT_FORCE_0,                        // Z
265                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
266                              t_src_class(vpi->SrcReg[0].File),
267                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
268              (vpi->SrcReg[0].RelAddr << 4);
269    inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
270                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
271                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
272                              PVS_SRC_SELECT_FORCE_0,                        // Z
273                              t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
274                              t_src_class(vpi->SrcReg[0].File),
275                              vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
276              (vpi->SrcReg[0].RelAddr << 4);
277 }
278 
279 static void
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)280 ei_mad(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
281 {
282    unsigned int i;
283    /* Remarks about hardware limitations of MAD
284     * (please preserve this comment, as this information is _NOT_
285     * in the documentation provided by AMD).
286     *
287     * As described in the documentation, MAD with three unique temporary
288     * source registers requires the use of the macro version.
289     *
290     * However (and this is not mentioned in the documentation), apparently
291     * the macro version is _NOT_ a full superset of the normal version.
292     * In particular, the macro version does not always work when relative
293     * addressing is used in the source operands.
294     *
295     * This limitation caused incorrect rendering in Sauerbraten's OpenGL
296     * assembly shader path when using medium quality animations
297     * (i.e. animations with matrix blending instead of quaternion blending).
298     *
299     * Unfortunately, I (nha) have been unable to extract a Piglit regression
300     * test for this issue - for some reason, it is possible to have vertex
301     * programs whose prefix is *exactly* the same as the prefix of the
302     * offending program in Sauerbraten up to the offending instruction
303     * without causing any trouble.
304     *
305     * Bottom line: Only use the macro version only when really necessary;
306     * according to AMD docs, this should improve performance by one clock
307     * as a nice side bonus.
308     */
309    if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
310        vpi->SrcReg[2].File == RC_FILE_TEMPORARY && vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
311        vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
312        vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
313       inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 0, 1, t_dst_index(vp, &vpi->DstReg),
314                                    t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
315                                    vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
316    } else {
317       inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 0, 0, t_dst_index(vp, &vpi->DstReg),
318                                    t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
319                                    vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
320 
321       /* Arguments with constant swizzles still count as a unique
322        * temporary, so we should make sure these arguments share a
323        * register index with one of the other arguments. */
324       for (i = 0; i < 3; i++) {
325          unsigned int j;
326          if (vpi->SrcReg[i].File != RC_FILE_NONE)
327             continue;
328 
329          for (j = 0; j < 3; j++) {
330             if (i != j) {
331                vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
332                break;
333             }
334          }
335       }
336    }
337    inst[1] = t_src(vp, &vpi->SrcReg[0]);
338    inst[2] = t_src(vp, &vpi->SrcReg[1]);
339    inst[3] = t_src(vp, &vpi->SrcReg[2]);
340 }
341 
342 static void
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)343 ei_pow(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
344 {
345    inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 1, 0, t_dst_index(vp, &vpi->DstReg),
346                                 t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
347                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
348    inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
349    inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
350    inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
351 }
352 
353 static void
translate_vertex_program(struct radeon_compiler * c,void * user)354 translate_vertex_program(struct radeon_compiler *c, void *user)
355 {
356    struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
357    struct rc_instruction *rci;
358 
359    unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
360    unsigned loop_depth = 0;
361    bool last_input_read_at_loop_end = false;
362    bool last_pos_write_at_loop_end = false;
363 
364    compiler->code->pos_end = 0; /* Not supported yet */
365    compiler->code->length = 0;
366    compiler->code->num_temporaries = 0;
367    compiler->code->last_input_read = 0;
368    compiler->code->last_pos_write = 0;
369 
370    compiler->SetHwInputOutput(compiler);
371 
372    for (rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions;
373         rci = rci->Next) {
374       struct rc_sub_instruction *vpi = &rci->U.I;
375       unsigned int *inst = compiler->code->body.d + compiler->code->length;
376       const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
377 
378       /* Skip instructions writing to non-existing destination */
379       if (!valid_dst(compiler->code, &vpi->DstReg))
380          continue;
381 
382       if (info->HasDstReg) {
383          /* Neither is Saturate. */
384          if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
385             rc_error(&compiler->Base, "Vertex program does not support the Saturate "
386                                       "modifier (yet).\n");
387          }
388       }
389 
390       if (compiler->code->length >= c->max_alu_insts * 4) {
391          rc_error(&compiler->Base, "Vertex program has too many instructions\n");
392          return;
393       }
394 
395       assert(compiler->Base.is_r500 ||
396              (vpi->Opcode != RC_OPCODE_SEQ && vpi->Opcode != RC_OPCODE_SNE));
397 
398       switch (vpi->Opcode) {
399       case RC_OPCODE_ADD:
400          ei_vector2(compiler->code, VE_ADD, vpi, inst);
401          break;
402       case RC_OPCODE_ARL:
403          ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst);
404          break;
405       case RC_OPCODE_ARR:
406          ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst);
407          break;
408       case RC_OPCODE_COS:
409          ei_math1(compiler->code, ME_COS, vpi, inst);
410          break;
411       case RC_OPCODE_CMP:
412          ei_cmp(compiler->code, vpi, inst);
413          break;
414       case RC_OPCODE_DP4:
415          ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst);
416          break;
417       case RC_OPCODE_DST:
418          ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst);
419          break;
420       case RC_OPCODE_EX2:
421          ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst);
422          break;
423       case RC_OPCODE_EXP:
424          ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst);
425          break;
426       case RC_OPCODE_FRC:
427          ei_vector1(compiler->code, VE_FRACTION, vpi, inst);
428          break;
429       case RC_OPCODE_LG2:
430          ei_math1_select(compiler->code, compiler->Base.math_rules, ME_LOG_BASE2_IEEE,
431                          ME_LOG_BASE2_FULL_DX, ME_LOG_BASE2_FULL_DX, vpi, inst);
432          break;
433       case RC_OPCODE_LIT:
434          ei_lit(compiler->code, vpi, inst);
435          break;
436       case RC_OPCODE_LOG:
437          ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst);
438          break;
439       case RC_OPCODE_MAD:
440          ei_mad(compiler->code, vpi, inst);
441          break;
442       case RC_OPCODE_MAX:
443          ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst);
444          break;
445       case RC_OPCODE_MIN:
446          ei_vector2(compiler->code, VE_MINIMUM, vpi, inst);
447          break;
448       case RC_OPCODE_MOV:
449          ei_vector1(compiler->code, VE_ADD, vpi, inst);
450          break;
451       case RC_OPCODE_MUL:
452          ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst);
453          break;
454       case RC_OPCODE_POW:
455          ei_pow(compiler->code, vpi, inst);
456          break;
457       case RC_OPCODE_RCP:
458          ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_IEEE,
459                          ME_RECIP_DX, ME_RECIP_FF, vpi, inst);
460          break;
461       case RC_OPCODE_RSQ:
462          ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_SQRT_IEEE,
463                          ME_RECIP_SQRT_DX, ME_RECIP_SQRT_FF, vpi, inst);
464          break;
465       case RC_OPCODE_SEQ:
466          ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst);
467          break;
468       case RC_OPCODE_SGE:
469          ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst);
470          break;
471       case RC_OPCODE_SIN:
472          ei_math1(compiler->code, ME_SIN, vpi, inst);
473          break;
474       case RC_OPCODE_SLT:
475          ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst);
476          break;
477       case RC_OPCODE_SNE:
478          ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst);
479          break;
480       case RC_OPCODE_BGNLOOP: {
481          if ((!compiler->Base.is_r500 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) ||
482              loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
483             rc_error(&compiler->Base, "Loops are nested too deep.");
484             return;
485          }
486          loops[loop_depth++] = ((compiler->code->length) / 4) + 1;
487          break;
488       }
489       case RC_OPCODE_ENDLOOP: {
490          unsigned int act_addr;
491          unsigned int last_addr;
492          unsigned int ret_addr;
493 
494          if (loop_depth == 1 && last_input_read_at_loop_end) {
495             compiler->code->last_input_read = compiler->code->length / 4;
496             last_input_read_at_loop_end = false;
497          }
498          if (loop_depth == 1 && last_pos_write_at_loop_end) {
499             compiler->code->last_pos_write = compiler->code->length / 4;
500             last_pos_write_at_loop_end = false;
501          }
502 
503          ret_addr = loops[--loop_depth];
504          act_addr = ret_addr - 1;
505          last_addr = (compiler->code->length / 4) - 1;
506 
507          if (loop_depth >= R300_VS_MAX_FC_OPS) {
508             rc_error(&compiler->Base, "Too many flow control instructions.");
509             return;
510          }
511          /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
512           * we reduce it to half to avoid occasional hangs on RV516
513           * and downclocked RV530.
514           */
515          if (compiler->Base.is_r500) {
516             compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].lw =
517                R500_PVS_FC_ACT_ADRS(act_addr) | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080);
518             compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].uw =
519                R500_PVS_FC_LAST_INST(last_addr) | R500_PVS_FC_RTN_INST(ret_addr);
520          } else {
521             compiler->code->fc_op_addrs.r300[compiler->code->num_fc_ops] =
522                R300_PVS_FC_ACT_ADRS(act_addr) | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) |
523                R300_PVS_FC_LAST_INST(last_addr) | R300_PVS_FC_RTN_INST(ret_addr);
524          }
525          compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
526             R300_PVS_FC_LOOP_INIT_VAL(0x0) | R300_PVS_FC_LOOP_STEP_VAL(0x1);
527          compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(compiler->code->num_fc_ops);
528          compiler->code->num_fc_ops++;
529 
530          break;
531       }
532 
533       case RC_ME_PRED_SET_CLR:
534          ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
535          break;
536 
537       case RC_ME_PRED_SET_INV:
538          ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
539          break;
540 
541       case RC_ME_PRED_SET_POP:
542          ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
543          break;
544 
545       case RC_ME_PRED_SET_RESTORE:
546          ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
547          break;
548 
549       case RC_ME_PRED_SEQ:
550          ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
551          break;
552 
553       case RC_ME_PRED_SNEQ:
554          ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
555          break;
556 
557       case RC_VE_PRED_SNEQ_PUSH:
558          ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, vpi, inst);
559          break;
560 
561       default:
562          rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
563          return;
564       }
565 
566       if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
567          inst[0] |= (PVS_DST_PRED_ENABLE_MASK << PVS_DST_PRED_ENABLE_SHIFT);
568          if (vpi->DstReg.Pred == RC_PRED_SET) {
569             inst[0] |= (PVS_DST_PRED_SENSE_MASK << PVS_DST_PRED_SENSE_SHIFT);
570          }
571       }
572 
573       /* Update the number of temporaries. */
574       if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
575           vpi->DstReg.Index >= compiler->code->num_temporaries)
576          compiler->code->num_temporaries = vpi->DstReg.Index + 1;
577 
578       /* last instruction that writes position */
579       if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
580           t_dst_index(compiler->code, &vpi->DstReg) == 0) {
581          if (loop_depth == 0)
582             compiler->code->last_pos_write = compiler->code->length / 4;
583          else
584             last_pos_write_at_loop_end = true;
585       }
586 
587       for (unsigned i = 0; i < info->NumSrcRegs; i++) {
588          if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
589              vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
590             compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
591          if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
592             if (loop_depth == 0)
593                compiler->code->last_input_read = compiler->code->length / 4;
594             else
595                last_input_read_at_loop_end = true;
596          }
597       }
598 
599       if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
600          rc_error(&compiler->Base, "Too many temporaries.\n");
601          return;
602       }
603 
604       compiler->code->length += 4;
605 
606       if (compiler->Base.Error)
607          return;
608    }
609 }
610 
/* Per-temporary bookkeeping used when mapping program temporaries to hardware ones. */
struct temporary_allocation {
   /* Non-zero once HwTemp has been assigned. */
   unsigned int Allocated : 1;

   /* Index of the hardware temporary assigned to this program temporary. */
   unsigned int HwTemp : 15;

   /* NOTE(review): presumably the last instruction reading this temporary;
    * not referenced in the visible part of the file - confirm. */
   struct rc_instruction *LastRead;
};
616 
617 static int
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)618 get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
619         unsigned int orig)
620 {
621    if (!ta[orig].Allocated) {
622       int j;
623       for (j = 0; j < c->max_temp_regs; ++j) {
624          if (!hwtemps[j])
625             break;
626       }
627       ta[orig].Allocated = 1;
628       ta[orig].HwTemp = j;
629       hwtemps[ta[orig].HwTemp] = true;
630    }
631 
632    return ta[orig].HwTemp;
633 }
634 
635 static void
allocate_temporary_registers(struct radeon_compiler * c,void * user)636 allocate_temporary_registers(struct radeon_compiler *c, void *user)
637 {
638    unsigned int node_count, node_index;
639    struct ra_class **node_classes;
640    struct rc_list *var_ptr;
641    struct rc_list *variables;
642    struct ra_graph *graph;
643    const struct rc_regalloc_state *ra_state = c->regalloc_state;
644 
645    rc_recompute_ips(c);
646 
647    /* Get list of program variables */
648    variables = rc_get_variables(c);
649    node_count = rc_list_count(variables);
650    node_classes = memory_pool_malloc(&c->Pool, node_count * sizeof(struct ra_class *));
651 
652    for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
653       unsigned int class_index = 0;
654       int index;
655       /* Compute the live intervals */
656       rc_variable_compute_live_intervals(var_ptr->Item);
657       unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
658       index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
659       if (index > -1) {
660          class_index = c->regalloc_state->class_list[index].ID;
661       } else {
662          rc_error(c, "Could not find class for index=%u mask=%u\n",
663                   ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
664       }
665       node_classes[node_index] = ra_state->classes[class_index];
666    }
667 
668    graph = ra_alloc_interference_graph(ra_state->regs, node_count);
669 
670    for (node_index = 0; node_index < node_count; node_index++) {
671       ra_set_node_class(graph, node_index, node_classes[node_index]);
672    }
673 
674    rc_build_interference_graph(graph, variables);
675 
676    if (!ra_allocate(graph)) {
677       rc_error(c, "Ran out of hardware temporaries\n");
678       ralloc_free(graph);
679       return;
680    }
681 
682    /* Rewrite the registers */
683    for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
684       int reg = ra_get_node_reg(graph, node_index);
685       unsigned int writemask = reg_get_writemask(reg);
686       unsigned int index = reg_get_index(reg);
687       struct rc_variable *var = var_ptr->Item;
688 
689       rc_variable_change_dst(var, index, writemask);
690    }
691 
692    ralloc_free(graph);
693 }
694 
695 /**
696  * Vertex engine cannot read two inputs or two constants at the same time.
697  * Introduce intermediate MOVs to temporary registers to account for this.
698  */
699 static int
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)700 transform_source_conflicts(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
701 {
702    const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
703 
704    if (opcode->NumSrcRegs == 3) {
705       if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) ||
706           t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
707          int tmpreg = rc_find_free_temporary(c);
708          struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
709          inst_mov->U.I.Opcode = RC_OPCODE_MOV;
710          inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
711          inst_mov->U.I.DstReg.Index = tmpreg;
712          inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
713          inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
714          inst_mov->U.I.SrcReg[0].Negate = 0;
715          inst_mov->U.I.SrcReg[0].Abs = 0;
716 
717          inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
718          inst->U.I.SrcReg[2].Index = tmpreg;
719          inst->U.I.SrcReg[2].RelAddr = false;
720       }
721    }
722 
723    if (opcode->NumSrcRegs >= 2) {
724       if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
725          int tmpreg = rc_find_free_temporary(c);
726          struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
727          inst_mov->U.I.Opcode = RC_OPCODE_MOV;
728          inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
729          inst_mov->U.I.DstReg.Index = tmpreg;
730          inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
731          inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
732          inst_mov->U.I.SrcReg[0].Negate = 0;
733          inst_mov->U.I.SrcReg[0].Abs = 0;
734 
735          inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
736          inst->U.I.SrcReg[1].Index = tmpreg;
737          inst->U.I.SrcReg[1].RelAddr = false;
738       }
739    }
740 
741    return 1;
742 }
743 
744 static void
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)745 rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
746 {
747    struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
748    int i;
749 
750    for (i = 0; i < 32; ++i) {
751       if ((compiler->RequiredOutputs & (1U << i)) &&
752           !(compiler->Base.Program.OutputsWritten & (1U << i))) {
753          struct rc_instruction *inst =
754             rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
755          inst->U.I.Opcode = RC_OPCODE_MOV;
756 
757          inst->U.I.DstReg.File = RC_FILE_OUTPUT;
758          inst->U.I.DstReg.Index = i;
759          inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
760 
761          inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
762          inst->U.I.SrcReg[0].Index = 0;
763          inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
764 
765          compiler->Base.Program.OutputsWritten |= 1U << i;
766       }
767    }
768 }
769 
770 static int
swizzle_is_native(rc_opcode opcode,struct rc_src_register reg)771 swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
772 {
773    (void)opcode;
774    (void)reg;
775 
776    return 1;
777 }
778 
779 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
780    .IsNative = &swizzle_is_native, .Split = NULL /* should never be called */
781 };
782 
783 void
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)784 r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
785 {
786    int is_r500 = c->Base.is_r500;
787    int opt = !c->Base.disable_optimizations;
788    bool debug = c->Base.Debug & RC_DBG_LOG;
789 
790    /* Lists of instruction transformations. */
791    struct radeon_program_transformation alu_rewrite[] = {{&r300_transform_vertex_alu, NULL},
792                                                          {NULL, NULL}};
793 
794    struct radeon_program_transformation resolve_src_conflicts[] = {
795       {&transform_source_conflicts, NULL},
796       {NULL, NULL}};
797 
798    /* List of compiler passes. */
799    struct radeon_compiler_pass vs_list[] = {
800       /* clang-format off */
801       /* NAME                        DUMP PREDICATE FUNCTION                      PARAM */
802       {"add artificial outputs",     0,   1,        rc_vs_add_artificial_outputs, NULL},
803       {"native rewrite",             1,   1,        rc_local_transform,           alu_rewrite},
804       {"unused channels",            1,   opt,      rc_mark_unused_channels,      NULL},
805       {"dataflow optimize",          1,   opt,      rc_optimize,                  NULL},
806       {"dead constants",             1,   1,        rc_remove_unused_constants,   &c->code->constants_remap_table},
807       /* This pass must be done after optimizations. */
808       {"source conflict resolve",    1,   1,        rc_local_transform,           resolve_src_conflicts},
809       {"register allocation",        1,   opt,      allocate_temporary_registers, NULL},
810       {"lower control flow opcodes", 1,   is_r500,  rc_vert_fc,                   NULL},
811       {"final code validation",      0,   1,        rc_validate_final_shader,     NULL},
812       {"machine code generation",    0,   1,        translate_vertex_program,     NULL},
813       {"dump machine code",          0,   debug,    r300_vertex_program_dump,     NULL},
814       {NULL,                         0,   0,        NULL, NULL}};
815    /* clang-format on */
816 
817    c->Base.type = RC_VERTEX_PROGRAM;
818    c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
819 
820    rc_run_compiler(&c->Base, vs_list);
821 
822    c->code->InputsRead = c->Base.Program.InputsRead;
823    c->code->OutputsWritten = c->Base.Program.OutputsWritten;
824    rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
825 }
826