1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "radeon_compiler.h"
7
8 #include <stdarg.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12
13 #include "pipe/p_state.h"
14 #include "util/u_debug.h"
15 #include "radeon_compiler_util.h"
16 #include "radeon_dataflow.h"
17 #include "radeon_program.h"
18 #include "radeon_program_pair.h"
19 #include "radeon_regalloc.h"
20
21 void
rc_init(struct radeon_compiler * c,const struct rc_regalloc_state * rs)22 rc_init(struct radeon_compiler *c, const struct rc_regalloc_state *rs)
23 {
24 memset(c, 0, sizeof(*c));
25
26 memory_pool_init(&c->Pool);
27 c->Program.Instructions.Prev = &c->Program.Instructions;
28 c->Program.Instructions.Next = &c->Program.Instructions;
29 c->Program.Instructions.U.I.Opcode = RC_OPCODE_ILLEGAL_OPCODE;
30 c->regalloc_state = rs;
31 c->max_temp_index = -1;
32 }
33
34 void
rc_destroy(struct radeon_compiler * c)35 rc_destroy(struct radeon_compiler *c)
36 {
37 rc_constants_destroy(&c->Program.Constants);
38 memory_pool_destroy(&c->Pool);
39 free(c->ErrorMsg);
40 }
41
42 void
rc_debug(struct radeon_compiler * c,const char * fmt,...)43 rc_debug(struct radeon_compiler *c, const char *fmt, ...)
44 {
45 va_list ap;
46
47 if (!(c->Debug & RC_DBG_LOG))
48 return;
49
50 va_start(ap, fmt);
51 vfprintf(stderr, fmt, ap);
52 va_end(ap);
53 }
54
55 void
rc_error(struct radeon_compiler * c,const char * fmt,...)56 rc_error(struct radeon_compiler *c, const char *fmt, ...)
57 {
58 va_list ap;
59
60 c->Error = 1;
61
62 if (!c->ErrorMsg) {
63 /* Only remember the first error */
64 char buf[1024];
65 int written;
66
67 va_start(ap, fmt);
68 written = vsnprintf(buf, sizeof(buf), fmt, ap);
69 va_end(ap);
70
71 if (written < sizeof(buf)) {
72 c->ErrorMsg = strdup(buf);
73 } else {
74 c->ErrorMsg = malloc(written + 1);
75
76 va_start(ap, fmt);
77 vsnprintf(c->ErrorMsg, written + 1, fmt, ap);
78 va_end(ap);
79 }
80 }
81
82 if (c->Debug & RC_DBG_LOG) {
83 fprintf(stderr, "r300compiler error: ");
84
85 va_start(ap, fmt);
86 vfprintf(stderr, fmt, ap);
87 va_end(ap);
88 }
89 }
90
/**
 * Report a failed internal assertion through rc_error().
 *
 * The recorded message names the source location (\p file, \p line) and the
 * text of the failed \p assertion.
 *
 * Always returns 1.
 */
int
rc_if_fail_helper(struct radeon_compiler *c, const char *file, int line, const char *assertion)
{
   rc_error(c,
            "ICE at %s:%i: assertion failed: %s\n",
            file,
            line,
            assertion);

   return 1;
}
97
98 void
rc_mark_unused_channels(struct radeon_compiler * c,void * user)99 rc_mark_unused_channels(struct radeon_compiler *c, void *user)
100 {
101 unsigned int srcmasks[3];
102
103 for (struct rc_instruction *inst = c->Program.Instructions.Next;
104 inst != &c->Program.Instructions; inst = inst->Next) {
105
106 rc_compute_sources_for_writemask(inst, inst->U.I.DstReg.WriteMask, srcmasks);
107
108 for (unsigned int src = 0; src < 3; ++src) {
109 for (unsigned int chan = 0; chan < 4; ++chan) {
110 if (!GET_BIT(srcmasks[src], chan))
111 SET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan, RC_SWIZZLE_UNUSED);
112 }
113 }
114 }
115 }
116
117 /**
118 * Recompute c->Program.InputsRead and c->Program.OutputsWritten
119 * based on which inputs and outputs are actually referenced
120 * in program instructions.
121 */
122 void
rc_calculate_inputs_outputs(struct radeon_compiler * c)123 rc_calculate_inputs_outputs(struct radeon_compiler *c)
124 {
125 struct rc_instruction *inst;
126
127 c->Program.InputsRead = 0;
128 c->Program.OutputsWritten = 0;
129
130 for (inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
131 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
132 int i;
133
134 for (i = 0; i < opcode->NumSrcRegs; ++i) {
135 if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT)
136 c->Program.InputsRead |= 1U << inst->U.I.SrcReg[i].Index;
137 }
138
139 if (opcode->HasDstReg) {
140 if (inst->U.I.DstReg.File == RC_FILE_OUTPUT)
141 c->Program.OutputsWritten |= 1U << inst->U.I.DstReg.Index;
142 }
143 }
144 }
145
146 /**
147 * Rewrite the program such that a given output is duplicated.
148 */
149 void
rc_copy_output(struct radeon_compiler * c,unsigned output,unsigned dup_output)150 rc_copy_output(struct radeon_compiler *c, unsigned output, unsigned dup_output)
151 {
152 unsigned tempreg = rc_find_free_temporary(c);
153 struct rc_instruction *inst;
154 struct rc_instruction *insert_pos = c->Program.Instructions.Prev;
155 struct rc_instruction *last_write_inst = NULL;
156 unsigned branch_depth = 0;
157 unsigned loop_depth = 0;
158 bool emit_after_control_flow = false;
159 unsigned num_writes = 0;
160
161 for (inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
162 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
163
164 if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP)
165 loop_depth++;
166 if (inst->U.I.Opcode == RC_OPCODE_IF)
167 branch_depth++;
168 if ((inst->U.I.Opcode == RC_OPCODE_ENDLOOP && loop_depth--) ||
169 (inst->U.I.Opcode == RC_OPCODE_ENDIF && branch_depth--))
170 if (emit_after_control_flow && loop_depth == 0 && branch_depth == 0) {
171 insert_pos = inst;
172 emit_after_control_flow = false;
173 }
174
175 if (opcode->HasDstReg) {
176 if (inst->U.I.DstReg.File == RC_FILE_OUTPUT && inst->U.I.DstReg.Index == output) {
177 num_writes++;
178 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
179 inst->U.I.DstReg.Index = tempreg;
180 insert_pos = inst;
181 last_write_inst = inst;
182 if (loop_depth != 0 && branch_depth != 0)
183 emit_after_control_flow = true;
184 }
185 }
186 }
187
188 /* If there is only a single write, just duplicate the whole instruction instead.
189 * We can do this even when the single write was is a control flow.
190 */
191 if (num_writes == 1) {
192 last_write_inst->U.I.DstReg.File = RC_FILE_OUTPUT;
193 last_write_inst->U.I.DstReg.Index = output;
194
195 inst = rc_insert_new_instruction(c, last_write_inst);
196 struct rc_instruction *prev = inst->Prev;
197 struct rc_instruction *next = inst->Next;
198 memcpy(inst, last_write_inst, sizeof(struct rc_instruction));
199 inst->Prev = prev;
200 inst->Next = next;
201 inst->U.I.DstReg.Index = dup_output;
202 } else {
203 inst = rc_insert_new_instruction(c, insert_pos);
204 inst->U.I.Opcode = RC_OPCODE_MOV;
205 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
206 inst->U.I.DstReg.Index = output;
207
208 inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
209 inst->U.I.SrcReg[0].Index = tempreg;
210 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
211
212 inst = rc_insert_new_instruction(c, inst);
213 inst->U.I.Opcode = RC_OPCODE_MOV;
214 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
215 inst->U.I.DstReg.Index = dup_output;
216
217 inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
218 inst->U.I.SrcReg[0].Index = tempreg;
219 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
220 }
221
222 c->Program.OutputsWritten |= 1U << dup_output;
223 }
224
225 /**
226 * Introduce standard code fragment to deal with fragment.position.
227 */
228 void
rc_transform_fragment_wpos(struct radeon_compiler * c,unsigned wpos,unsigned new_input,int full_vtransform)229 rc_transform_fragment_wpos(struct radeon_compiler *c, unsigned wpos, unsigned new_input,
230 int full_vtransform)
231 {
232 unsigned tempregi = rc_find_free_temporary(c);
233 struct rc_instruction *inst_rcp;
234 struct rc_instruction *inst_mul;
235 struct rc_instruction *inst_mad;
236 struct rc_instruction *inst;
237
238 c->Program.InputsRead &= ~(1U << wpos);
239 c->Program.InputsRead |= 1U << new_input;
240
241 /* perspective divide */
242 inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
243 inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
244
245 inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
246 inst_rcp->U.I.DstReg.Index = tempregi;
247 inst_rcp->U.I.DstReg.WriteMask = RC_MASK_W;
248
249 inst_rcp->U.I.SrcReg[0].File = RC_FILE_INPUT;
250 inst_rcp->U.I.SrcReg[0].Index = new_input;
251 inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
252
253 inst_mul = rc_insert_new_instruction(c, inst_rcp);
254 inst_mul->U.I.Opcode = RC_OPCODE_MUL;
255
256 inst_mul->U.I.DstReg.File = RC_FILE_TEMPORARY;
257 inst_mul->U.I.DstReg.Index = tempregi;
258 inst_mul->U.I.DstReg.WriteMask = RC_MASK_XYZ;
259
260 inst_mul->U.I.SrcReg[0].File = RC_FILE_INPUT;
261 inst_mul->U.I.SrcReg[0].Index = new_input;
262
263 inst_mul->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
264 inst_mul->U.I.SrcReg[1].Index = tempregi;
265 inst_mul->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;
266
267 /* viewport transformation */
268 inst_mad = rc_insert_new_instruction(c, inst_mul);
269 inst_mad->U.I.Opcode = RC_OPCODE_MAD;
270
271 inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
272 inst_mad->U.I.DstReg.Index = tempregi;
273 inst_mad->U.I.DstReg.WriteMask = RC_MASK_XYZ;
274
275 inst_mad->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
276 inst_mad->U.I.SrcReg[0].Index = tempregi;
277 inst_mad->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZ0;
278
279 inst_mad->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
280 inst_mad->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZ0;
281
282 inst_mad->U.I.SrcReg[2].File = RC_FILE_CONSTANT;
283 inst_mad->U.I.SrcReg[2].Swizzle = RC_SWIZZLE_XYZ0;
284
285 if (full_vtransform) {
286 inst_mad->U.I.SrcReg[1].Index =
287 rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_SCALE, 0);
288 inst_mad->U.I.SrcReg[2].Index =
289 rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_OFFSET, 0);
290 } else {
291 inst_mad->U.I.SrcReg[1].Index = inst_mad->U.I.SrcReg[2].Index =
292 rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
293 }
294
295 for (inst = inst_mad->Next; inst != &c->Program.Instructions; inst = inst->Next) {
296 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
297 unsigned i;
298
299 for (i = 0; i < opcode->NumSrcRegs; i++) {
300 if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && inst->U.I.SrcReg[i].Index == wpos) {
301 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
302 inst->U.I.SrcReg[i].Index = tempregi;
303 }
304 }
305 }
306 }
307
308 /**
309 * The FACE input in hardware contains 1 if it's a back face, 0 otherwise.
310 * Gallium and OpenGL define it the other way around.
311 *
312 * So let's just negate FACE at the beginning of the shader and rewrite the rest
313 * of the shader to read from the newly allocated temporary.
314 */
315 void
rc_transform_fragment_face(struct radeon_compiler * c,unsigned face)316 rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
317 {
318 unsigned tempregi = rc_find_free_temporary(c);
319 struct rc_instruction *inst_add;
320 struct rc_instruction *inst;
321
322 /* perspective divide */
323 inst_add = rc_insert_new_instruction(c, &c->Program.Instructions);
324 inst_add->U.I.Opcode = RC_OPCODE_ADD;
325
326 inst_add->U.I.DstReg.File = RC_FILE_TEMPORARY;
327 inst_add->U.I.DstReg.Index = tempregi;
328 inst_add->U.I.DstReg.WriteMask = RC_MASK_X;
329
330 inst_add->U.I.SrcReg[0].File = RC_FILE_NONE;
331 inst_add->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
332
333 inst_add->U.I.SrcReg[1].File = RC_FILE_INPUT;
334 inst_add->U.I.SrcReg[1].Index = face;
335 inst_add->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XXXX;
336 inst_add->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
337
338 for (inst = inst_add->Next; inst != &c->Program.Instructions; inst = inst->Next) {
339 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
340 unsigned i;
341
342 for (i = 0; i < opcode->NumSrcRegs; i++) {
343 if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && inst->U.I.SrcReg[i].Index == face) {
344 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
345 inst->U.I.SrcReg[i].Index = tempregi;
346 }
347 }
348 }
349 }
350
351 static void
reg_count_callback(void * userdata,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)352 reg_count_callback(void *userdata, struct rc_instruction *inst, rc_register_file file,
353 unsigned int index, unsigned int mask)
354 {
355 struct rc_program_stats *s = userdata;
356 if (file == RC_FILE_TEMPORARY)
357 (int)index > s->num_temp_regs ? s->num_temp_regs = index : 0;
358 if (file == RC_FILE_INLINE)
359 s->num_inline_literals++;
360 if (file == RC_FILE_CONSTANT)
361 s->num_consts = MAX2(s->num_consts, index + 1);
362 }
363
364 void
rc_get_stats(struct radeon_compiler * c,struct rc_program_stats * s)365 rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
366 {
367 struct rc_instruction *tmp;
368 memset(s, 0, sizeof(*s));
369 unsigned ip = 0;
370 int last_begintex = -1;
371
372 for (tmp = c->Program.Instructions.Next; tmp != &c->Program.Instructions;
373 tmp = tmp->Next, ip++) {
374 const struct rc_opcode_info *info;
375 rc_for_all_reads_mask(tmp, reg_count_callback, s);
376 if (tmp->Type == RC_INSTRUCTION_NORMAL) {
377 info = rc_get_opcode_info(tmp->U.I.Opcode);
378 if (info->Opcode == RC_OPCODE_BEGIN_TEX) {
379 /* The R5xx docs mention ~30 cycles in section 8.3.1
380 * The only case when we don't want to add the cycles
381 * penalty is when the texblock contains only kil.
382 */
383 const struct rc_opcode_info *next_op = rc_get_opcode_info(tmp->Next->U.I.Opcode);
384 struct rc_instruction *second_next_instr = tmp->Next->Next;
385 const struct rc_opcode_info *second_next_op;
386 if (second_next_instr->Type == RC_INSTRUCTION_NORMAL) {
387 second_next_op = rc_get_opcode_info(second_next_instr->U.I.Opcode);
388 } else {
389 second_next_op = rc_get_opcode_info(second_next_instr->U.P.RGB.Opcode);
390 }
391 if (next_op->Opcode != RC_OPCODE_KIL ||
392 (second_next_instr->Type == RC_INSTRUCTION_NORMAL && second_next_op->HasTexture)) {
393 s->num_cycles += 30;
394 last_begintex = ip;
395 }
396 continue;
397 }
398 if (info->Opcode == RC_OPCODE_MAD && rc_inst_has_three_diff_temp_srcs(tmp))
399 s->num_cycles++;
400 } else {
401 if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used)
402 s->num_presub_ops++;
403 if (tmp->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)
404 s->num_presub_ops++;
405 /* Assuming alpha will never be a flow control or
406 * a tex instruction. */
407 if (tmp->U.P.Alpha.Opcode != RC_OPCODE_NOP)
408 s->num_alpha_insts++;
409 if (tmp->U.P.RGB.Opcode != RC_OPCODE_NOP)
410 s->num_rgb_insts++;
411 if (tmp->U.P.RGB.Omod != RC_OMOD_MUL_1 && tmp->U.P.RGB.Omod != RC_OMOD_DISABLE) {
412 s->num_omod_ops++;
413 }
414 if (tmp->U.P.Alpha.Omod != RC_OMOD_MUL_1 && tmp->U.P.Alpha.Omod != RC_OMOD_DISABLE) {
415 s->num_omod_ops++;
416 }
417 if (tmp->U.P.Nop)
418 s->num_cycles++;
419 /* SemWait has effect only on R500, the more instructions we can put
420 * between the tex block and the first texture semaphore, the better.
421 */
422 if (tmp->U.P.SemWait && c->is_r500 && last_begintex != -1) {
423 s->num_cycles -= MIN2(30, ip - last_begintex);
424 last_begintex = -1;
425 }
426 info = rc_get_opcode_info(tmp->U.P.RGB.Opcode);
427 }
428 if (info->IsFlowControl) {
429 s->num_fc_insts++;
430 if (info->Opcode == RC_OPCODE_BGNLOOP)
431 s->num_loops++;
432 }
433 /* VS flow control was already translated to the predicate instructions */
434 if (c->type == RC_VERTEX_PROGRAM)
435 if (strstr(info->Name, "PRED") != NULL)
436 s->num_pred_insts++;
437
438 if (info->HasTexture)
439 s->num_tex_insts++;
440 s->num_insts++;
441 s->num_cycles++;
442 }
443 /* Increment here because the reg_count_callback store the max
444 * temporary reg index in s->nun_temp_regs. */
445 s->num_temp_regs++;
446 }
447
448 static void
print_stats(struct radeon_compiler * c)449 print_stats(struct radeon_compiler *c)
450 {
451 struct rc_program_stats s;
452
453 rc_get_stats(c, &s);
454
455 /* Note that we print some dummy values for instruction categories that
456 * only the FS has, because shader-db's report.py wants all shaders to
457 * have the same set.
458 */
459 util_debug_message(
460 c->debug, SHADER_INFO,
461 "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol, "
462 "%u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits, %u cycles",
463 c->type == RC_VERTEX_PROGRAM ? "VS" : "FS", s.num_insts, s.num_rgb_insts, s.num_alpha_insts,
464 s.num_pred_insts, s.num_fc_insts, s.num_loops, s.num_tex_insts, s.num_presub_ops,
465 s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals, s.num_cycles);
466 }
467
468 static const char *shader_name[RC_NUM_PROGRAM_TYPES] = {"Vertex Program", "Fragment Program"};
469
470 bool
rc_run_compiler_passes(struct radeon_compiler * c,struct radeon_compiler_pass * list)471 rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list)
472 {
473 for (unsigned i = 0; list[i].name; i++) {
474 if (list[i].predicate) {
475 list[i].run(c, list[i].user);
476
477 if (c->Error)
478 return false;
479
480 if ((c->Debug & RC_DBG_LOG) && list[i].dump) {
481 fprintf(stderr, "%s: after '%s'\n", shader_name[c->type], list[i].name);
482 rc_print_program(&c->Program);
483 }
484 }
485 }
486 return true;
487 }
488
489 /* Executes a list of compiler passes given in the parameter 'list'. */
490 void
rc_run_compiler(struct radeon_compiler * c,struct radeon_compiler_pass * list)491 rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list)
492 {
493 if (c->Debug & RC_DBG_LOG) {
494 fprintf(stderr, "%s: before compilation\n", shader_name[c->type]);
495 rc_print_program(&c->Program);
496 }
497
498 if (rc_run_compiler_passes(c, list)) {
499 print_stats(c);
500 }
501 }
502
503 void
rc_validate_final_shader(struct radeon_compiler * c,void * user)504 rc_validate_final_shader(struct radeon_compiler *c, void *user)
505 {
506 /* Check the number of constants. */
507 if (c->Program.Constants.Count > c->max_constants) {
508 rc_error(c, "Too many constants. Max: %i, Got: %i\n", c->max_constants,
509 c->Program.Constants.Count);
510 }
511 }
512