1 /*
2 * Copyright 2009 Nicolai Haehnle.
3 * Copyright 2011 Tom Stellard <tstellar@gmail.com>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "radeon_program_pair.h"
8
9 #include <stdio.h>
10
11 #include "util/glheader.h"
12 #include "util/ralloc.h"
13 #include "util/register_allocate.h"
14 #include "util/u_memory.h"
15
16 #include "r300_fragprog_swizzle.h"
17 #include "radeon_compiler.h"
18 #include "radeon_compiler_util.h"
19 #include "radeon_dataflow.h"
20 #include "radeon_list.h"
21 #include "radeon_regalloc.h"
22 #include "radeon_variable.h"
23
24 static void
scan_read_callback(void * data,struct rc_instruction * inst,rc_register_file file,unsigned int index,unsigned int mask)25 scan_read_callback(void *data, struct rc_instruction *inst, rc_register_file file,
26 unsigned int index, unsigned int mask)
27 {
28 struct regalloc_state *s = data;
29 struct register_info *reg;
30 unsigned int i;
31
32 if (file != RC_FILE_INPUT)
33 return;
34
35 s->Input[index].Used = 1;
36 reg = &s->Input[index];
37
38 for (i = 0; i < 4; i++) {
39 if (!((mask >> i) & 0x1)) {
40 continue;
41 }
42 reg->Live[i].Used = 1;
43 reg->Live[i].Start = 0;
44 reg->Live[i].End = s->LoopEnd > inst->IP ? s->LoopEnd : inst->IP;
45 }
46 }
47
48 static void
remap_register(void * data,struct rc_instruction * inst,rc_register_file * file,unsigned int * index)49 remap_register(void *data, struct rc_instruction *inst, rc_register_file *file, unsigned int *index)
50 {
51 struct regalloc_state *s = data;
52 const struct register_info *reg;
53
54 if (*file == RC_FILE_TEMPORARY && s->Simple)
55 reg = &s->Temporary[*index];
56 else if (*file == RC_FILE_INPUT)
57 reg = &s->Input[*index];
58 else
59 return;
60
61 if (reg->Allocated) {
62 *index = reg->Index;
63 }
64 }
65
66 static void
alloc_input_simple(void * data,unsigned int input,unsigned int hwreg)67 alloc_input_simple(void *data, unsigned int input, unsigned int hwreg)
68 {
69 struct regalloc_state *s = data;
70
71 if (input >= s->NumInputs)
72 return;
73
74 s->Input[input].Allocated = 1;
75 s->Input[input].File = RC_FILE_TEMPORARY;
76 s->Input[input].Index = hwreg;
77 }
78
79 /* This functions offsets the temporary register indices by the number
80 * of input registers, because input registers are actually temporaries and
81 * should not occupy the same space.
82 *
83 * This pass is supposed to be used to maintain correct allocation of inputs
84 * if the standard register allocation is disabled. */
85 static void
do_regalloc_inputs_only(struct regalloc_state * s)86 do_regalloc_inputs_only(struct regalloc_state *s)
87 {
88 for (unsigned i = 0; i < s->NumTemporaries; i++) {
89 s->Temporary[i].Allocated = 1;
90 s->Temporary[i].File = RC_FILE_TEMPORARY;
91 s->Temporary[i].Index = i + s->NumInputs;
92 }
93 }
94
95 static unsigned int
is_derivative(rc_opcode op)96 is_derivative(rc_opcode op)
97 {
98 return (op == RC_OPCODE_DDX || op == RC_OPCODE_DDY);
99 }
100
/** User data passed to variable_get_class_read_cb(). */
struct variable_get_class_cb_data {
   /* Out-flag owned by the caller; the callback stores 0 through it when a
    * converted swizzle is reported as non-native. */
   unsigned int *can_change_writemask;
   /* Conversion swizzle applied to each argument's swizzle with
    * rc_adjust_channels(). */
   unsigned int conversion_swizzle;
   /* Compiler whose SwizzleCaps->IsNative() hook is consulted. */
   struct radeon_compiler *c;
};
106
107 static void
variable_get_class_read_cb(void * userdata,struct rc_instruction * inst,struct rc_pair_instruction_arg * arg,struct rc_pair_instruction_source * src)108 variable_get_class_read_cb(void *userdata, struct rc_instruction *inst,
109 struct rc_pair_instruction_arg *arg,
110 struct rc_pair_instruction_source *src)
111 {
112 struct variable_get_class_cb_data *d = userdata;
113 unsigned int new_swizzle = rc_adjust_channels(arg->Swizzle, d->conversion_swizzle);
114 /* We can't just call r300_swizzle_is_native basic here, because it ignores the
115 * extra requirements for presubtract. However, after pair translation we no longer
116 * have the rc_src_register required for the native swizzle, so we have to
117 * reconstruct it. */
118 struct rc_src_register reg = {};
119 reg.Swizzle = new_swizzle;
120 reg.File = src->File;
121
122 assert(inst->Type == RC_INSTRUCTION_PAIR);
123 /* The opcode is unimportant, we can't have TEX here. */
124 if (!d->c->SwizzleCaps->IsNative(RC_OPCODE_MAD, reg)) {
125 *d->can_change_writemask = 0;
126 }
127 }
128
129 static unsigned
variable_get_class(struct rc_variable * variable,const struct rc_class * classes)130 variable_get_class(struct rc_variable *variable, const struct rc_class *classes)
131 {
132 unsigned int i;
133 unsigned int can_change_writemask = 1;
134 unsigned int writemask = rc_variable_writemask_sum(variable);
135 struct rc_list *readers = rc_variable_readers_union(variable);
136 int class_index;
137
138 if (!variable->C->is_r500) {
139 struct rc_class c;
140 struct rc_variable *var_ptr;
141 /* The assumption here is that if an instruction has type
142 * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
143 * r300 and r400 can't swizzle the result of a TEX lookup. */
144 for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
145 if (var_ptr->Inst->Type == RC_INSTRUCTION_NORMAL) {
146 writemask = RC_MASK_XYZW;
147 }
148 }
149
150 /* Check if it is possible to do swizzle packing for r300/r400
151 * without creating non-native swizzles. */
152 class_index = rc_find_class(classes, writemask, 3);
153 if (class_index < 0) {
154 goto error;
155 }
156 c = classes[class_index];
157 if (c.WritemaskCount == 1) {
158 goto done;
159 }
160 for (i = 0; i < c.WritemaskCount; i++) {
161 struct rc_variable *var_ptr;
162 for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
163 int j;
164 unsigned int conversion_swizzle =
165 rc_make_conversion_swizzle(writemask, c.Writemasks[i]);
166 struct variable_get_class_cb_data d;
167 d.can_change_writemask = &can_change_writemask;
168 d.conversion_swizzle = conversion_swizzle;
169 d.c = variable->C;
170 /* If we get this far var_ptr->Inst has to
171 * be a pair instruction. If variable or any
172 * of its friends are normal instructions,
173 * then the writemask will be set to RC_MASK_XYZW
174 * and the function will return before it gets
175 * here. */
176 rc_pair_for_all_reads_arg(var_ptr->Inst, variable_get_class_read_cb, &d);
177
178 for (j = 0; j < var_ptr->ReaderCount; j++) {
179 unsigned int old_swizzle;
180 unsigned int new_swizzle;
181 struct rc_reader r = var_ptr->Readers[j];
182 if (r.Inst->Type == RC_INSTRUCTION_PAIR) {
183 old_swizzle = r.U.P.Arg->Swizzle;
184 } else {
185 /* Source operands of TEX
186 * instructions can't be
187 * swizzle on r300/r400 GPUs.
188 */
189 can_change_writemask = 0;
190 break;
191 }
192 new_swizzle = rc_rewrite_swizzle(old_swizzle, conversion_swizzle);
193 if (!r300_swizzle_is_native_basic(new_swizzle)) {
194 can_change_writemask = 0;
195 break;
196 }
197 }
198 if (!can_change_writemask) {
199 break;
200 }
201 }
202 if (!can_change_writemask) {
203 break;
204 }
205 }
206 }
207
208 if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
209 /* DDX/DDY seem to always fail when their writemasks are
210 * changed.*/
211 if (is_derivative(variable->Inst->U.P.RGB.Opcode) ||
212 is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
213 can_change_writemask = 0;
214 }
215 }
216 for (; readers; readers = readers->Next) {
217 struct rc_reader *r = readers->Item;
218 if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
219 if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
220 can_change_writemask = 0;
221 break;
222 }
223 /* DDX/DDY also fail when their swizzles are changed. */
224 if (is_derivative(r->Inst->U.P.RGB.Opcode) || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
225 can_change_writemask = 0;
226 break;
227 }
228 }
229 }
230
231 class_index = rc_find_class(classes, writemask, can_change_writemask ? 3 : 1);
232 done:
233 if (class_index > -1) {
234 return classes[class_index].ID;
235 } else {
236 error:
237 rc_error(variable->C, "Could not find class for index=%u mask=%u\n", variable->Dst.Index,
238 writemask);
239 return 0;
240 }
241 }
242
243 static void
do_advanced_regalloc(struct regalloc_state * s)244 do_advanced_regalloc(struct regalloc_state *s)
245 {
246
247 unsigned int i, input_node, node_count, node_index;
248 struct ra_class **node_classes;
249 struct rc_instruction *inst;
250 struct rc_list *var_ptr;
251 struct rc_list *variables;
252 struct ra_graph *graph;
253 const struct rc_regalloc_state *ra_state = s->C->regalloc_state;
254
255 /* Get list of program variables */
256 variables = rc_get_variables(s->C);
257 node_count = rc_list_count(variables);
258 node_classes = memory_pool_malloc(&s->C->Pool, node_count * sizeof(struct ra_class *));
259
260 for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
261 unsigned int class_index;
262 /* Compute the live intervals */
263 rc_variable_compute_live_intervals(var_ptr->Item);
264
265 class_index = variable_get_class(var_ptr->Item, ra_state->class_list);
266 node_classes[node_index] = ra_state->classes[class_index];
267 }
268
269 /* Calculate live intervals for input registers */
270 for (inst = s->C->Program.Instructions.Next; inst != &s->C->Program.Instructions;
271 inst = inst->Next) {
272 rc_opcode op = rc_get_flow_control_inst(inst);
273 if (op == RC_OPCODE_BGNLOOP) {
274 struct rc_instruction *endloop = rc_match_bgnloop(inst);
275 if (endloop->IP > s->LoopEnd) {
276 s->LoopEnd = endloop->IP;
277 }
278 }
279 rc_for_all_reads_mask(inst, scan_read_callback, s);
280 }
281
282 /* Compute the writemask for inputs. */
283 for (i = 0; i < s->NumInputs; i++) {
284 unsigned int chan, writemask = 0;
285 for (chan = 0; chan < 4; chan++) {
286 if (s->Input[i].Live[chan].Used) {
287 writemask |= (1 << chan);
288 }
289 }
290 s->Input[i].Writemask = writemask;
291 }
292
293 graph = ra_alloc_interference_graph(ra_state->regs, node_count + s->NumInputs);
294
295 for (node_index = 0; node_index < node_count; node_index++) {
296 ra_set_node_class(graph, node_index, node_classes[node_index]);
297 }
298
299 rc_build_interference_graph(graph, variables);
300
301 /* Add input registers to the interference graph */
302 for (i = 0, input_node = 0; i < s->NumInputs; i++) {
303 if (!s->Input[i].Writemask) {
304 continue;
305 }
306 for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
307 struct rc_variable *var = var_ptr->Item;
308 if (rc_overlap_live_intervals_array(s->Input[i].Live, var->Live)) {
309 ra_add_node_interference(graph, node_index, node_count + input_node);
310 }
311 }
312 /* Manually allocate a register for this input */
313 ra_set_node_reg(graph, node_count + input_node,
314 get_reg_id(s->Input[i].Index, s->Input[i].Writemask));
315 input_node++;
316 }
317
318 if (!ra_allocate(graph)) {
319 rc_error(s->C, "Ran out of hardware temporaries\n");
320 ralloc_free(graph);
321 return;
322 }
323
324 /* Rewrite the registers */
325 for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
326 int reg = ra_get_node_reg(graph, node_index);
327 unsigned int writemask = reg_get_writemask(reg);
328 unsigned int index = reg_get_index(reg);
329 struct rc_variable *var = var_ptr->Item;
330
331 if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
332 writemask = rc_variable_writemask_sum(var);
333 }
334
335 if (var->Dst.File == RC_FILE_INPUT) {
336 continue;
337 }
338 rc_variable_change_dst(var, index, writemask);
339 }
340
341 ralloc_free(graph);
342 }
343
344 /**
345 * @param user This parameter should be a pointer to an integer value. If this
346 * integer value is zero, then a simple register allocator will be used that
347 * only allocates space for input registers (\sa do_regalloc_inputs_only). If
348 * user is non-zero, then the regular register allocator will be used
349 * (\sa do_regalloc).
350 */
351 void
rc_pair_regalloc(struct radeon_compiler * cc,void * user)352 rc_pair_regalloc(struct radeon_compiler *cc, void *user)
353 {
354 struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler *)cc;
355 struct regalloc_state s;
356 int *do_full_regalloc = (int *)user;
357
358 memset(&s, 0, sizeof(s));
359 s.C = cc;
360 s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
361 s.Input = memory_pool_malloc(&cc->Pool, s.NumInputs * sizeof(struct register_info));
362 memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));
363
364 s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
365 s.Temporary = memory_pool_malloc(&cc->Pool, s.NumTemporaries * sizeof(struct register_info));
366 memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));
367
368 rc_recompute_ips(s.C);
369
370 c->AllocateHwInputs(c, &alloc_input_simple, &s);
371 if (*do_full_regalloc) {
372 do_advanced_regalloc(&s);
373 } else {
374 s.Simple = 1;
375 do_regalloc_inputs_only(&s);
376 }
377
378 /* Rewrite inputs and if we are doing the simple allocation, rewrite
379 * temporaries too. */
380 for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
381 inst != &s.C->Program.Instructions; inst = inst->Next) {
382 rc_remap_registers(inst, &remap_register, &s);
383 }
384 }
385