1 /*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 * Copyright 2011 Tom Stellard <tstellar@gmail.com>
4 *
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial
17 * portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 */
28
29 #include "radeon_program_pair.h"
30
31 #include <stdio.h>
32
33 #include "util/glheader.h"
34 #include "util/register_allocate.h"
35 #include "util/u_memory.h"
36 #include "util/ralloc.h"
37
38 #include "r300_fragprog_swizzle.h"
39 #include "radeon_compiler.h"
40 #include "radeon_compiler_util.h"
41 #include "radeon_dataflow.h"
42 #include "radeon_list.h"
43 #include "radeon_regalloc.h"
44 #include "radeon_variable.h"
45
/**
 * Read-scan callback that records how long each input register channel
 * stays live.
 *
 * Reads from any file other than RC_FILE_INPUT are ignored. For an input
 * register, the register is flagged as used and every channel selected by
 * \p mask gets a live interval that starts at 0 and ends at the later of
 * the reading instruction's IP and the loop end stored in the state.
 *
 * \param data  the struct regalloc_state being filled in
 * \param inst  instruction performing the read
 * \param file  register file of the operand being read
 * \param index register index within \p file
 * \param mask  channels that are read; bit n selects channel n
 */
static void scan_read_callback(void *data, struct rc_instruction *inst,
		rc_register_file file, unsigned int index, unsigned int mask)
{
	struct regalloc_state *state = data;
	struct register_info *input;
	unsigned int chan;

	if (file != RC_FILE_INPUT)
		return;

	input = &state->Input[index];
	input->Used = 1;

	for (chan = 0; chan < 4; chan++) {
		if ((mask & (1u << chan)) == 0)
			continue;

		input->Live[chan].Used = 1;
		input->Live[chan].Start = 0;
		/* Never end the interval before the recorded loop end. */
		input->Live[chan].End =
			inst->IP < state->LoopEnd ? state->LoopEnd : inst->IP;
	}
}
69
/**
 * Register-remap callback that swaps a register index for its allocated
 * index.
 *
 * Input registers are always looked up. Temporaries are looked up only
 * when the state is in simple mode. Registers of any other file are left
 * untouched. Only \p *index is ever rewritten; \p *file is never modified.
 *
 * \param data  the struct regalloc_state holding the allocation tables
 * \param inst  instruction that owns the register (not used here)
 * \param file  register file of the operand
 * \param index register index, replaced in place when an allocation exists
 */
static void remap_register(void *data, struct rc_instruction *inst,
		rc_register_file *file, unsigned int *index)
{
	struct regalloc_state *state = data;
	const struct register_info *info;

	switch (*file) {
	case RC_FILE_INPUT:
		info = &state->Input[*index];
		break;
	case RC_FILE_TEMPORARY:
		/* Temporaries are only remapped here in simple mode. */
		if (!state->Simple)
			return;
		info = &state->Temporary[*index];
		break;
	default:
		return;
	}

	if (info->Allocated)
		*index = info->Index;
}
87
alloc_input_simple(void * data,unsigned int input,unsigned int hwreg)88 static void alloc_input_simple(void * data, unsigned int input,
89 unsigned int hwreg)
90 {
91 struct regalloc_state * s = data;
92
93 if (input >= s->NumInputs)
94 return;
95
96 s->Input[input].Allocated = 1;
97 s->Input[input].File = RC_FILE_TEMPORARY;
98 s->Input[input].Index = hwreg;
99 }
100
101 /* This functions offsets the temporary register indices by the number
102 * of input registers, because input registers are actually temporaries and
103 * should not occupy the same space.
104 *
105 * This pass is supposed to be used to maintain correct allocation of inputs
106 * if the standard register allocation is disabled. */
do_regalloc_inputs_only(struct regalloc_state * s)107 static void do_regalloc_inputs_only(struct regalloc_state * s)
108 {
109 for (unsigned i = 0; i < s->NumTemporaries; i++) {
110 s->Temporary[i].Allocated = 1;
111 s->Temporary[i].File = RC_FILE_TEMPORARY;
112 s->Temporary[i].Index = i + s->NumInputs;
113 }
114 }
115
/**
 * Tell whether an opcode is one of the derivative instructions.
 *
 * \return 1 for RC_OPCODE_DDX and RC_OPCODE_DDY, 0 for everything else
 */
static unsigned int is_derivative(rc_opcode op)
{
	switch (op) {
	case RC_OPCODE_DDX:
	case RC_OPCODE_DDY:
		return 1;
	default:
		return 0;
	}
}
120
/** State passed to variable_get_class_read_cb() through its userdata
 * pointer. */
struct variable_get_class_cb_data {
	/** Flag owned by the caller; the callback clears it to 0 when a
	 * converted swizzle turns out not to be native. */
	unsigned int *can_change_writemask;

	/** Swizzle combined with each argument swizzle through
	 * rc_adjust_channels(). */
	unsigned int conversion_swizzle;

	/** Compiler whose SwizzleCaps are queried. */
	struct radeon_compiler *c;
};
126
variable_get_class_read_cb(void * userdata,struct rc_instruction * inst,struct rc_pair_instruction_arg * arg,struct rc_pair_instruction_source * src)127 static void variable_get_class_read_cb(
128 void * userdata,
129 struct rc_instruction * inst,
130 struct rc_pair_instruction_arg * arg,
131 struct rc_pair_instruction_source * src)
132 {
133 struct variable_get_class_cb_data * d = userdata;
134 unsigned int new_swizzle = rc_adjust_channels(arg->Swizzle,
135 d->conversion_swizzle);
136 /* We can't just call r300_swizzle_is_native basic here, because it ignores the
137 * extra requirements for presubtract. However, after pair translation we no longer
138 * have the rc_src_register required for the native swizzle, so we have to
139 * reconstruct it. */
140 struct rc_src_register reg = {};
141 reg.Swizzle = new_swizzle;
142 reg.File = src->File;
143
144 assert(inst->Type == RC_INSTRUCTION_PAIR);
145 /* The opcode is unimportant, we can't have TEX here. */
146 if (!d->c->SwizzleCaps->IsNative(RC_OPCODE_MAD, reg)) {
147 *d->can_change_writemask = 0;
148 }
149 }
150
variable_get_class(struct rc_variable * variable,const struct rc_class * classes)151 static unsigned variable_get_class(
152 struct rc_variable * variable,
153 const struct rc_class * classes)
154 {
155 unsigned int i;
156 unsigned int can_change_writemask= 1;
157 unsigned int writemask = rc_variable_writemask_sum(variable);
158 struct rc_list * readers = rc_variable_readers_union(variable);
159 int class_index;
160
161 if (!variable->C->is_r500) {
162 struct rc_class c;
163 struct rc_variable * var_ptr;
164 /* The assumption here is that if an instruction has type
165 * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
166 * r300 and r400 can't swizzle the result of a TEX lookup. */
167 for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
168 if (var_ptr->Inst->Type == RC_INSTRUCTION_NORMAL) {
169 writemask = RC_MASK_XYZW;
170 }
171 }
172
173 /* Check if it is possible to do swizzle packing for r300/r400
174 * without creating non-native swizzles. */
175 class_index = rc_find_class(classes, writemask, 3);
176 if (class_index < 0) {
177 goto error;
178 }
179 c = classes[class_index];
180 if (c.WritemaskCount == 1) {
181 goto done;
182 }
183 for (i = 0; i < c.WritemaskCount; i++) {
184 struct rc_variable * var_ptr;
185 for (var_ptr = variable; var_ptr;
186 var_ptr = var_ptr->Friend) {
187 int j;
188 unsigned int conversion_swizzle =
189 rc_make_conversion_swizzle(
190 writemask, c.Writemasks[i]);
191 struct variable_get_class_cb_data d;
192 d.can_change_writemask = &can_change_writemask;
193 d.conversion_swizzle = conversion_swizzle;
194 d.c = variable->C;
195 /* If we get this far var_ptr->Inst has to
196 * be a pair instruction. If variable or any
197 * of its friends are normal instructions,
198 * then the writemask will be set to RC_MASK_XYZW
199 * and the function will return before it gets
200 * here. */
201 rc_pair_for_all_reads_arg(var_ptr->Inst,
202 variable_get_class_read_cb, &d);
203
204 for (j = 0; j < var_ptr->ReaderCount; j++) {
205 unsigned int old_swizzle;
206 unsigned int new_swizzle;
207 struct rc_reader r = var_ptr->Readers[j];
208 if (r.Inst->Type ==
209 RC_INSTRUCTION_PAIR ) {
210 old_swizzle = r.U.P.Arg->Swizzle;
211 } else {
212 /* Source operands of TEX
213 * instructions can't be
214 * swizzle on r300/r400 GPUs.
215 */
216 can_change_writemask = 0;
217 break;
218 }
219 new_swizzle = rc_rewrite_swizzle(
220 old_swizzle, conversion_swizzle);
221 if (!r300_swizzle_is_native_basic(
222 new_swizzle)) {
223 can_change_writemask = 0;
224 break;
225 }
226 }
227 if (!can_change_writemask) {
228 break;
229 }
230 }
231 if (!can_change_writemask) {
232 break;
233 }
234 }
235 }
236
237 if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
238 /* DDX/DDY seem to always fail when their writemasks are
239 * changed.*/
240 if (is_derivative(variable->Inst->U.P.RGB.Opcode)
241 || is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
242 can_change_writemask = 0;
243 }
244 }
245 for ( ; readers; readers = readers->Next) {
246 struct rc_reader * r = readers->Item;
247 if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
248 if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
249 can_change_writemask = 0;
250 break;
251 }
252 /* DDX/DDY also fail when their swizzles are changed. */
253 if (is_derivative(r->Inst->U.P.RGB.Opcode)
254 || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
255 can_change_writemask = 0;
256 break;
257 }
258 }
259 }
260
261 class_index = rc_find_class(classes, writemask,
262 can_change_writemask ? 3 : 1);
263 done:
264 if (class_index > -1) {
265 return classes[class_index].ID;
266 } else {
267 error:
268 rc_error(variable->C,
269 "Could not find class for index=%u mask=%u\n",
270 variable->Dst.Index, writemask);
271 return 0;
272 }
273 }
274
do_advanced_regalloc(struct regalloc_state * s)275 static void do_advanced_regalloc(struct regalloc_state * s)
276 {
277
278 unsigned int i, input_node, node_count, node_index;
279 struct ra_class ** node_classes;
280 struct rc_instruction * inst;
281 struct rc_list * var_ptr;
282 struct rc_list * variables;
283 struct ra_graph * graph;
284 const struct rc_regalloc_state *ra_state = s->C->regalloc_state;
285
286 /* Get list of program variables */
287 variables = rc_get_variables(s->C);
288 node_count = rc_list_count(variables);
289 node_classes = memory_pool_malloc(&s->C->Pool,
290 node_count * sizeof(struct ra_class *));
291
292 for (var_ptr = variables, node_index = 0; var_ptr;
293 var_ptr = var_ptr->Next, node_index++) {
294 unsigned int class_index;
295 /* Compute the live intervals */
296 rc_variable_compute_live_intervals(var_ptr->Item);
297
298 class_index = variable_get_class(var_ptr->Item, ra_state->class_list);
299 node_classes[node_index] = ra_state->classes[class_index];
300 }
301
302
303 /* Calculate live intervals for input registers */
304 for (inst = s->C->Program.Instructions.Next;
305 inst != &s->C->Program.Instructions;
306 inst = inst->Next) {
307 rc_opcode op = rc_get_flow_control_inst(inst);
308 if (op == RC_OPCODE_BGNLOOP) {
309 struct rc_instruction * endloop =
310 rc_match_bgnloop(inst);
311 if (endloop->IP > s->LoopEnd) {
312 s->LoopEnd = endloop->IP;
313 }
314 }
315 rc_for_all_reads_mask(inst, scan_read_callback, s);
316 }
317
318 /* Compute the writemask for inputs. */
319 for (i = 0; i < s->NumInputs; i++) {
320 unsigned int chan, writemask = 0;
321 for (chan = 0; chan < 4; chan++) {
322 if (s->Input[i].Live[chan].Used) {
323 writemask |= (1 << chan);
324 }
325 }
326 s->Input[i].Writemask = writemask;
327 }
328
329 graph = ra_alloc_interference_graph(ra_state->regs,
330 node_count + s->NumInputs);
331
332 for (node_index = 0; node_index < node_count; node_index++) {
333 ra_set_node_class(graph, node_index, node_classes[node_index]);
334 }
335
336 rc_build_interference_graph(graph, variables);
337
338 /* Add input registers to the interference graph */
339 for (i = 0, input_node = 0; i< s->NumInputs; i++) {
340 if (!s->Input[i].Writemask) {
341 continue;
342 }
343 for (var_ptr = variables, node_index = 0;
344 var_ptr; var_ptr = var_ptr->Next, node_index++) {
345 struct rc_variable * var = var_ptr->Item;
346 if (rc_overlap_live_intervals_array(s->Input[i].Live,
347 var->Live)) {
348 ra_add_node_interference(graph, node_index,
349 node_count + input_node);
350 }
351 }
352 /* Manually allocate a register for this input */
353 ra_set_node_reg(graph, node_count + input_node, get_reg_id(
354 s->Input[i].Index, s->Input[i].Writemask));
355 input_node++;
356 }
357
358 if (!ra_allocate(graph)) {
359 rc_error(s->C, "Ran out of hardware temporaries\n");
360 ralloc_free(graph);
361 return;
362 }
363
364 /* Rewrite the registers */
365 for (var_ptr = variables, node_index = 0; var_ptr;
366 var_ptr = var_ptr->Next, node_index++) {
367 int reg = ra_get_node_reg(graph, node_index);
368 unsigned int writemask = reg_get_writemask(reg);
369 unsigned int index = reg_get_index(reg);
370 struct rc_variable * var = var_ptr->Item;
371
372 if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
373 writemask = rc_variable_writemask_sum(var);
374 }
375
376 if (var->Dst.File == RC_FILE_INPUT) {
377 continue;
378 }
379 rc_variable_change_dst(var, index, writemask);
380 }
381
382 ralloc_free(graph);
383 }
384
385 /**
386 * @param user This parameter should be a pointer to an integer value. If this
387 * integer value is zero, then a simple register allocator will be used that
388 * only allocates space for input registers (\sa do_regalloc_inputs_only). If
389 * user is non-zero, then the regular register allocator will be used
390 * (\sa do_regalloc).
391 */
rc_pair_regalloc(struct radeon_compiler * cc,void * user)392 void rc_pair_regalloc(struct radeon_compiler *cc, void *user)
393 {
394 struct r300_fragment_program_compiler *c =
395 (struct r300_fragment_program_compiler*)cc;
396 struct regalloc_state s;
397 int * do_full_regalloc = (int*)user;
398
399 memset(&s, 0, sizeof(s));
400 s.C = cc;
401 s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
402 s.Input = memory_pool_malloc(&cc->Pool,
403 s.NumInputs * sizeof(struct register_info));
404 memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));
405
406 s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
407 s.Temporary = memory_pool_malloc(&cc->Pool,
408 s.NumTemporaries * sizeof(struct register_info));
409 memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));
410
411 rc_recompute_ips(s.C);
412
413 c->AllocateHwInputs(c, &alloc_input_simple, &s);
414 if (*do_full_regalloc) {
415 do_advanced_regalloc(&s);
416 } else {
417 s.Simple = 1;
418 do_regalloc_inputs_only(&s);
419 }
420
421 /* Rewrite inputs and if we are doing the simple allocation, rewrite
422 * temporaries too. */
423 for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
424 inst != &s.C->Program.Instructions;
425 inst = inst->Next) {
426 rc_remap_registers(inst, &remap_register, &s);
427 }
428 }
429