/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static bool
is_shared_consts(struct ir3_compiler *compiler,
                 struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
       reg->flags & IR3_REG_CONST) {
      uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
      uint32_t max_const_reg =
         regid(compiler->shared_consts_base_offset +
               compiler->shared_consts_size, 0);
      return reg->num >= min_const_reg && reg->num < max_const_reg;
   }

   return false;
}

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   /* Shared consts don't need to be included into constlen. */
   if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
      return;

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

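   /* Note: regid() packs a register as (num << 2) | component, so the ">> 2"
    * shifts below convert a component index back into a (vec4) register
    * index, and ">> 3" folds a half-register component onto the merged
    * full-register file used on a6xx+.
    */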
   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize respect that. */
   if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                          v->branchstack *
                                          compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* If we have a compute shader with a big workgroup, a barrier, and a
       * branchstack which limits max_waves, we may end up unable to run all
       * waves of the workgroup concurrently, which would lead to a hang.
       *
       * TODO: Could we spill branchstack, or is there another way around it?
       * The blob just explodes in such a case.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s) which has workgroup barrier cannot be used "
            "because it's impossible to have enough concurrent waves.",
            v->name);
         exit(1);
      }
   }

   return max_waves;
}

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;
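   /* Note: each ir3 instruction encodes to 64 bits, hence the "* 8" above
    * gives the size in bytes, and sizedwords the same size in 32-bit dwords.
    */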

   bool in_preamble = false;
   bool has_eq = false;

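   /* Walk the instructions to gather stats. in_preamble tracks whether we
    * are between shps/shpe (the preamble), which is excluded from most of
    * the instruction-count stats below; has_eq records whether an explicit
    * (eq) nop marked the last use of helper invocations.
    */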
   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;
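      /* Rough estimate of how much of the "soft" delay from an (ss) or (sy)
       * producer is still outstanding; used to attribute stall cycles to
       * sstall/systall when the corresponding sync flag is encountered.
       */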

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
            info->last_helper = info->instrs_count;
            has_eq = true;
         }

         if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
             instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
            info->last_helper = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else if (!is_meta(instr)) {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* For the vertex shader, the inputs are loaded into registers before the
    * shader is executed, so max_regs from the shader instructions might not
    * properly reflect the # of registers actually used, especially in the
    * case of passthrough varyings.
    *
    * Likewise, for the fragment shader, we can have some regs which are
    * passed input values but never touched by the resulting shader (ie. as
    * a result of dead code elimination, or simply because we don't know how
    * to turn the reg off).
    */
   for (unsigned i = 0; i < v->inputs_count; i++) {
      /* skip frag inputs fetched via bary.f, since their regs are not
       * written by the gpu before the shader starts (and in fact the
       * regids might not even be valid)
       */
      if (v->inputs[i].bary)
         continue;

      /* ignore high regs that are global to all threads in a warp
       * (they exist by default) (a5xx+)
       */
      if (v->inputs[i].regid >= regid(48, 0))
         continue;

      if (v->inputs[i].compmask) {
         unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
         int32_t regid = v->inputs[i].regid + n;
         if (v->inputs[i].half) {
            if (!v->mergedregs) {
               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
            } else {
               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
            }
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
         }
      }
   }

   for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
      unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
      int32_t regid = v->sampler_prefetch[i].dst + n;
      if (v->sampler_prefetch[i].half_precision) {
         if (!v->mergedregs) {
            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
         }
      } else {
         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
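   /* i.e. the number of full (vec4) registers used, plus - on a6xx+ where
    * the half regfile is merged into the full one - the rounded-up number of
    * full regs covered by the half registers (two half regs per full reg).
    */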

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);

   /* TODO this is different for earlier gens, but earlier gens don't use this */
   info->subgroup_size = v->info.double_threadsize ? 128 : 64;

   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_link_physical(struct ir3_block *pred,
                        struct ir3_block *succ)
{
   array_insert(pred, pred->physical_successors, succ);
   array_insert(succ, succ->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

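   /* The dsts/srcs pointer arrays live in the same allocation, immediately
    * after the instruction struct itself.
    */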
   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

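   /* The address register source, when present, is assumed to be the last
    * src, so re-point the clone's address at its own copy of that register.
    */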
   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted. Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types. Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value,
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
      { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
      { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
      { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
      { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
      { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
      { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
      { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
      { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
      { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
      { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
      { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
      { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         return flags == 0;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}