1 /*
2 * Copyright © 2012 Rob Clark <robdclark@gmail.com>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "ir3.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <stdbool.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "util/bitscan.h"
16 #include "util/half_float.h"
17 #include "util/ralloc.h"
18 #include "util/u_math.h"
19
20 #include "instr-a3xx.h"
21 #include "ir3_shader.h"
22
23 /* simple allocator to carve allocations out of an up-front allocated heap,
24 * so that we can free everything easily in one shot.
25 */
26 void *
27 ir3_alloc(struct ir3 *shader, int sz)
28 {
29 return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
30 }
31
32 struct ir3 *
33 ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
34 {
35 struct ir3 *shader = rzalloc(v, struct ir3);
36
37 shader->compiler = compiler;
38 shader->type = v->type;
39
40 list_inithead(&shader->block_list);
41 list_inithead(&shader->array_list);
42
43 return shader;
44 }
45
46 void
47 ir3_destroy(struct ir3 *shader)
48 {
49 ralloc_free(shader);
50 }
51
52 static bool
53 is_shared_consts(struct ir3_compiler *compiler,
54 const struct ir3_const_state *const_state,
55 struct ir3_register *reg)
56 {
57 if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
58 reg->flags & IR3_REG_CONST) {
59 uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
60 uint32_t max_const_reg =
61 regid(compiler->shared_consts_base_offset +
62 compiler->shared_consts_size, 0);
63 return reg->num >= min_const_reg && reg->num < max_const_reg;
64 }
65
66 return false;
67 }
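/* Worked example for the range check above (hypothetical values, not taken
 * from any particular GPU): with shared_consts_base_offset = 10 (vec4) and
 * shared_consts_size = 8 (vec4),
 *
 *    min_const_reg = regid(10, 0) = 40
 *    max_const_reg = regid(18, 0) = 72
 *
 * so a source reading c12.x (reg->num == regid(12, 0) == 48) is treated as a
 * shared const and excluded from the constlen accounting below.
 */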
68
69 static void
70 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
71 struct ir3_info *info)
72 {
73 struct ir3_shader_variant *v = info->data;
74
75 if (reg->flags & IR3_REG_IMMED) {
76 /* nothing to do */
77 return;
78 }
79
80 /* Shared consts don't need to be included in constlen. */
81 if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
82 return;
83
84 unsigned components;
85 int16_t max;
86
87 if (reg->flags & IR3_REG_RELATIV) {
88 components = reg->size;
89 max = (reg->array.base + components - 1);
90 } else {
91 components = util_last_bit(reg->wrmask);
92 max = (reg->num + components - 1);
93 }
94
95 if (reg->flags & IR3_REG_CONST) {
96 info->max_const = MAX2(info->max_const, max >> 2);
97 } else if (max < regid(48, 0)) {
98 if (reg->flags & IR3_REG_HALF) {
99 if (v->mergedregs) {
100 /* starting w/ a6xx, half regs conflict with full regs: */
101 info->max_reg = MAX2(info->max_reg, max >> 3);
102 } else {
103 info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
104 }
105 } else {
106 info->max_reg = MAX2(info->max_reg, max >> 2);
107 }
108 }
109 }
110
111 bool
112 ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
113 {
114 const struct ir3_compiler *compiler = v->compiler;
115
116 /* If the user forced a particular wavesize respect that. */
117 if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
118 return false;
119 if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
120 return true;
121
122 /* We can't support more than compiler->branchstack_size diverging threads
123 * in a wave. Thus, doubling the threadsize is only possible if we don't
124 * exceed the branchstack size limit.
125 */
126 if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
127 compiler->branchstack_size) {
128 return false;
129 }
130
131 switch (v->type) {
132 case MESA_SHADER_KERNEL:
133 case MESA_SHADER_COMPUTE: {
134 unsigned threads_per_wg =
135 v->local_size[0] * v->local_size[1] * v->local_size[2];
136
137 /* For a5xx, if the workgroup size is greater than the maximum number
138 * of threads per core with 32 threads per wave (512) then we have to
139 * use the doubled threadsize because otherwise the workgroup wouldn't
140 * fit. For smaller workgroup sizes, we follow the blob and use the
141 * smaller threadsize.
142 */
143 if (compiler->gen < 6) {
144 return v->local_size_variable ||
145 threads_per_wg >
146 compiler->threadsize_base * compiler->max_waves;
147 }
148
149 /* On a6xx, we prefer the larger threadsize unless the workgroup is
150 * small enough that it would be useless. Note that because
151 * threadsize_base is bumped to 64, we don't have to worry about the
152 * workgroup fitting, unlike the a5xx case.
153 */
154 if (!v->local_size_variable) {
155 if (threads_per_wg <= compiler->threadsize_base)
156 return false;
157 }
158 }
159 FALLTHROUGH;
160 case MESA_SHADER_FRAGMENT: {
161 /* Check that doubling the threadsize wouldn't exceed the regfile size */
162 return regs_count * 2 <= compiler->reg_size_vec4;
163 }
164
165 default:
166 /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
167 * stages - the bit doesn't exist. The blob never used it for the VS
168 * on earlier gens anyway.
169 */
170 return false;
171 }
172 }
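/* Illustrative example of the compute-shader decision above, assuming an
 * a6xx-style threadsize_base of 64: a fixed 8x8x1 workgroup (64 invocations)
 * fits in a single wave, so the smaller threadsize is kept, while a 16x16x1
 * workgroup (256 invocations) prefers the doubled threadsize as long as
 * 2 * regs_count still fits in reg_size_vec4.
 */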
173
174 /* Get the maximum number of waves that could be used even if this shader
175 * didn't use any registers.
176 */
177 unsigned
178 ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
179 bool double_threadsize)
180 {
181 const struct ir3_compiler *compiler = v->compiler;
182 unsigned max_waves = compiler->max_waves;
183
184 /* Compute the limit based on branchstack */
185 if (v->branchstack > 0) {
186 unsigned branchstack_max_waves = compiler->branchstack_size /
187 v->branchstack *
188 compiler->wave_granularity;
189 max_waves = MIN2(max_waves, branchstack_max_waves);
190 }
191
192 /* If this is a compute shader, compute the limit based on shared size */
193 if ((v->type == MESA_SHADER_COMPUTE) ||
194 (v->type == MESA_SHADER_KERNEL)) {
195 unsigned threads_per_wg =
196 v->local_size[0] * v->local_size[1] * v->local_size[2];
197 unsigned waves_per_wg =
198 DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
199 (double_threadsize ? 2 : 1) *
200 compiler->wave_granularity);
201
202 /* Shared is allocated in chunks of 1k */
203 unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
204 if (shared_per_wg > 0 && !v->local_size_variable) {
205 unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
206
207 max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
208 compiler->wave_granularity);
209 }
210
211 /* If we have a compute shader that has a big workgroup, a barrier, and
212 * a branchstack which limits max_waves, this may result in a situation
213 * where we cannot run all waves of the workgroup concurrently, which
214 * would lead to a hang.
215 *
216 * TODO: Could we spill branchstack or is there another way around this?
217 * The blob just explodes in such a case.
218 */
219 if (v->has_barrier && (max_waves < waves_per_wg)) {
220 mesa_loge(
221 "Compute shader (%s) which has workgroup barrier cannot be used "
222 "because it's impossible to have enough concurrent waves.",
223 v->name);
224 exit(1);
225 }
226 }
227
228 return max_waves;
229 }
230
231 /* Get the maximum number of waves that could be launched limited by reg size.
232 */
233 unsigned
234 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
235 unsigned reg_count, bool double_threadsize)
236 {
237 return reg_count ? (compiler->reg_size_vec4 /
238 (reg_count * (double_threadsize ? 2 : 1)) *
239 compiler->wave_granularity)
240 : compiler->max_waves;
241 }
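/* For example (hypothetical numbers): with reg_size_vec4 = 96,
 * wave_granularity = 2 and a shader using 24 full vec4 registers at single
 * threadsize, this yields (96 / 24) * 2 = 8 waves; doubling the threadsize
 * halves that to (96 / 48) * 2 = 4.
 */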
242
243 void
244 ir3_collect_info(struct ir3_shader_variant *v)
245 {
246 struct ir3_info *info = &v->info;
247 struct ir3 *shader = v->ir;
248 const struct ir3_compiler *compiler = v->compiler;
249
250 memset(info, 0, sizeof(*info));
251 info->data = v;
252 info->max_reg = -1;
253 info->max_half_reg = -1;
254 info->max_const = -1;
255 info->multi_dword_ldp_stp = false;
256
257 uint32_t instr_count = 0;
258 foreach_block (block, &shader->block_list) {
259 foreach_instr (instr, &block->instr_list) {
260 instr_count++;
261 }
262 }
263
264 v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
265
266 /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
267 * doesn't try to decode the following data as instructions (such as the
268 * next stage's shader in turnip)
269 */
270 info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
271 info->sizedwords = info->size / 4;
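/* E.g. (hypothetical numbers): with instr_count = 53 and instr_align = 16,
 * the shader is padded to instrlen = 4 groups, i.e.
 * MAX2(4 * 16, 53 + 4) = 64 instructions = 512 bytes = 128 dwords.
 */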
272
273 info->early_preamble = v->early_preamble;
274
275 bool in_preamble = false;
276 bool has_eq = false;
277
278 /* Track which registers are currently aliases because they shouldn't be
279 * included in the GPR footprint.
280 */
281 regmask_t aliases;
282
283 /* Full and half aliases do not overlap so treat them as !mergedregs. */
284 regmask_init(&aliases, false);
285
286 foreach_block (block, &shader->block_list) {
287 int sfu_delay = 0, mem_delay = 0;
288
289 foreach_instr (instr, &block->instr_list) {
290
291 foreach_src (reg, instr) {
292 if (!is_reg_gpr(reg) || !regmask_get(&aliases, reg)) {
293 collect_reg_info(instr, reg, info);
294 }
295 }
296
297 foreach_dst (reg, instr) {
298 if (instr->opc == OPC_ALIAS &&
299 instr->cat7.alias_scope == ALIAS_TEX) {
300 regmask_set(&aliases, instr->dsts[0]);
301 } else if (is_dest_gpr(reg)) {
302 collect_reg_info(instr, reg, info);
303 }
304 }
305
306 if (is_tex(instr)) {
307 /* All aliases are cleared after they are used. */
308 regmask_init(&aliases, false);
309 }
310
311 if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
312 unsigned components = instr->srcs[2]->uim_val;
313
314 /* This covers any multi-component access that could straddle
315 * multiple double-words.
316 */
317 if (components > 1)
318 info->multi_dword_ldp_stp = true;
319
320 if (instr->opc == OPC_STP)
321 info->stp_count += components;
322 else
323 info->ldp_count += components;
324 }
325
326 if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
327 (instr->dsts[0]->flags & IR3_REG_EI))
328 info->last_baryf = info->instrs_count;
329
330 if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
331 info->last_helper = info->instrs_count;
332 has_eq = true;
333 }
334
335 if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
336 instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
337 info->last_helper = info->instrs_count;
338
339 if (instr->opc == OPC_SHPS)
340 in_preamble = true;
341
342 /* Don't count instructions in the preamble for instruction-count type
343 * stats, because their effect should be much smaller.
344 * TODO: we should probably have separate stats for preamble
345 * instructions, but that would blow up the amount of stats...
346 */
347 if (!in_preamble) {
348 unsigned instrs_count = 1 + instr->repeat + instr->nop;
349 unsigned nops_count = instr->nop;
350
351 if (instr->opc == OPC_NOP) {
352 nops_count = 1 + instr->repeat;
353 info->instrs_per_cat[0] += nops_count;
354 } else if (!is_meta(instr)) {
355 info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
356 info->instrs_per_cat[0] += nops_count;
357 }
358
359 if (instr->opc == OPC_MOV) {
360 if (instr->cat1.src_type == instr->cat1.dst_type) {
361 info->mov_count += 1 + instr->repeat;
362 } else {
363 info->cov_count += 1 + instr->repeat;
364 }
365 }
366
367 info->instrs_count += instrs_count;
368 info->nops_count += nops_count;
369
370 if (instr->flags & IR3_INSTR_SS) {
371 info->ss++;
372 info->sstall += sfu_delay;
373 sfu_delay = 0;
374 }
375
376 if (instr->flags & IR3_INSTR_SY) {
377 info->sy++;
378 info->systall += mem_delay;
379 mem_delay = 0;
380 }
381
382 if (is_ss_producer(instr)) {
383 sfu_delay = soft_ss_delay(instr);
384 } else {
385 int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
386 sfu_delay -= n;
387 }
388
389 if (is_sy_producer(instr)) {
390 mem_delay = soft_sy_delay(instr, shader);
391 } else {
392 int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
393 mem_delay -= n;
394 }
395 } else {
396 unsigned instrs_count = 1 + instr->repeat + instr->nop;
397 info->preamble_instrs_count += instrs_count;
398 }
399
400 if (instr->opc == OPC_SHPE)
401 in_preamble = false;
402 }
403 }
404
405 /* for vertex shader, the inputs are loaded into registers before the shader
406 * is executed, so max_regs from the shader instructions might not properly
407 * reflect the # of registers actually used, especially in the case of
408 * passthrough varyings.
409 *
410 * Likewise, for fragment shader, we can have some regs which are passed
411 * input values but never touched by the resulting shader (ie. as a result
412 * of dead code elimination or simply because we don't know how to turn
413 * the reg off).
414 */
415 for (unsigned i = 0; i < v->inputs_count; i++) {
416 /* skip frag inputs fetch via bary.f since their reg's are
417 * not written by gpu before shader starts (and in fact the
418 * regid's might not even be valid)
419 */
420 if (v->inputs[i].bary)
421 continue;
422
423 /* ignore high regs that are global to all threads in a warp
424 * (they exist by default) (a5xx+)
425 */
426 if (v->inputs[i].regid >= regid(48, 0))
427 continue;
428
429 if (v->inputs[i].compmask) {
430 unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
431 int32_t regid = v->inputs[i].regid + n;
432 if (v->inputs[i].half) {
433 if (!v->mergedregs) {
434 v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
435 } else {
436 v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
437 }
438 } else {
439 v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
440 }
441 }
442 }
443
444 for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
445 unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
446 int32_t regid = v->sampler_prefetch[i].dst + n;
447 if (v->sampler_prefetch[i].half_precision) {
448 if (!v->mergedregs) {
449 v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
450 } else {
451 v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
452 }
453 } else {
454 v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
455 }
456 }
457
458 /* TODO: for a5xx and below, is there a separate regfile for
459 * half-registers?
460 */
461 unsigned regs_count =
462 info->max_reg + 1 +
463 (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
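/* For example (hypothetical): if the highest full reg touched is r19.w
 * (max_reg = 19) and the highest half reg is hr5.x (max_half_reg = 5), then
 * on gen6+ regs_count = 20 + (5 + 2) / 2 = 23 vec4s.
 */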
464
465 info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
466
467 /* TODO this is different for earlier gens, but earlier gens don't use this */
468 info->subgroup_size = v->info.double_threadsize ? 128 : 64;
469
470 unsigned reg_independent_max_waves =
471 ir3_get_reg_independent_max_waves(v, info->double_threadsize);
472 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
473 compiler, regs_count, info->double_threadsize);
474 info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
475 assert(info->max_waves <= v->compiler->max_waves);
476 }
477
478 static struct ir3_register *
479 reg_create(struct ir3 *shader, int num, int flags)
480 {
481 struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
482 reg->wrmask = 1;
483 reg->flags = flags;
484 reg->num = num;
485 return reg;
486 }
487
488 static void
489 insert_instr(struct ir3_cursor cursor, struct ir3_instruction *instr)
490 {
491 struct ir3 *shader = instr->block->shader;
492
493 instr->serialno = ++shader->instr_count;
494
495 switch (cursor.option) {
496 case IR3_CURSOR_BEFORE_BLOCK:
497 list_add(&instr->node, &cursor.block->instr_list);
498 break;
499 case IR3_CURSOR_AFTER_BLOCK:
500 list_addtail(&instr->node, &cursor.block->instr_list);
501 break;
502 case IR3_CURSOR_BEFORE_INSTR:
503 list_addtail(&instr->node, &cursor.instr->node);
504 break;
505 case IR3_CURSOR_AFTER_INSTR:
506 list_add(&instr->node, &cursor.instr->node);
507 break;
508 }
509
510 if (is_input(instr))
511 array_insert(shader, shader->baryfs, instr);
512 }
513
514 struct ir3_block *
515 ir3_block_create(struct ir3 *shader)
516 {
517 struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
518 #if MESA_DEBUG
519 block->serialno = ++shader->block_count;
520 #endif
521 block->shader = shader;
522 list_inithead(&block->node);
523 list_inithead(&block->instr_list);
524 return block;
525 }
526
527 struct ir3_instruction *
528 ir3_find_end(struct ir3 *ir)
529 {
530 foreach_block_rev (block, &ir->block_list) {
531 foreach_instr_rev (instr, &block->instr_list) {
532 if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
533 return instr;
534 }
535 }
536 unreachable("couldn't find end instruction");
537 }
538
539 static struct ir3_instruction *
540 block_get_last_instruction(struct ir3_block *block)
541 {
542 if (list_is_empty(&block->instr_list))
543 return NULL;
544 return list_last_entry(&block->instr_list, struct ir3_instruction, node);
545 }
546
547 struct ir3_instruction *
548 ir3_block_get_terminator(struct ir3_block *block)
549 {
550 struct ir3_instruction *last = block_get_last_instruction(block);
551
552 if (last && is_terminator(last))
553 return last;
554
555 return NULL;
556 }
557
558 struct ir3_instruction *
559 ir3_block_take_terminator(struct ir3_block *block)
560 {
561 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
562
563 if (terminator)
564 list_delinit(&terminator->node);
565
566 return terminator;
567 }
568
569 struct ir3_instruction *
570 ir3_block_get_last_non_terminator(struct ir3_block *block)
571 {
572 struct ir3_instruction *last = block_get_last_instruction(block);
573
574 if (!last)
575 return NULL;
576
577 if (!is_terminator(last))
578 return last;
579
580 if (last->node.prev != &block->instr_list)
581 return list_entry(last->node.prev, struct ir3_instruction, node);
582
583 return NULL;
584 }
585
586 struct ir3_instruction *
587 ir3_block_get_last_phi(struct ir3_block *block)
588 {
589 struct ir3_instruction *last_phi = NULL;
590
591 foreach_instr (instr, &block->instr_list) {
592 if (instr->opc != OPC_META_PHI)
593 break;
594
595 last_phi = instr;
596 }
597
598 return last_phi;
599 }
600
601 struct ir3_instruction *
602 ir3_find_shpe(struct ir3 *ir)
603 {
604 if (!ir3_has_preamble(ir)) {
605 return NULL;
606 }
607
608 foreach_block (block, &ir->block_list) {
609 struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
610
611 if (last && last->opc == OPC_SHPE) {
612 return last;
613 }
614 }
615
616 unreachable("preamble without shpe");
617 }
618
619 struct ir3_instruction *
620 ir3_create_empty_preamble(struct ir3 *ir)
621 {
622 assert(!ir3_has_preamble(ir));
623
624 struct ir3_block *main_start_block = ir3_start_block(ir);
625
626 /* Create a preamble CFG similar to what the frontend would generate. Note
627 * that the empty else_block is important for ir3_after_preamble to work.
628 *
629 * shps_block:
630 * if (shps) {
631 * getone_block:
632 * if (getone) {
633 * body_block:
634 * shpe
635 * }
636 * } else {
637 * else_block:
638 * }
639 * main_start_block:
640 */
641 struct ir3_block *shps_block = ir3_block_create(ir);
642 struct ir3_block *getone_block = ir3_block_create(ir);
643 struct ir3_block *body_block = ir3_block_create(ir);
644 struct ir3_block *else_block = ir3_block_create(ir);
645 list_add(&else_block->node, &ir->block_list);
646 list_add(&body_block->node, &ir->block_list);
647 list_add(&getone_block->node, &ir->block_list);
648 list_add(&shps_block->node, &ir->block_list);
649
650 struct ir3_builder b = ir3_builder_at(ir3_after_block(shps_block));
651 ir3_SHPS(&b);
652 shps_block->successors[0] = getone_block;
653 ir3_block_add_predecessor(getone_block, shps_block);
654 ir3_block_link_physical(shps_block, getone_block);
655 shps_block->successors[1] = else_block;
656 ir3_block_add_predecessor(else_block, shps_block);
657 ir3_block_link_physical(shps_block, else_block);
658
659 b.cursor = ir3_after_block(getone_block);
660 ir3_GETONE(&b);
661 getone_block->divergent_condition = true;
662 getone_block->successors[0] = body_block;
663 ir3_block_add_predecessor(body_block, getone_block);
664 ir3_block_link_physical(getone_block, body_block);
665 getone_block->successors[1] = main_start_block;
666 ir3_block_add_predecessor(main_start_block, getone_block);
667 ir3_block_link_physical(getone_block, main_start_block);
668
669 b.cursor = ir3_after_block(body_block);
670 struct ir3_instruction *shpe = ir3_SHPE(&b);
671 shpe->barrier_class = shpe->barrier_conflict = IR3_BARRIER_CONST_W;
672 array_insert(body_block, body_block->keeps, shpe);
673 ir3_JUMP(&b);
674 body_block->successors[0] = main_start_block;
675 ir3_block_add_predecessor(main_start_block, body_block);
676 ir3_block_link_physical(body_block, main_start_block);
677
678 b.cursor = ir3_after_block(else_block);
679 ir3_JUMP(&b);
680 else_block->successors[0] = main_start_block;
681 ir3_block_add_predecessor(main_start_block, else_block);
682 ir3_block_link_physical(else_block, main_start_block);
683
684 main_start_block->reconvergence_point = true;
685
686 return shpe;
687 }
688
689 void
690 ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
691 {
692 array_insert(block, block->predecessors, pred);
693 }
694
695 void
696 ir3_block_link_physical(struct ir3_block *pred,
697 struct ir3_block *succ)
698 {
699 array_insert(pred, pred->physical_successors, succ);
700 array_insert(succ, succ->physical_predecessors, pred);
701 }
702
703 void
704 ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
705 {
706 for (unsigned i = 0; i < block->predecessors_count; i++) {
707 if (block->predecessors[i] == pred) {
708 if (i < block->predecessors_count - 1) {
709 block->predecessors[i] =
710 block->predecessors[block->predecessors_count - 1];
711 }
712
713 block->predecessors_count--;
714 return;
715 }
716 }
717 }
718
719 unsigned
720 ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
721 {
722 for (unsigned i = 0; i < block->predecessors_count; i++) {
723 if (block->predecessors[i] == pred) {
724 return i;
725 }
726 }
727
728 unreachable("ir3_block_get_pred_index() invalid predecessor");
729 }
730
731 static struct ir3_instruction *
732 instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
733 {
734 /* Add extra sources for array destinations and the address reg */
735 if (1 <= opc_cat(opc))
736 nsrc += 2;
737 struct ir3_instruction *instr;
738 unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
739 (nsrc * sizeof(instr->srcs[0]));
740 char *ptr = ir3_alloc(block->shader, sz);
741
742 instr = (struct ir3_instruction *)ptr;
743 ptr += sizeof(*instr);
744 instr->dsts = (struct ir3_register **)ptr;
745 instr->srcs = instr->dsts + ndst;
746
747 #if MESA_DEBUG
748 instr->dsts_max = ndst;
749 instr->srcs_max = nsrc;
750 #endif
751
752 list_inithead(&instr->rpt_node);
753 return instr;
754 }
755
756 static void
757 add_to_address_users(struct ir3_instruction *instr)
758 {
759 assert(instr->address != NULL);
760
761 struct ir3 *ir = instr->block->shader;
762 struct ir3_register *addr_reg = instr->address->def;
763 assert(reg_num(addr_reg) == REG_A0);
764 unsigned comp = reg_comp(addr_reg);
765 if (comp == 0) {
766 array_insert(ir, ir->a0_users, instr);
767 } else {
768 assert(comp == 1);
769 array_insert(ir, ir->a1_users, instr);
770 }
771 }
772
773 static struct ir3_block *
774 get_block(struct ir3_cursor cursor)
775 {
776 switch (cursor.option) {
777 case IR3_CURSOR_BEFORE_BLOCK:
778 case IR3_CURSOR_AFTER_BLOCK:
779 return cursor.block;
780 case IR3_CURSOR_BEFORE_INSTR:
781 case IR3_CURSOR_AFTER_INSTR:
782 return cursor.instr->block;
783 }
784
785 unreachable("illegal cursor option");
786 }
787
788 struct ir3_instruction *
789 ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc, int ndst, int nsrc)
790 {
791 struct ir3_block *block = get_block(cursor);
792 struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
793 instr->block = block;
794 instr->opc = opc;
795 insert_instr(cursor, instr);
796 return instr;
797 }
798
799 struct ir3_instruction *
800 ir3_build_instr(struct ir3_builder *builder, opc_t opc, int ndst, int nsrc)
801 {
802 struct ir3_instruction *instr =
803 ir3_instr_create_at(builder->cursor, opc, ndst, nsrc);
804
805 /* During instruction selection, instructions are sometimes emitted to blocks
806 * other than the current one. For example, to predecessor blocks for phi
807 * sources or to the first block for inputs. For those cases, a new builder
808 * is created to emit at the end of the target block. However, if the target
809 * block happens to be the same as the current block, the main builder would
810 * not be updated to point past the new instructions. Therefore, don't update
811 * the cursor when it points to the end of a block to ensure that new
812 * instructions will always be added at the end.
813 */
814 if (builder->cursor.option != IR3_CURSOR_AFTER_BLOCK) {
815 builder->cursor = ir3_after_instr(instr);
816 }
817
818 return instr;
819 }
820
821 struct ir3_instruction *
822 ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
823 {
824 return ir3_instr_create_at(ir3_before_terminator(block), opc, ndst, nsrc);
825 }
826
827 struct ir3_instruction *
828 ir3_instr_create_at_end(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
829 {
830 return ir3_instr_create_at(ir3_after_block(block), opc, ndst, nsrc);
831 }
832
833 struct ir3_instruction *
834 ir3_instr_clone(struct ir3_instruction *instr)
835 {
836 struct ir3_instruction *new_instr = instr_create(
837 instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
838 struct ir3_register **dsts, **srcs;
839
840 dsts = new_instr->dsts;
841 srcs = new_instr->srcs;
842 *new_instr = *instr;
843 new_instr->dsts = dsts;
844 new_instr->srcs = srcs;
845 list_inithead(&new_instr->rpt_node);
846
847 insert_instr(ir3_before_terminator(instr->block), new_instr);
848
849 /* clone registers: */
850 new_instr->dsts_count = 0;
851 new_instr->srcs_count = 0;
852 foreach_dst (reg, instr) {
853 struct ir3_register *new_reg =
854 ir3_dst_create(new_instr, reg->num, reg->flags);
855 *new_reg = *reg;
856 if (new_reg->instr)
857 new_reg->instr = new_instr;
858 }
859 foreach_src (reg, instr) {
860 struct ir3_register *new_reg =
861 ir3_src_create(new_instr, reg->num, reg->flags);
862 *new_reg = *reg;
863 }
864
865 if (instr->address) {
866 assert(instr->srcs_count > 0);
867 new_instr->address = new_instr->srcs[instr->srcs_count - 1];
868 add_to_address_users(new_instr);
869 }
870
871 return new_instr;
872 }
873
874 /* Add a false dependency to instruction, to ensure it is scheduled first: */
875 void
876 ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
877 {
878 for (unsigned i = 0; i < instr->deps_count; i++) {
879 if (instr->deps[i] == dep)
880 return;
881 }
882
883 array_insert(instr, instr->deps, dep);
884 }
885
886 void
887 ir3_instr_remove(struct ir3_instruction *instr)
888 {
889 list_delinit(&instr->node);
890 list_delinit(&instr->rpt_node);
891 }
892
893 void
894 ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n)
895 {
896 assert(n > 0 && !ir3_instr_is_rpt(instrs[0]));
897
898 for (unsigned i = 1; i < n; ++i) {
899 assert(!ir3_instr_is_rpt(instrs[i]));
900 assert(instrs[i]->serialno > instrs[i - 1]->serialno);
901
902 list_addtail(&instrs[i]->rpt_node, &instrs[0]->rpt_node);
903 }
904 }
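/* Usage sketch (hypothetical): after emitting four consecutive instructions
 * into instrs[0..3], linking them with
 *
 *    ir3_instr_create_rpt(instrs, 4);
 *
 * makes ir3_instr_is_first_rpt(instrs[0]) true and
 * ir3_instr_rpt_length(instrs[0]) return 4.
 */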
905
906 bool
907 ir3_instr_is_rpt(const struct ir3_instruction *instr)
908 {
909 return !list_is_empty(&instr->rpt_node);
910 }
911
912 bool
913 ir3_instr_is_first_rpt(const struct ir3_instruction *instr)
914 {
915 if (!ir3_instr_is_rpt(instr))
916 return false;
917
918 struct ir3_instruction *prev_rpt =
919 list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
920 return prev_rpt->serialno > instr->serialno;
921 }
922
923 struct ir3_instruction *
924 ir3_instr_prev_rpt(const struct ir3_instruction *instr)
925 {
926 assert(ir3_instr_is_rpt(instr));
927
928 if (ir3_instr_is_first_rpt(instr))
929 return NULL;
930 return list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
931 }
932
933 struct ir3_instruction *
934 ir3_instr_first_rpt(struct ir3_instruction *instr)
935 {
936 assert(ir3_instr_is_rpt(instr));
937
938 while (!ir3_instr_is_first_rpt(instr)) {
939 instr = ir3_instr_prev_rpt(instr);
940 assert(instr);
941 }
942
943 return instr;
944 }
945
946 unsigned
947 ir3_instr_rpt_length(const struct ir3_instruction *instr)
948 {
949 assert(ir3_instr_is_first_rpt(instr));
950
951 return list_length(&instr->rpt_node) + 1;
952 }
953
954 struct ir3_register *
955 ir3_src_create(struct ir3_instruction *instr, int num, int flags)
956 {
957 struct ir3 *shader = instr->block->shader;
958 #if MESA_DEBUG
959 assert(instr->srcs_count < instr->srcs_max);
960 #endif
961 struct ir3_register *reg = reg_create(shader, num, flags);
962 instr->srcs[instr->srcs_count++] = reg;
963 return reg;
964 }
965
966 struct ir3_register *
967 ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
968 {
969 struct ir3 *shader = instr->block->shader;
970 #if MESA_DEBUG
971 assert(instr->dsts_count < instr->dsts_max);
972 #endif
973 struct ir3_register *reg = reg_create(shader, num, flags);
974 instr->dsts[instr->dsts_count++] = reg;
975 return reg;
976 }
977
978 struct ir3_register *
979 ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
980 {
981 struct ir3_register *new_reg = reg_create(shader, 0, 0);
982 *new_reg = *reg;
983 return new_reg;
984 }
985
986 void
987 ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
988 struct ir3_register *last_write)
989 {
990 assert(reg->flags & IR3_REG_ARRAY);
991 struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
992 *new_reg = *reg;
993 new_reg->def = last_write;
994 ir3_reg_tie(reg, new_reg);
995 }
996
997 void
998 ir3_instr_set_address(struct ir3_instruction *instr,
999 struct ir3_instruction *addr)
1000 {
1001 if (!instr->address) {
1002 assert(instr->block == addr->block);
1003
1004 instr->address =
1005 ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
1006 instr->address->def = addr->dsts[0];
1007 add_to_address_users(instr);
1008 } else {
1009 assert(instr->address->def->instr == addr);
1010 }
1011 }
1012
1013 /* Does this instruction use the scalar ALU?
1014 */
1015 bool
1016 is_scalar_alu(struct ir3_instruction *instr,
1017 const struct ir3_compiler *compiler)
1018 {
1019 /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
1020 */
1021 return instr->opc != OPC_MOVMSK &&
1022 instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
1023 instr->opc != OPC_SCAN_MACRO &&
1024 is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
1025 /* scalar->scalar mov instructions (but NOT cov) were supported before the
1026 * scalar ALU was supported, but they still required (ss) whereas on GPUs
1027 * that have a scalar ALU they are executed on it and do not require (ss).
1028 * We have to be careful to return false for these if scalar ALU isn't
1029 * supported, so that we treat them like vector->scalar mov instructions
1030 * (such as requiring (ss)).
1031 */
1032 compiler->has_scalar_alu &&
1033 /* moves from normal to shared seem to use a separate ALU as before and
1034 * require a (ss) on dependent instructions.
1035 */
1036 ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
1037 (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
1038 }
1039
1040 void
1041 ir3_block_clear_mark(struct ir3_block *block)
1042 {
1043 foreach_instr (instr, &block->instr_list)
1044 instr->flags &= ~IR3_INSTR_MARK;
1045 }
1046
1047 void
1048 ir3_clear_mark(struct ir3 *ir)
1049 {
1050 foreach_block (block, &ir->block_list) {
1051 ir3_block_clear_mark(block);
1052 }
1053 }
1054
1055 unsigned
1056 ir3_count_instructions(struct ir3 *ir)
1057 {
1058 unsigned cnt = 1;
1059 foreach_block (block, &ir->block_list) {
1060 block->start_ip = cnt;
1061 foreach_instr (instr, &block->instr_list) {
1062 instr->ip = cnt++;
1063 }
1064 block->end_ip = cnt;
1065 }
1066 return cnt;
1067 }
1068
1069 unsigned
1070 ir3_count_instructions_sched(struct ir3 *ir)
1071 {
1072 unsigned cnt = 1;
1073 foreach_block (block, &ir->block_list) {
1074 block->start_ip = cnt;
1075 foreach_instr (instr, &block->instr_list) {
1076 if (!is_terminator(instr))
1077 instr->ip = cnt++;
1078 }
1079 block->end_ip = cnt;
1080 }
1081 return cnt;
1082 }
1083
1084 /* When counting instructions for RA, we insert extra fake instructions at the
1085 * beginning of each block, where values become live, and at the end where
1086 * values die. This prevents problems where values live-in at the beginning or
1087 * live-out at the end of a block from being treated as if they were
1088 * live-in/live-out at the first/last instruction, which would be incorrect.
1089 * In ir3_legalize these ip's are assumed to be actual ip's of the final
1090 * program, so it would be incorrect to use this everywhere.
1091 */
1092
1093 unsigned
1094 ir3_count_instructions_ra(struct ir3 *ir)
1095 {
1096 unsigned cnt = 1;
1097 foreach_block (block, &ir->block_list) {
1098 block->start_ip = cnt++;
1099 foreach_instr (instr, &block->instr_list) {
1100 instr->ip = cnt++;
1101 }
1102 block->end_ip = cnt++;
1103 }
1104 return cnt;
1105 }
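/* For instance, a block containing two instructions gets start_ip = 1, the
 * instructions get ip = 2 and ip = 3, and end_ip = 4; values live-in to the
 * block then interfere at ip 1 rather than at the first real instruction.
 */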
1106
1107 struct ir3_array *
1108 ir3_lookup_array(struct ir3 *ir, unsigned id)
1109 {
1110 foreach_array (arr, &ir->array_list)
1111 if (arr->id == id)
1112 return arr;
1113 return NULL;
1114 }
1115
1116 void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter)
1117 {
1118 /* We could do this in a single pass if we can assume instructions
1119 * are always sorted, which currently might not always be true.
1120 * (In particular after ir3_group pass, but maybe other places.)
1121 */
1122 foreach_block (block, &ir->block_list)
1123 foreach_instr (instr, &block->instr_list)
1124 instr->uses = NULL;
1125
1126 foreach_block (block, &ir->block_list) {
1127 foreach_instr (instr, &block->instr_list) {
1128 foreach_ssa_src_n (src, n, instr) {
1129 if (!filter(instr, n))
1130 continue;
1131 if (!src->uses)
1132 src->uses = _mesa_pointer_set_create(mem_ctx);
1133 _mesa_set_add(src->uses, instr);
1134 }
1135 }
1136 }
1137 }
1138
1139 static bool
1140 no_false_deps(struct ir3_instruction *instr, unsigned src_n)
1141 {
1142 return !__is_false_dep(instr, src_n);
1143 }
1144
1145 static bool
1146 any_src(struct ir3_instruction *instr, unsigned src_n)
1147 {
1148 return true;
1149 }
1150
1151 void
1152 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
1153 {
1154 if (falsedeps)
1155 return ir3_find_ssa_uses_for(ir, mem_ctx, any_src);
1156 return ir3_find_ssa_uses_for(ir, mem_ctx, no_false_deps);
1157 }
1158
1159 /**
1160 * Set the destination type of an instruction, for example if a
1161 * conversion is folded in, handling the special cases where the
1162 * instruction's dest type or opcode needs to be fixed up.
1163 */
1164 void
1165 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
1166 {
1167 if (half) {
1168 instr->dsts[0]->flags |= IR3_REG_HALF;
1169 } else {
1170 instr->dsts[0]->flags &= ~IR3_REG_HALF;
1171 }
1172
1173 switch (opc_cat(instr->opc)) {
1174 case 1: /* move instructions */
1175 if (half) {
1176 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
1177 } else {
1178 instr->cat1.dst_type = full_type(instr->cat1.dst_type);
1179 }
1180 break;
1181 case 4:
1182 if (half) {
1183 instr->opc = cat4_half_opc(instr->opc);
1184 } else {
1185 instr->opc = cat4_full_opc(instr->opc);
1186 }
1187 break;
1188 case 5:
1189 if (half) {
1190 instr->cat5.type = half_type(instr->cat5.type);
1191 } else {
1192 instr->cat5.type = full_type(instr->cat5.type);
1193 }
1194 break;
1195 }
1196 }
1197
1198 /**
1199 * One-time fixup for instruction src-types. Other than cov's that
1200 * are folded, an instruction's src type does not change.
1201 */
1202 void
1203 ir3_fixup_src_type(struct ir3_instruction *instr)
1204 {
1205 if (instr->srcs_count == 0)
1206 return;
1207
1208 switch (opc_cat(instr->opc)) {
1209 case 1: /* move instructions */
1210 if (instr->srcs[0]->flags & IR3_REG_HALF) {
1211 instr->cat1.src_type = half_type(instr->cat1.src_type);
1212 } else {
1213 instr->cat1.src_type = full_type(instr->cat1.src_type);
1214 }
1215 break;
1216 case 3:
1217 if (instr->srcs[0]->flags & IR3_REG_HALF) {
1218 instr->opc = cat3_half_opc(instr->opc);
1219 } else {
1220 instr->opc = cat3_full_opc(instr->opc);
1221 }
1222 break;
1223 }
1224 }
1225
1226 /**
1227 * Map a floating point immed to FLUT (float lookup table) value,
1228 * returns negative for immediates that cannot be mapped.
1229 */
1230 int
1231 ir3_flut(struct ir3_register *src_reg)
1232 {
1233 static const struct {
1234 uint32_t f32;
1235 uint16_t f16;
1236 } flut[] = {
1237 { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
1238 { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
1239 { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
1240 { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
1241 { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
1242 { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
1243 { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
1244 { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
1245 { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
1246 { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
1247 { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
1248 { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
1249 };
1250
1251 if (src_reg->flags & IR3_REG_HALF) {
1252 /* Note that half-float immeds are already lowered to 16b in nir: */
1253 uint32_t imm = src_reg->uim_val;
1254 for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1255 if (flut[i].f16 == imm) {
1256 return i;
1257 }
1258 }
1259 } else {
1260 uint32_t imm = src_reg->uim_val;
1261 for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1262 if (flut[i].f32 == imm) {
1263 return i;
1264 }
1265 }
1266 }
1267
1268 return -1;
1269 }
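/* For example, a full-precision immediate of 0x3f800000 (1.0f) maps to FLUT
 * index 2, while 0x40a00000 (5.0f) is not in the table and returns -1.
 */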
1270
1271 static unsigned
1272 cp_flags(unsigned flags)
1273 {
1274 /* only considering these flags (at least for now): */
1275 flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
1276 IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
1277 IR3_REG_SHARED);
1278 return flags;
1279 }
1280
1281 bool
1282 ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
1283 {
1284 struct ir3_compiler *compiler = instr->block->shader->compiler;
1285 unsigned valid_flags;
1286
1287 flags = cp_flags(flags);
1288
1289 /* If destination is indirect, then source cannot be.. at least
1290 * I don't think so..
1291 */
1292 if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
1293 (flags & IR3_REG_RELATIV))
1294 return false;
1295
1296 if (flags & IR3_REG_RELATIV) {
1297 /* TODO need to test on earlier gens.. pretty sure the earlier
1298 * problem was just that we didn't check that the src was from
1299 * same block (since we can't propagate address register values
1300 * across blocks currently)
1301 */
1302 if (compiler->gen < 6)
1303 return false;
1304
1305 /* NOTE in the special try_swap_mad_two_srcs() case we can be
1306 * called on a src that has already had an indirect load folded
1307 * in, in which case ssa() returns NULL
1308 */
1309 if (instr->srcs[n]->flags & IR3_REG_SSA) {
1310 struct ir3_instruction *src = ssa(instr->srcs[n]);
1311 if (src->address->def->instr->block != instr->block)
1312 return false;
1313 }
1314 }
1315
1316 if (is_meta(instr)) {
1317 /* collect and phi nodes support const/immed sources, which will be
1318 * turned into move instructions, but not anything else.
1319 */
1320 if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
1321 return false;
1322
1323 /* Except for immed/const sources, source and dest shared-ness must match.
1324 */
1325 if (!(flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
1326 (flags & IR3_REG_SHARED) != (instr->dsts[0]->flags & IR3_REG_SHARED))
1327 return false;
1328
1329 return true;
1330 }
1331
1332 switch (opc_cat(instr->opc)) {
1333 case 0: /* end, chmask */
1334 return flags == 0;
1335 case 1:
1336 switch (instr->opc) {
1337 case OPC_MOVMSK:
1338 case OPC_SWZ:
1339 case OPC_SCT:
1340 case OPC_GAT:
1341 valid_flags = IR3_REG_SHARED;
1342 break;
1343 case OPC_SCAN_MACRO:
1344 if (n == 0)
1345 return flags == 0;
1346 else
1347 return flags == IR3_REG_SHARED;
1348 break;
1349 case OPC_SCAN_CLUSTERS_MACRO:
1350 if (n == 0)
1351 return flags == IR3_REG_SHARED;
1352 else
1353 return flags == 0;
1354 break;
1355 default: {
1356 valid_flags =
1357 IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
1358
1359 /* floating-point conversions when moving from non-shared to shared
1360 * seem not to work. We only use floating-point types in ir3 for
1361 * conversions, so don't bother specially handling the case where the
1362 * types are equal. Same goes for 8-bit sign extension.
1363 */
1364 if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
1365 !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
1366 ((full_type(instr->cat1.src_type) == TYPE_F32 ||
1367 full_type(instr->cat1.dst_type) == TYPE_F32) ||
1368 (instr->cat1.src_type == TYPE_U8 &&
1369 full_type(instr->cat1.dst_type) == TYPE_S32)))
1370 return false;
1371
1372 /* Conversions seem not to work in shared->shared copies before scalar
1373 * ALU is supported.
1374 */
1375 if (!compiler->has_scalar_alu &&
1376 (flags & IR3_REG_SHARED) &&
1377 (instr->dsts[0]->flags & IR3_REG_SHARED) &&
1378 instr->cat1.src_type != instr->cat1.dst_type)
1379 return false;
1380 }
1381 }
1382 if (flags & ~valid_flags)
1383 return false;
1384 break;
1385 case 2:
1386 valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
1387 IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;
1388
1389 if (flags & ~valid_flags)
1390 return false;
1391
1392 /* Allow an immediate src1 for flat.b, since it's ignored */
1393 if (instr->opc == OPC_FLAT_B &&
1394 n == 1 && flags == IR3_REG_IMMED)
1395 return true;
1396
1397 /* cat2/cat3 scalar ALU instructions must not have regular sources. */
1398 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1399 if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1400 return false;
1401 }
1402
1403 if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
1404 unsigned m = n ^ 1;
1405 /* cannot deal w/ const or shared in both srcs:
1406 * (note that some cat2 actually only have a single src)
1407 */
1408 if (m < instr->srcs_count) {
1409 struct ir3_register *reg = instr->srcs[m];
1410 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1411 if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
1412 return false;
1413 } else {
1414 if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
1415 (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
1416 return false;
1417 }
1418 if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
1419 return false;
1420 }
1421 }
1422 break;
1423 case 3:
1424 valid_flags =
1425 ir3_cat3_absneg(instr->opc, n) | IR3_REG_RELATIV | IR3_REG_SHARED;
1426
1427 switch (instr->opc) {
1428 case OPC_SHRM:
1429 case OPC_SHLM:
1430 case OPC_SHRG:
1431 case OPC_SHLG:
1432 case OPC_ANDG: {
1433 if (n != 1) {
1434 valid_flags |= IR3_REG_IMMED;
1435 }
1436
1437 /* Can be RELATIV+CONST but not CONST: */
1438 if (flags & IR3_REG_RELATIV)
1439 valid_flags |= IR3_REG_CONST;
1440
1441 if (!(instr->dsts[0]->flags & IR3_REG_SHARED) && n < 2) {
1442 /* Of the first two sources, only one can be shared. */
1443 unsigned m = n ^ 1;
1444
1445 if ((flags & IR3_REG_SHARED) &&
1446 (instr->srcs[m]->flags & IR3_REG_SHARED)) {
1447 return false;
1448 }
1449 }
1450 break;
1451 }
1452 case OPC_WMM:
1453 case OPC_WMM_ACCU: {
1454 valid_flags = IR3_REG_SHARED;
1455 if (n == 2)
1456 valid_flags = IR3_REG_CONST;
1457 break;
1458 }
1459 case OPC_DP2ACC:
1460 case OPC_DP4ACC:
1461 break;
1462 default:
1463 valid_flags |= IR3_REG_CONST;
1464 }
1465
1466 if (flags & ~valid_flags)
1467 return false;
1468
1469 if (flags & (IR3_REG_CONST | IR3_REG_RELATIV) ||
1470 (!(instr->dsts[0]->flags & IR3_REG_SHARED) &&
1471 (flags & IR3_REG_SHARED))) {
1472 /* cannot deal w/ const/shared/relativ in 2nd src: */
1473 if (n == 1)
1474 return false;
1475 }
1476
1477 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1478 if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1479 return false;
1480 }
1481
1482 break;
1483 case 4:
1484 if ((instr->dsts[0]->flags & IR3_REG_SHARED) != (flags & IR3_REG_SHARED))
1485 return false;
1486 /* seems like blob compiler avoids const as src.. */
1487 /* TODO double check if this is still the case on a4xx */
1488 if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
1489 return false;
1490 if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
1491 return false;
1492 break;
1493 case 5:
1494 if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
1495 if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
1496 (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
1497 return flags == IR3_REG_IMMED;
1498 }
1499 }
1500 /* no flags allowed */
1501 if (flags)
1502 return false;
1503 break;
1504 case 6:
1505 valid_flags = IR3_REG_IMMED;
1506
1507 if (instr->opc == OPC_STC && n == 1)
1508 valid_flags |= IR3_REG_SHARED;
1509 if (instr->opc == OPC_SHFL) {
1510 if (n == 0)
1511 valid_flags &= ~IR3_REG_IMMED;
1512 else if (n == 1)
1513 valid_flags |= IR3_REG_SHARED;
1514 }
1515
1516 if (flags & ~valid_flags)
1517 return false;
1518
1519 if (flags & IR3_REG_IMMED) {
1520 /* doesn't seem like we can have immediate src for store
1521 * instructions:
1522 *
1523 * TODO this restriction could also apply to load instructions,
1524 * but for load instructions this arg is the address (and there isn't
1525 * really any good way to test a hard-coded immed addr src)
1526 */
1527 if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
1528 return false;
1529
1530 if ((instr->opc == OPC_LDL) && (n == 0))
1531 return false;
1532
1533 if ((instr->opc == OPC_STL) && (n != 2))
1534 return false;
1535
1536 if ((instr->opc == OPC_LDP) && (n == 0))
1537 return false;
1538
1539 if ((instr->opc == OPC_STP) && (n != 2))
1540 return false;
1541
1542 if (instr->opc == OPC_STLW && n == 0)
1543 return false;
1544
1545 if (instr->opc == OPC_LDLW && n == 0)
1546 return false;
1547
1548 /* disallow immediates in anything but the SSBO slot argument for
1549 * cat6 instructions:
1550 */
1551 if (is_global_a3xx_atomic(instr->opc) && (n != 0))
1552 return false;
1553
1554 if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
1555 is_bindless_atomic(instr->opc))
1556 return false;
1557
1558 if (instr->opc == OPC_STG && (n == 2))
1559 return false;
1560
1561 if (instr->opc == OPC_STG_A && (n == 4))
1562 return false;
1563
1564 if (instr->opc == OPC_LDG && (n == 0))
1565 return false;
1566
1567 if (instr->opc == OPC_LDG_A && (n < 2))
1568 return false;
1569
1570 if (instr->opc == OPC_STC && n != 0)
1571 return false;
1572
1573 /* as with atomics, these cat6 instrs can only have an immediate
1574 * for SSBO/IBO slot argument
1575 */
1576 switch (instr->opc) {
1577 case OPC_LDIB:
1578 case OPC_STIB:
1579 if (n != 0 && n != 2)
1580 return false;
1581 break;
1582 case OPC_RESINFO:
1583 if (n != 0)
1584 return false;
1585 break;
1586 default:
1587 break;
1588 }
1589 }
1590
1591 break;
1592 }
1593
1594 return true;
1595 }
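/* Usage sketch: copy-propagation uses this to ask whether folding a source
 * with the given flags into src n would still be encodable. For instance, a
 * const in the second source of a cat3 mad is rejected by the checks above,
 * so the copy has to stay as a separate mov.
 */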
1596
1597 bool
1598 ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
1599 {
1600 if (instr->opc == OPC_MOV || is_meta(instr) || instr->opc == OPC_ALIAS)
1601 return true;
1602
1603 if (is_mem(instr)) {
1604 switch (instr->opc) {
1605 /* Some load/store instructions have a 13-bit offset and size which must
1606 * always be an immediate, while the rest of the sources cannot be
1607 * immediates, so the frontend is responsible for checking the size:
1608 */
1609 case OPC_LDL:
1610 case OPC_STL:
1611 case OPC_LDP:
1612 case OPC_STP:
1613 case OPC_LDG:
1614 case OPC_STG:
1615 case OPC_SPILL_MACRO:
1616 case OPC_RELOAD_MACRO:
1617 case OPC_LDG_A:
1618 case OPC_STG_A:
1619 case OPC_LDLW:
1620 case OPC_STLW:
1621 case OPC_LDLV:
1622 return true;
1623 default:
1624 /* most cat6 src immediates can only encode 8 bits: */
1625 return !(immed & ~0xff);
1626 }
1627 }
1628
1629 /* The alternative cat3 encoding used for sh[lr][gm]/andg uses 12 bit
1630 * immediates that won't be sign-extended.
1631 */
1632 if (is_cat3_alt(instr->opc)) {
1633 return !(immed & ~0xfff);
1634 }
1635
1636 /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
1637 return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
1638 }
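/* For example, a cat2 source immediate of 500 fits in the 10-bit
 * sign-extended encoding and is accepted, while 4096 does not and would have
 * to be loaded with a mov first.
 */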
1639
1640 struct ir3_instruction *
1641 ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr)
1642 {
1643 /* If instr is a negation (likely as a result of an nir_b2n), we can ignore
1644 * that and use its source, since the nonzero-ness stays the same.
1645 */
1646 if (instr->opc == OPC_ABSNEG_S && instr->flags == 0 &&
1647 (instr->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
1648 IR3_REG_SNEG) {
1649 return instr->srcs[0]->def->instr;
1650 }
1651
1652 return instr;
1653 }
1654
1655 bool
1656 ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc)
1657 {
1658 switch (opc_cat(opc)) {
1659 case 0:
1660 return opc == OPC_NOP;
1661 case 1:
1662 return opc == OPC_MOV || opc == OPC_SWZ || opc == OPC_MOVMSK;
1663 case 2:
1664 if (opc == OPC_BARY_F && !compiler->has_rpt_bary_f)
1665 return false;
1666 return true;
1667 case 3:
1668 return opc != OPC_DP2ACC && opc != OPC_DP4ACC;
1669 case 4:
1670 return opc != OPC_RCP;
1671 default:
1672 return false;
1673 }
1674 }
1675