1 /*
2  * Copyright © 2012 Rob Clark <robdclark@gmail.com>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "ir3.h"
7 
8 #include <assert.h>
9 #include <errno.h>
10 #include <stdbool.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "util/bitscan.h"
16 #include "util/half_float.h"
17 #include "util/ralloc.h"
18 #include "util/u_math.h"
19 
20 #include "instr-a3xx.h"
21 #include "ir3_shader.h"
22 
23 /* simple allocator to carve allocations out of an up-front allocated heap,
24  * so that we can free everything easily in one shot.
25  */
26 void *
27 ir3_alloc(struct ir3 *shader, int sz)
28 {
29    return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
30 }
31 
32 struct ir3 *
33 ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
34 {
35    struct ir3 *shader = rzalloc(v, struct ir3);
36 
37    shader->compiler = compiler;
38    shader->type = v->type;
39 
40    list_inithead(&shader->block_list);
41    list_inithead(&shader->array_list);
42 
43    return shader;
44 }
45 
46 void
47 ir3_destroy(struct ir3 *shader)
48 {
49    ralloc_free(shader);
50 }
51 
52 static bool
53 is_shared_consts(struct ir3_compiler *compiler,
54                  const struct ir3_const_state *const_state,
55                  struct ir3_register *reg)
56 {
57    if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
58        reg->flags & IR3_REG_CONST) {
59       uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
60       uint32_t max_const_reg =
61          regid(compiler->shared_consts_base_offset +
62                compiler->shared_consts_size, 0);
63       return reg->num >= min_const_reg && reg->num < max_const_reg;
64    }
65 
66    return false;
67 }
68 
69 static void
70 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
71                  struct ir3_info *info)
72 {
73    struct ir3_shader_variant *v = info->data;
74 
75    if (reg->flags & IR3_REG_IMMED) {
76       /* nothing to do */
77       return;
78    }
79 
80    /* Shared consts don't need to be included in constlen. */
81    if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
82       return;
83 
84    unsigned components;
85    int16_t max;
86 
87    if (reg->flags & IR3_REG_RELATIV) {
88       components = reg->size;
89       max = (reg->array.base + components - 1);
90    } else {
91       components = util_last_bit(reg->wrmask);
92       max = (reg->num + components - 1);
93    }
94 
95    if (reg->flags & IR3_REG_CONST) {
96       info->max_const = MAX2(info->max_const, max >> 2);
97    } else if (max < regid(48, 0)) {
98       if (reg->flags & IR3_REG_HALF) {
99          if (v->mergedregs) {
100             /* starting w/ a6xx, half regs conflict with full regs: */
101             info->max_reg = MAX2(info->max_reg, max >> 3);
102          } else {
103             info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
104          }
105       } else {
106          info->max_reg = MAX2(info->max_reg, max >> 2);
107       }
108    }
109 }
110 
111 bool
112 ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
113 {
114    const struct ir3_compiler *compiler = v->compiler;
115 
116    /* If the user forced a particular wavesize respect that. */
117    if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
118       return false;
119    if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
120       return true;
121 
122    /* We can't support more than compiler->branchstack_size diverging threads
123     * in a wave. Thus, doubling the threadsize is only possible if we don't
124     * exceed the branchstack size limit.
125     */
126    if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
127        compiler->branchstack_size) {
128       return false;
129    }
130 
131    switch (v->type) {
132    case MESA_SHADER_KERNEL:
133    case MESA_SHADER_COMPUTE: {
134       unsigned threads_per_wg =
135          v->local_size[0] * v->local_size[1] * v->local_size[2];
136 
137       /* For a5xx, if the workgroup size is greater than the maximum number
138        * of threads per core with 32 threads per wave (512) then we have to
139        * use the doubled threadsize because otherwise the workgroup wouldn't
140        * fit. For smaller workgroup sizes, we follow the blob and use the
141        * smaller threadsize.
142        */
143       if (compiler->gen < 6) {
144          return v->local_size_variable ||
145                 threads_per_wg >
146                    compiler->threadsize_base * compiler->max_waves;
147       }
148 
149       /* On a6xx, we prefer the larger threadsize unless the workgroup is
150        * small enough that it would be useless. Note that because
151        * threadsize_base is bumped to 64, we don't have to worry about the
152        * workgroup fitting, unlike the a5xx case.
153        */
154       if (!v->local_size_variable) {
155          if (threads_per_wg <= compiler->threadsize_base)
156             return false;
157       }
158    }
159       FALLTHROUGH;
160    case MESA_SHADER_FRAGMENT: {
161       /* Check that doubling the threadsize wouldn't exceed the regfile size */
162       return regs_count * 2 <= compiler->reg_size_vec4;
163    }
164 
165    default:
166       /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
167        * stages - the bit doesn't exist. The blob never used it for the VS
168        * on earlier gen's anyway.
169        */
170       return false;
171    }
172 }
173 
174 /* Get the maximum number of waves that could be used even if this shader
175  * didn't use any registers.
176  */
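/* As a rough illustration (numbers are made up, not taken from a particular
 * GPU): with branchstack_size = 64, v->branchstack = 16 and
 * wave_granularity = 2, the branchstack limit below alone would cap this at
 * 64 / 16 * 2 = 8 waves.
 */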
177 unsigned
178 ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
179                                   bool double_threadsize)
180 {
181    const struct ir3_compiler *compiler = v->compiler;
182    unsigned max_waves = compiler->max_waves;
183 
184    /* Compute the limit based on branchstack */
185    if (v->branchstack > 0) {
186       unsigned branchstack_max_waves = compiler->branchstack_size /
187                                        v->branchstack *
188                                        compiler->wave_granularity;
189       max_waves = MIN2(max_waves, branchstack_max_waves);
190    }
191 
192    /* If this is a compute shader, compute the limit based on shared size */
193    if ((v->type == MESA_SHADER_COMPUTE) ||
194        (v->type == MESA_SHADER_KERNEL)) {
195       unsigned threads_per_wg =
196          v->local_size[0] * v->local_size[1] * v->local_size[2];
197       unsigned waves_per_wg =
198          DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
199                                          (double_threadsize ? 2 : 1) *
200                                          compiler->wave_granularity);
201 
202       /* Shared is allocated in chunks of 1k */
203       unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
204       if (shared_per_wg > 0 && !v->local_size_variable) {
205          unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
206 
207          max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
208                                         compiler->wave_granularity);
209       }
210 
211       /* If we have a compute shader that has a big workgroup, a barrier, and
212        * a branchstack which limits max_waves, this may result in a situation
213        * where we cannot run all waves of the workgroup concurrently, which
214        * would lead to a hang.
215        *
216        * TODO: Could we spill the branchstack, or is there another way around?
217        * The blob just explodes in such cases.
218        */
219       if (v->has_barrier && (max_waves < waves_per_wg)) {
220          mesa_loge(
221             "Compute shader (%s) which has workgroup barrier cannot be used "
222             "because it's impossible to have enough concurrent waves.",
223             v->name);
224          exit(1);
225       }
226    }
227 
228    return max_waves;
229 }
230 
231 /* Get the maximum number of waves that could be launched limited by reg size.
232  */
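/* As a rough illustration (numbers are made up, not taken from a particular
 * GPU): with reg_size_vec4 = 96, reg_count = 8, double_threadsize = false and
 * wave_granularity = 2, this yields (96 / 8) * 2 = 24 waves.
 */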
233 unsigned
234 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
235                                 unsigned reg_count, bool double_threadsize)
236 {
237    return reg_count ? (compiler->reg_size_vec4 /
238                        (reg_count * (double_threadsize ? 2 : 1)) *
239                        compiler->wave_granularity)
240                     : compiler->max_waves;
241 }
242 
243 void
244 ir3_collect_info(struct ir3_shader_variant *v)
245 {
246    struct ir3_info *info = &v->info;
247    struct ir3 *shader = v->ir;
248    const struct ir3_compiler *compiler = v->compiler;
249 
250    memset(info, 0, sizeof(*info));
251    info->data = v;
252    info->max_reg = -1;
253    info->max_half_reg = -1;
254    info->max_const = -1;
255    info->multi_dword_ldp_stp = false;
256 
257    uint32_t instr_count = 0;
258    foreach_block (block, &shader->block_list) {
259       foreach_instr (instr, &block->instr_list) {
260          instr_count++;
261       }
262    }
263 
264    v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
265 
266    /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
267     * doesn't try to decode the following data as instructions (such as the
268     * next stage's shader in turnip)
269     */
270    info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
271    info->sizedwords = info->size / 4;
272 
273    info->early_preamble = v->early_preamble;
274 
275    bool in_preamble = false;
276    bool has_eq = false;
277 
278    /* Track which registers are currently aliases because they shouldn't be
279     * included in the GPR footprint.
280     */
281    regmask_t aliases;
282 
283    /* Full and half aliases do not overlap so treat them as !mergedregs. */
284    regmask_init(&aliases, false);
285 
286    foreach_block (block, &shader->block_list) {
287       int sfu_delay = 0, mem_delay = 0;
288 
289       foreach_instr (instr, &block->instr_list) {
290 
291          foreach_src (reg, instr) {
292             if (!is_reg_gpr(reg) || !regmask_get(&aliases, reg)) {
293                collect_reg_info(instr, reg, info);
294             }
295          }
296 
297          foreach_dst (reg, instr) {
298             if (instr->opc == OPC_ALIAS &&
299                 instr->cat7.alias_scope == ALIAS_TEX) {
300                regmask_set(&aliases, instr->dsts[0]);
301             } else if (is_dest_gpr(reg)) {
302                collect_reg_info(instr, reg, info);
303             }
304          }
305 
306          if (is_tex(instr)) {
307             /* All aliases are cleared after they are used. */
308             regmask_init(&aliases, false);
309          }
310 
311          if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
312             unsigned components = instr->srcs[2]->uim_val;
313 
314             /* This covers any multi-component access that could straddle
315              * across multiple double-words.
316              */
317             if (components > 1)
318                info->multi_dword_ldp_stp = true;
319 
320             if (instr->opc == OPC_STP)
321                info->stp_count += components;
322             else
323                info->ldp_count += components;
324          }
325 
326          if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
327              (instr->dsts[0]->flags & IR3_REG_EI))
328             info->last_baryf = info->instrs_count;
329 
330          if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
331             info->last_helper = info->instrs_count;
332             has_eq = true;
333          }
334 
335          if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
336              instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
337             info->last_helper = info->instrs_count;
338 
339          if (instr->opc == OPC_SHPS)
340             in_preamble = true;
341 
342          /* Don't count instructions in the preamble for instruction-count type
343           * stats, because their effect should be much smaller.
344           * TODO: we should probably have separate stats for preamble
345           * instructions, but that would blow up the amount of stats...
346           */
347          if (!in_preamble) {
348             unsigned instrs_count = 1 + instr->repeat + instr->nop;
349             unsigned nops_count = instr->nop;
350 
351             if (instr->opc == OPC_NOP) {
352                nops_count = 1 + instr->repeat;
353                info->instrs_per_cat[0] += nops_count;
354             } else if (!is_meta(instr)) {
355                info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
356                info->instrs_per_cat[0] += nops_count;
357             }
358 
359             if (instr->opc == OPC_MOV) {
360                if (instr->cat1.src_type == instr->cat1.dst_type) {
361                   info->mov_count += 1 + instr->repeat;
362                } else {
363                   info->cov_count += 1 + instr->repeat;
364                }
365             }
366 
367             info->instrs_count += instrs_count;
368             info->nops_count += nops_count;
369 
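            /* Stall estimation: an (ss)/(sy) producer arms a soft delay
             * counter which is counted down by every instruction issued
             * after it; whatever remains when a consumer carrying the
             * (ss)/(sy) flag is reached is accounted as sstall/systall.
             */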
370             if (instr->flags & IR3_INSTR_SS) {
371                info->ss++;
372                info->sstall += sfu_delay;
373                sfu_delay = 0;
374             }
375 
376             if (instr->flags & IR3_INSTR_SY) {
377                info->sy++;
378                info->systall += mem_delay;
379                mem_delay = 0;
380             }
381 
382             if (is_ss_producer(instr)) {
383                sfu_delay = soft_ss_delay(instr);
384             } else {
385                int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
386                sfu_delay -= n;
387             }
388 
389             if (is_sy_producer(instr)) {
390                mem_delay = soft_sy_delay(instr, shader);
391             } else {
392                int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
393                mem_delay -= n;
394             }
395          } else {
396             unsigned instrs_count = 1 + instr->repeat + instr->nop;
397             info->preamble_instrs_count += instrs_count;
398          }
399 
400          if (instr->opc == OPC_SHPE)
401             in_preamble = false;
402       }
403    }
404 
405    /* For vertex shaders, the inputs are loaded into registers before the shader
406     * is executed, so max_regs from the shader instructions might not properly
407     * reflect the # of registers actually used, especially in the case of
408     * passthrough varyings.
409     *
410     * Likewise, for fragment shaders, we can have some regs which are passed
411     * input values but never touched by the resulting shader (i.e. as a result
412     * of dead code elimination, or simply because we don't know how to turn
413     * the reg off).
414     */
415    for (unsigned i = 0; i < v->inputs_count; i++) {
416       /* skip frag inputs fetched via bary.f since their regs are
417        * not written by the gpu before the shader starts (and in fact
418        * the regids might not even be valid)
419        */
420       if (v->inputs[i].bary)
421          continue;
422 
423       /* ignore high regs that are global to all threads in a warp
424        * (they exist by default) (a5xx+)
425        */
426       if (v->inputs[i].regid >= regid(48, 0))
427          continue;
428 
429       if (v->inputs[i].compmask) {
430          unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
431          int32_t regid = v->inputs[i].regid + n;
432          if (v->inputs[i].half) {
433             if (!v->mergedregs) {
434                v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
435             } else {
436                v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
437             }
438          } else {
439             v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
440          }
441       }
442    }
443 
444    for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
445       unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
446       int32_t regid = v->sampler_prefetch[i].dst + n;
447       if (v->sampler_prefetch[i].half_precision) {
448          if (!v->mergedregs) {
449             v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
450          } else {
451             v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
452          }
453       } else {
454          v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
455       }
456    }
457 
458    /* TODO: for a5xx and below, is there a separate regfile for
459     * half-registers?
460     */
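   /* Illustrative example: max_reg = 3 and max_half_reg = 1 gives 4 full
    * registers plus (1 + 2) / 2 = 1 more on gen6+ (every two half regs
    * count as one full reg towards the footprint), so regs_count = 5.
    */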
461    unsigned regs_count =
462       info->max_reg + 1 +
463       (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
464 
465    info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
466 
467    /* TODO this is different for earlier gens, but earlier gens don't use this */
468    info->subgroup_size = v->info.double_threadsize ? 128 : 64;
469 
470    unsigned reg_independent_max_waves =
471       ir3_get_reg_independent_max_waves(v, info->double_threadsize);
472    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
473       compiler, regs_count, info->double_threadsize);
474    info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
475    assert(info->max_waves <= v->compiler->max_waves);
476 }
477 
478 static struct ir3_register *
479 reg_create(struct ir3 *shader, int num, int flags)
480 {
481    struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
482    reg->wrmask = 1;
483    reg->flags = flags;
484    reg->num = num;
485    return reg;
486 }
487 
488 static void
489 insert_instr(struct ir3_cursor cursor, struct ir3_instruction *instr)
490 {
491    struct ir3 *shader = instr->block->shader;
492 
493    instr->serialno = ++shader->instr_count;
494 
495    switch (cursor.option) {
496    case IR3_CURSOR_BEFORE_BLOCK:
497       list_add(&instr->node, &cursor.block->instr_list);
498       break;
499    case IR3_CURSOR_AFTER_BLOCK:
500       list_addtail(&instr->node, &cursor.block->instr_list);
501       break;
502    case IR3_CURSOR_BEFORE_INSTR:
503       list_addtail(&instr->node, &cursor.instr->node);
504       break;
505    case IR3_CURSOR_AFTER_INSTR:
506       list_add(&instr->node, &cursor.instr->node);
507       break;
508    }
509 
510    if (is_input(instr))
511       array_insert(shader, shader->baryfs, instr);
512 }
513 
514 struct ir3_block *
515 ir3_block_create(struct ir3 *shader)
516 {
517    struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
518 #if MESA_DEBUG
519    block->serialno = ++shader->block_count;
520 #endif
521    block->shader = shader;
522    list_inithead(&block->node);
523    list_inithead(&block->instr_list);
524    return block;
525 }
526 
527 struct ir3_instruction *
528 ir3_find_end(struct ir3 *ir)
529 {
530    foreach_block_rev (block, &ir->block_list) {
531       foreach_instr_rev (instr, &block->instr_list) {
532          if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
533             return instr;
534       }
535    }
536    unreachable("couldn't find end instruction");
537 }
538 
539 static struct ir3_instruction *
540 block_get_last_instruction(struct ir3_block *block)
541 {
542    if (list_is_empty(&block->instr_list))
543       return NULL;
544    return list_last_entry(&block->instr_list, struct ir3_instruction, node);
545 }
546 
547 struct ir3_instruction *
548 ir3_block_get_terminator(struct ir3_block *block)
549 {
550    struct ir3_instruction *last = block_get_last_instruction(block);
551 
552    if (last && is_terminator(last))
553       return last;
554 
555    return NULL;
556 }
557 
558 struct ir3_instruction *
559 ir3_block_take_terminator(struct ir3_block *block)
560 {
561    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
562 
563    if (terminator)
564       list_delinit(&terminator->node);
565 
566    return terminator;
567 }
568 
569 struct ir3_instruction *
570 ir3_block_get_last_non_terminator(struct ir3_block *block)
571 {
572    struct ir3_instruction *last = block_get_last_instruction(block);
573 
574    if (!last)
575       return NULL;
576 
577    if (!is_terminator(last))
578       return last;
579 
580    if (last->node.prev != &block->instr_list)
581       return list_entry(last->node.prev, struct ir3_instruction, node);
582 
583    return NULL;
584 }
585 
586 struct ir3_instruction *
587 ir3_block_get_last_phi(struct ir3_block *block)
588 {
589    struct ir3_instruction *last_phi = NULL;
590 
591    foreach_instr (instr, &block->instr_list) {
592       if (instr->opc != OPC_META_PHI)
593          break;
594 
595       last_phi = instr;
596    }
597 
598    return last_phi;
599 }
600 
601 struct ir3_instruction *
602 ir3_find_shpe(struct ir3 *ir)
603 {
604    if (!ir3_has_preamble(ir)) {
605       return NULL;
606    }
607 
608    foreach_block (block, &ir->block_list) {
609       struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
610 
611       if (last && last->opc == OPC_SHPE) {
612          return last;
613       }
614    }
615 
616    unreachable("preamble without shpe");
617 }
618 
619 struct ir3_instruction *
620 ir3_create_empty_preamble(struct ir3 *ir)
621 {
622    assert(!ir3_has_preamble(ir));
623 
624    struct ir3_block *main_start_block = ir3_start_block(ir);
625 
626    /* Create a preamble CFG similar to what the frontend would generate. Note
627     * that the empty else_block is important for ir3_after_preamble to work.
628     *
629     * shps_block:
630     * if (shps) {
631     *    getone_block:
632     *    if (getone) {
633     *       body_block:
634     *       shpe
635     *    }
636     * } else {
637     *    else_block:
638     * }
639     * main_start_block:
640     */
641    struct ir3_block *shps_block = ir3_block_create(ir);
642    struct ir3_block *getone_block = ir3_block_create(ir);
643    struct ir3_block *body_block = ir3_block_create(ir);
644    struct ir3_block *else_block = ir3_block_create(ir);
645    list_add(&else_block->node, &ir->block_list);
646    list_add(&body_block->node, &ir->block_list);
647    list_add(&getone_block->node, &ir->block_list);
648    list_add(&shps_block->node, &ir->block_list);
649 
650    struct ir3_builder b = ir3_builder_at(ir3_after_block(shps_block));
651    ir3_SHPS(&b);
652    shps_block->successors[0] = getone_block;
653    ir3_block_add_predecessor(getone_block, shps_block);
654    ir3_block_link_physical(shps_block, getone_block);
655    shps_block->successors[1] = else_block;
656    ir3_block_add_predecessor(else_block, shps_block);
657    ir3_block_link_physical(shps_block, else_block);
658 
659    b.cursor = ir3_after_block(getone_block);
660    ir3_GETONE(&b);
661    getone_block->divergent_condition = true;
662    getone_block->successors[0] = body_block;
663    ir3_block_add_predecessor(body_block, getone_block);
664    ir3_block_link_physical(getone_block, body_block);
665    getone_block->successors[1] = main_start_block;
666    ir3_block_add_predecessor(main_start_block, getone_block);
667    ir3_block_link_physical(getone_block, main_start_block);
668 
669    b.cursor = ir3_after_block(body_block);
670    struct ir3_instruction *shpe = ir3_SHPE(&b);
671    shpe->barrier_class = shpe->barrier_conflict = IR3_BARRIER_CONST_W;
672    array_insert(body_block, body_block->keeps, shpe);
673    ir3_JUMP(&b);
674    body_block->successors[0] = main_start_block;
675    ir3_block_add_predecessor(main_start_block, body_block);
676    ir3_block_link_physical(body_block, main_start_block);
677 
678    b.cursor = ir3_after_block(else_block);
679    ir3_JUMP(&b);
680    else_block->successors[0] = main_start_block;
681    ir3_block_add_predecessor(main_start_block, else_block);
682    ir3_block_link_physical(else_block, main_start_block);
683 
684    main_start_block->reconvergence_point = true;
685 
686    return shpe;
687 }
688 
689 void
690 ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
691 {
692    array_insert(block, block->predecessors, pred);
693 }
694 
695 void
696 ir3_block_link_physical(struct ir3_block *pred,
697                         struct ir3_block *succ)
698 {
699    array_insert(pred, pred->physical_successors, succ);
700    array_insert(succ, succ->physical_predecessors, pred);
701 }
702 
703 void
704 ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
705 {
706    for (unsigned i = 0; i < block->predecessors_count; i++) {
707       if (block->predecessors[i] == pred) {
708          if (i < block->predecessors_count - 1) {
709             block->predecessors[i] =
710                block->predecessors[block->predecessors_count - 1];
711          }
712 
713          block->predecessors_count--;
714          return;
715       }
716    }
717 }
718 
719 unsigned
720 ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
721 {
722    for (unsigned i = 0; i < block->predecessors_count; i++) {
723       if (block->predecessors[i] == pred) {
724          return i;
725       }
726    }
727 
728    unreachable("ir3_block_get_pred_index() invalid predecessor");
729 }
730 
731 static struct ir3_instruction *
732 instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
733 {
734    /* Add extra sources for array destinations and the address reg */
735    if (1 <= opc_cat(opc))
736       nsrc += 2;
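   /* The instruction and its dst/src pointer arrays are carved out of a
    * single allocation: the dst pointers start right after the instruction
    * itself, followed by the src pointers.
    */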
737    struct ir3_instruction *instr;
738    unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
739                  (nsrc * sizeof(instr->srcs[0]));
740    char *ptr = ir3_alloc(block->shader, sz);
741 
742    instr = (struct ir3_instruction *)ptr;
743    ptr += sizeof(*instr);
744    instr->dsts = (struct ir3_register **)ptr;
745    instr->srcs = instr->dsts + ndst;
746 
747 #if MESA_DEBUG
748    instr->dsts_max = ndst;
749    instr->srcs_max = nsrc;
750 #endif
751 
752    list_inithead(&instr->rpt_node);
753    return instr;
754 }
755 
756 static void
757 add_to_address_users(struct ir3_instruction *instr)
758 {
759    assert(instr->address != NULL);
760 
761    struct ir3 *ir = instr->block->shader;
762    struct ir3_register *addr_reg = instr->address->def;
763    assert(reg_num(addr_reg) == REG_A0);
764    unsigned comp = reg_comp(addr_reg);
765    if (comp == 0) {
766       array_insert(ir, ir->a0_users, instr);
767    } else {
768       assert(comp == 1);
769       array_insert(ir, ir->a1_users, instr);
770    }
771 }
772 
773 static struct ir3_block *
774 get_block(struct ir3_cursor cursor)
775 {
776    switch (cursor.option) {
777    case IR3_CURSOR_BEFORE_BLOCK:
778    case IR3_CURSOR_AFTER_BLOCK:
779       return cursor.block;
780    case IR3_CURSOR_BEFORE_INSTR:
781    case IR3_CURSOR_AFTER_INSTR:
782       return cursor.instr->block;
783    }
784 
785    unreachable("illegal cursor option");
786 }
787 
788 struct ir3_instruction *
789 ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc, int ndst, int nsrc)
790 {
791    struct ir3_block *block = get_block(cursor);
792    struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
793    instr->block = block;
794    instr->opc = opc;
795    insert_instr(cursor, instr);
796    return instr;
797 }
798 
799 struct ir3_instruction *
800 ir3_build_instr(struct ir3_builder *builder, opc_t opc, int ndst, int nsrc)
801 {
802    struct ir3_instruction *instr =
803       ir3_instr_create_at(builder->cursor, opc, ndst, nsrc);
804 
805    /* During instruction selection, instructions are sometimes emitted to blocks
806     * other than the current one. For example, to predecessor blocks for phi
807     * sources or to the first block for inputs. For those cases, a new builder
808     * is created to emit at the end of the target block. However, if the target
809     * block happens to be the same as the current block, the main builder would
810     * not be updated to point past the new instructions. Therefore, don't update
811     * the cursor when it points to the end of a block to ensure that new
812     * instructions will always be added at the end.
813     */
814    if (builder->cursor.option != IR3_CURSOR_AFTER_BLOCK) {
815       builder->cursor = ir3_after_instr(instr);
816    }
817 
818    return instr;
819 }
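/* A minimal usage sketch for the cursor/builder API (register numbers and the
 * "block" variable are hypothetical, purely for illustration):
 *
 *    struct ir3_builder b = ir3_builder_at(ir3_before_terminator(block));
 *    struct ir3_instruction *mov = ir3_build_instr(&b, OPC_MOV, 1, 1);
 *    ir3_dst_create(mov, regid(0, 0), 0);
 *    ir3_src_create(mov, regid(0, 1), 0);
 *    mov->cat1.src_type = mov->cat1.dst_type = TYPE_U32;
 */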
820 
821 struct ir3_instruction *
822 ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
823 {
824    return ir3_instr_create_at(ir3_before_terminator(block), opc, ndst, nsrc);
825 }
826 
827 struct ir3_instruction *
828 ir3_instr_create_at_end(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
829 {
830    return ir3_instr_create_at(ir3_after_block(block), opc, ndst, nsrc);
831 }
832 
833 struct ir3_instruction *
834 ir3_instr_clone(struct ir3_instruction *instr)
835 {
836    struct ir3_instruction *new_instr = instr_create(
837       instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
838    struct ir3_register **dsts, **srcs;
839 
840    dsts = new_instr->dsts;
841    srcs = new_instr->srcs;
842    *new_instr = *instr;
843    new_instr->dsts = dsts;
844    new_instr->srcs = srcs;
845    list_inithead(&new_instr->rpt_node);
846 
847    insert_instr(ir3_before_terminator(instr->block), new_instr);
848 
849    /* clone registers: */
850    new_instr->dsts_count = 0;
851    new_instr->srcs_count = 0;
852    foreach_dst (reg, instr) {
853       struct ir3_register *new_reg =
854          ir3_dst_create(new_instr, reg->num, reg->flags);
855       *new_reg = *reg;
856       if (new_reg->instr)
857          new_reg->instr = new_instr;
858    }
859    foreach_src (reg, instr) {
860       struct ir3_register *new_reg =
861          ir3_src_create(new_instr, reg->num, reg->flags);
862       *new_reg = *reg;
863    }
864 
865    if (instr->address) {
866       assert(instr->srcs_count > 0);
867       new_instr->address = new_instr->srcs[instr->srcs_count - 1];
868       add_to_address_users(new_instr);
869    }
870 
871    return new_instr;
872 }
873 
874 /* Add a false dependency to an instruction, to ensure the dependency is scheduled before it: */
875 void
876 ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
877 {
878    for (unsigned i = 0; i < instr->deps_count; i++) {
879       if (instr->deps[i] == dep)
880          return;
881    }
882 
883    array_insert(instr, instr->deps, dep);
884 }
885 
886 void
887 ir3_instr_remove(struct ir3_instruction *instr)
888 {
889    list_delinit(&instr->node);
890    list_delinit(&instr->rpt_node);
891 }
892 
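/* Instructions that can later be merged into a single (rptN) instruction are
 * linked together through their rpt_node fields, forming one circular list
 * with no separate head; the member with the lowest serialno is considered
 * the first of the group (see ir3_instr_is_first_rpt()).
 */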
893 void
894 ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n)
895 {
896    assert(n > 0 && !ir3_instr_is_rpt(instrs[0]));
897 
898    for (unsigned i = 1; i < n; ++i) {
899       assert(!ir3_instr_is_rpt(instrs[i]));
900       assert(instrs[i]->serialno > instrs[i - 1]->serialno);
901 
902       list_addtail(&instrs[i]->rpt_node, &instrs[0]->rpt_node);
903    }
904 }
905 
906 bool
907 ir3_instr_is_rpt(const struct ir3_instruction *instr)
908 {
909    return !list_is_empty(&instr->rpt_node);
910 }
911 
912 bool
913 ir3_instr_is_first_rpt(const struct ir3_instruction *instr)
914 {
915    if (!ir3_instr_is_rpt(instr))
916       return false;
917 
918    struct ir3_instruction *prev_rpt =
919       list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
920    return prev_rpt->serialno > instr->serialno;
921 }
922 
923 struct ir3_instruction *
924 ir3_instr_prev_rpt(const struct ir3_instruction *instr)
925 {
926    assert(ir3_instr_is_rpt(instr));
927 
928    if (ir3_instr_is_first_rpt(instr))
929       return NULL;
930    return list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
931 }
932 
933 struct ir3_instruction *
934 ir3_instr_first_rpt(struct ir3_instruction *instr)
935 {
936    assert(ir3_instr_is_rpt(instr));
937 
938    while (!ir3_instr_is_first_rpt(instr)) {
939       instr = ir3_instr_prev_rpt(instr);
940       assert(instr);
941    }
942 
943    return instr;
944 }
945 
946 unsigned
947 ir3_instr_rpt_length(const struct ir3_instruction *instr)
948 {
949    assert(ir3_instr_is_first_rpt(instr));
950 
951    return list_length(&instr->rpt_node) + 1;
952 }
953 
954 struct ir3_register *
955 ir3_src_create(struct ir3_instruction *instr, int num, int flags)
956 {
957    struct ir3 *shader = instr->block->shader;
958 #if MESA_DEBUG
959    assert(instr->srcs_count < instr->srcs_max);
960 #endif
961    struct ir3_register *reg = reg_create(shader, num, flags);
962    instr->srcs[instr->srcs_count++] = reg;
963    return reg;
964 }
965 
966 struct ir3_register *
967 ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
968 {
969    struct ir3 *shader = instr->block->shader;
970 #if MESA_DEBUG
971    assert(instr->dsts_count < instr->dsts_max);
972 #endif
973    struct ir3_register *reg = reg_create(shader, num, flags);
974    instr->dsts[instr->dsts_count++] = reg;
975    return reg;
976 }
977 
978 struct ir3_register *
979 ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
980 {
981    struct ir3_register *new_reg = reg_create(shader, 0, 0);
982    *new_reg = *reg;
983    return new_reg;
984 }
985 
986 void
987 ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
988                        struct ir3_register *last_write)
989 {
990    assert(reg->flags & IR3_REG_ARRAY);
991    struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
992    *new_reg = *reg;
993    new_reg->def = last_write;
994    ir3_reg_tie(reg, new_reg);
995 }
996 
997 void
998 ir3_instr_set_address(struct ir3_instruction *instr,
999                       struct ir3_instruction *addr)
1000 {
1001    if (!instr->address) {
1002       assert(instr->block == addr->block);
1003 
1004       instr->address =
1005          ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
1006       instr->address->def = addr->dsts[0];
1007       add_to_address_users(instr);
1008    } else {
1009       assert(instr->address->def->instr == addr);
1010    }
1011 }
1012 
1013 /* Does this instruction use the scalar ALU?
1014  */
1015 bool
1016 is_scalar_alu(struct ir3_instruction *instr,
1017               const struct ir3_compiler *compiler)
1018 {
1019    /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
1020     */
1021    return instr->opc != OPC_MOVMSK &&
1022       instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
1023       instr->opc != OPC_SCAN_MACRO &&
1024       is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
1025       /* scalar->scalar mov instructions (but NOT cov) were supported before the
1026        * scalar ALU was supported, but they still required (ss) whereas on GPUs
1027        * that have a scalar ALU they are executed on it and do not require (ss).
1028        * We have to be careful to return false for these if scalar ALU isn't
1029        * supported, so that we treat them like vector->scalar mov instructions
1030        * (such as requiring (ss)).
1031        */
1032       compiler->has_scalar_alu &&
1033       /* moves from normal to shared seem to use a separate ALU as before and
1034        * require a (ss) on dependent instructions.
1035        */
1036       ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
1037        (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
1038 }
1039 
1040 void
1041 ir3_block_clear_mark(struct ir3_block *block)
1042 {
1043    foreach_instr (instr, &block->instr_list)
1044       instr->flags &= ~IR3_INSTR_MARK;
1045 }
1046 
1047 void
1048 ir3_clear_mark(struct ir3 *ir)
1049 {
1050    foreach_block (block, &ir->block_list) {
1051       ir3_block_clear_mark(block);
1052    }
1053 }
1054 
1055 unsigned
1056 ir3_count_instructions(struct ir3 *ir)
1057 {
1058    unsigned cnt = 1;
1059    foreach_block (block, &ir->block_list) {
1060       block->start_ip = cnt;
1061       foreach_instr (instr, &block->instr_list) {
1062          instr->ip = cnt++;
1063       }
1064       block->end_ip = cnt;
1065    }
1066    return cnt;
1067 }
1068 
1069 unsigned
1070 ir3_count_instructions_sched(struct ir3 *ir)
1071 {
1072    unsigned cnt = 1;
1073    foreach_block (block, &ir->block_list) {
1074       block->start_ip = cnt;
1075       foreach_instr (instr, &block->instr_list) {
1076          if (!is_terminator(instr))
1077             instr->ip = cnt++;
1078       }
1079       block->end_ip = cnt;
1080    }
1081    return cnt;
1082 }
1083 
1084 /* When counting instructions for RA, we insert extra fake instructions at the
1085  * beginning of each block, where values become live, and at the end, where
1086  * values die. This prevents values that are live-in at the beginning or
1087  * live-out at the end of a block from being treated as if they were
1088  * live-in/live-out at the first/last instruction, which would be incorrect.
1089  * In ir3_legalize these ips are assumed to be actual ips of the final
1090  * program, so it would be incorrect to use this everywhere.
1091  */
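/* For example (illustrative), a block containing two instructions gets
 * start_ip = 1, instruction ips 2 and 3, and end_ip = 4 here, whereas
 * ir3_count_instructions() would assign start_ip = 1, ips 1 and 2, and
 * end_ip = 3 for the same block.
 */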
1092 
1093 unsigned
1094 ir3_count_instructions_ra(struct ir3 *ir)
1095 {
1096    unsigned cnt = 1;
1097    foreach_block (block, &ir->block_list) {
1098       block->start_ip = cnt++;
1099       foreach_instr (instr, &block->instr_list) {
1100          instr->ip = cnt++;
1101       }
1102       block->end_ip = cnt++;
1103    }
1104    return cnt;
1105 }
1106 
1107 struct ir3_array *
1108 ir3_lookup_array(struct ir3 *ir, unsigned id)
1109 {
1110    foreach_array (arr, &ir->array_list)
1111       if (arr->id == id)
1112          return arr;
1113    return NULL;
1114 }
1115 
1116 void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter)
1117 {
1118    /* We could do this in a single pass if we could assume instructions
1119     * are always sorted, which currently might not always be true.
1120     * (In particular after the ir3_group pass, but maybe other places.)
1121     */
1122    foreach_block (block, &ir->block_list)
1123       foreach_instr (instr, &block->instr_list)
1124          instr->uses = NULL;
1125 
1126    foreach_block (block, &ir->block_list) {
1127       foreach_instr (instr, &block->instr_list) {
1128          foreach_ssa_src_n (src, n, instr) {
1129             if (!filter(instr, n))
1130                continue;
1131             if (!src->uses)
1132                src->uses = _mesa_pointer_set_create(mem_ctx);
1133             _mesa_set_add(src->uses, instr);
1134          }
1135       }
1136    }
1137 }
1138 
1139 static bool
1140 no_false_deps(struct ir3_instruction *instr, unsigned src_n)
1141 {
1142    return !__is_false_dep(instr, src_n);
1143 }
1144 
1145 static bool
1146 any_src(struct ir3_instruction *instr, unsigned src_n)
1147 {
1148    return true;
1149 }
1150 
1151 void
1152 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
1153 {
1154    if (falsedeps)
1155       return ir3_find_ssa_uses_for(ir, mem_ctx, any_src);
1156    return ir3_find_ssa_uses_for(ir, mem_ctx, no_false_deps);
1157 }
1158 
1159 /**
1160  * Set the destination type of an instruction, for example if a
1161  * conversion is folded in, handling the special cases where the
1162  * instruction's dest type or opcode needs to be fixed up.
1163  */
1164 void
1165 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
1166 {
1167    if (half) {
1168       instr->dsts[0]->flags |= IR3_REG_HALF;
1169    } else {
1170       instr->dsts[0]->flags &= ~IR3_REG_HALF;
1171    }
1172 
1173    switch (opc_cat(instr->opc)) {
1174    case 1: /* move instructions */
1175       if (half) {
1176          instr->cat1.dst_type = half_type(instr->cat1.dst_type);
1177       } else {
1178          instr->cat1.dst_type = full_type(instr->cat1.dst_type);
1179       }
1180       break;
1181    case 4:
1182       if (half) {
1183          instr->opc = cat4_half_opc(instr->opc);
1184       } else {
1185          instr->opc = cat4_full_opc(instr->opc);
1186       }
1187       break;
1188    case 5:
1189       if (half) {
1190          instr->cat5.type = half_type(instr->cat5.type);
1191       } else {
1192          instr->cat5.type = full_type(instr->cat5.type);
1193       }
1194       break;
1195    }
1196 }
1197 
1198 /**
1199  * One-time fixup for instruction src-types.  Other than cov's that
1200  * are folded, an instruction's src type does not change.
1201  */
1202 void
1203 ir3_fixup_src_type(struct ir3_instruction *instr)
1204 {
1205    if (instr->srcs_count == 0)
1206       return;
1207 
1208    switch (opc_cat(instr->opc)) {
1209    case 1: /* move instructions */
1210       if (instr->srcs[0]->flags & IR3_REG_HALF) {
1211          instr->cat1.src_type = half_type(instr->cat1.src_type);
1212       } else {
1213          instr->cat1.src_type = full_type(instr->cat1.src_type);
1214       }
1215       break;
1216    case 3:
1217       if (instr->srcs[0]->flags & IR3_REG_HALF) {
1218          instr->opc = cat3_half_opc(instr->opc);
1219       } else {
1220          instr->opc = cat3_full_opc(instr->opc);
1221       }
1222       break;
1223    }
1224 }
1225 
1226 /**
1227  * Map a floating point immed to a FLUT (float lookup table) value;
1228  * returns negative for immediates that cannot be mapped.
1229  */
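/* For example, a full-precision source holding 0x3f800000 (1.0) maps to
 * index 2 below, while a value such as 0.3 has no table entry and yields -1.
 */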
1230 int
1231 ir3_flut(struct ir3_register *src_reg)
1232 {
1233    static const struct {
1234       uint32_t f32;
1235       uint16_t f16;
1236    } flut[] = {
1237          { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
1238          { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
1239          { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
1240          { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
1241          { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
1242          { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
1243          { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
1244          { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
1245          { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
1246          { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
1247          { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
1248          { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
1249    };
1250 
1251    if (src_reg->flags & IR3_REG_HALF) {
1252       /* Note that half-float immeds are already lowered to 16b in nir: */
1253       uint32_t imm = src_reg->uim_val;
1254       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1255          if (flut[i].f16 == imm) {
1256             return i;
1257          }
1258       }
1259    } else {
1260       uint32_t imm = src_reg->uim_val;
1261       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1262          if (flut[i].f32 == imm) {
1263             return i;
1264          }
1265       }
1266    }
1267 
1268    return -1;
1269 }
1270 
1271 static unsigned
1272 cp_flags(unsigned flags)
1273 {
1274    /* only considering these flags (at least for now): */
1275    flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
1276              IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
1277              IR3_REG_SHARED);
1278    return flags;
1279 }
1280 
1281 bool
1282 ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
1283 {
1284    struct ir3_compiler *compiler = instr->block->shader->compiler;
1285    unsigned valid_flags;
1286 
1287    flags = cp_flags(flags);
1288 
1289    /* If destination is indirect, then source cannot be.. at least
1290     * I don't think so..
1291     */
1292    if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
1293        (flags & IR3_REG_RELATIV))
1294       return false;
1295 
1296    if (flags & IR3_REG_RELATIV) {
1297       /* TODO need to test on earlier gens.. pretty sure the earlier
1298        * problem was just that we didn't check that the src was from
1299        * same block (since we can't propagate address register values
1300        * across blocks currently)
1301        */
1302       if (compiler->gen < 6)
1303          return false;
1304 
1305       /* NOTE in the special try_swap_mad_two_srcs() case we can be
1306        * called on a src that has already had an indirect load folded
1307        * in, in which case ssa() returns NULL
1308        */
1309       if (instr->srcs[n]->flags & IR3_REG_SSA) {
1310          struct ir3_instruction *src = ssa(instr->srcs[n]);
1311          if (src->address->def->instr->block != instr->block)
1312             return false;
1313       }
1314    }
1315 
1316    if (is_meta(instr)) {
1317       /* collect and phi nodes support const/immed sources, which will be
1318        * turned into move instructions, but not anything else.
1319        */
1320       if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
1321          return false;
1322 
1323       /* Except for immed/const sources, source and dest shared-ness must match.
1324        */
1325       if (!(flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
1326           (flags & IR3_REG_SHARED) != (instr->dsts[0]->flags & IR3_REG_SHARED))
1327          return false;
1328 
1329       return true;
1330    }
1331 
1332    switch (opc_cat(instr->opc)) {
1333    case 0: /* end, chmask */
1334       return flags == 0;
1335    case 1:
1336       switch (instr->opc) {
1337       case OPC_MOVMSK:
1338       case OPC_SWZ:
1339       case OPC_SCT:
1340       case OPC_GAT:
1341          valid_flags = IR3_REG_SHARED;
1342          break;
1343       case OPC_SCAN_MACRO:
1344          if (n == 0)
1345             return flags == 0;
1346          else
1347             return flags == IR3_REG_SHARED;
1348          break;
1349       case OPC_SCAN_CLUSTERS_MACRO:
1350          if (n == 0)
1351             return flags == IR3_REG_SHARED;
1352          else
1353             return flags == 0;
1354          break;
1355       default: {
1356          valid_flags =
1357             IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
1358 
1359          /* floating-point conversions when moving from non-shared to shared
1360           * seem not to work. We only use floating-point types in ir3 for
1361           * conversions, so don't bother specially handling the case where the
1362           * types are equal. Same goes for 8-bit sign extension.
1363           */
1364          if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
1365              !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
1366              ((full_type(instr->cat1.src_type) == TYPE_F32 ||
1367                full_type(instr->cat1.dst_type) == TYPE_F32) ||
1368               (instr->cat1.src_type == TYPE_U8 &&
1369                full_type(instr->cat1.dst_type) == TYPE_S32)))
1370             return false;
1371 
1372          /* Conversions seem not to work in shared->shared copies before scalar
1373           * ALU is supported.
1374           */
1375          if (!compiler->has_scalar_alu &&
1376              (flags & IR3_REG_SHARED) &&
1377              (instr->dsts[0]->flags & IR3_REG_SHARED) &&
1378              instr->cat1.src_type != instr->cat1.dst_type)
1379             return false;
1380       }
1381       }
1382       if (flags & ~valid_flags)
1383          return false;
1384       break;
1385    case 2:
1386       valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
1387                     IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;
1388 
1389       if (flags & ~valid_flags)
1390          return false;
1391 
1392       /* Allow an immediate src1 for flat.b, since it's ignored */
1393       if (instr->opc == OPC_FLAT_B &&
1394           n == 1 && flags == IR3_REG_IMMED)
1395          return true;
1396 
1397       /* cat2/cat3 scalar ALU instructions must not have regular sources. */
1398       if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1399          if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1400             return false;
1401       }
1402 
1403       if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
1404          unsigned m = n ^ 1;
1405          /* cannot deal w/ const or shared in both srcs:
1406           * (note that some cat2 actually only have a single src)
1407           */
1408          if (m < instr->srcs_count) {
1409             struct ir3_register *reg = instr->srcs[m];
1410             if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1411                if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
1412                   return false;
1413             } else {
1414                if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
1415                    (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
1416                   return false;
1417             }
1418             if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
1419                return false;
1420          }
1421       }
1422       break;
1423    case 3:
1424       valid_flags =
1425          ir3_cat3_absneg(instr->opc, n) | IR3_REG_RELATIV | IR3_REG_SHARED;
1426 
1427       switch (instr->opc) {
1428       case OPC_SHRM:
1429       case OPC_SHLM:
1430       case OPC_SHRG:
1431       case OPC_SHLG:
1432       case OPC_ANDG: {
1433          if (n != 1) {
1434             valid_flags |= IR3_REG_IMMED;
1435          }
1436 
1437          /* Can be RELATIV+CONST but not CONST: */
1438          if (flags & IR3_REG_RELATIV)
1439             valid_flags |= IR3_REG_CONST;
1440 
1441          if (!(instr->dsts[0]->flags & IR3_REG_SHARED) && n < 2) {
1442             /* Of the first two sources, only one can be shared. */
1443             unsigned m = n ^ 1;
1444 
1445             if ((flags & IR3_REG_SHARED) &&
1446                 (instr->srcs[m]->flags & IR3_REG_SHARED)) {
1447                return false;
1448             }
1449          }
1450          break;
1451       }
1452       case OPC_WMM:
1453       case OPC_WMM_ACCU: {
1454          valid_flags = IR3_REG_SHARED;
1455          if (n == 2)
1456             valid_flags = IR3_REG_CONST;
1457          break;
1458       }
1459       case OPC_DP2ACC:
1460       case OPC_DP4ACC:
1461          break;
1462       default:
1463          valid_flags |= IR3_REG_CONST;
1464       }
1465 
1466       if (flags & ~valid_flags)
1467          return false;
1468 
1469       if (flags & (IR3_REG_CONST | IR3_REG_RELATIV) ||
1470           (!(instr->dsts[0]->flags & IR3_REG_SHARED) &&
1471            (flags & IR3_REG_SHARED))) {
1472          /* cannot deal w/ const/shared/relativ in 2nd src: */
1473          if (n == 1)
1474             return false;
1475       }
1476 
1477       if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1478          if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1479             return false;
1480       }
1481 
1482       break;
1483    case 4:
1484       if ((instr->dsts[0]->flags & IR3_REG_SHARED) != (flags & IR3_REG_SHARED))
1485          return false;
1486       /* seems like blob compiler avoids const as src.. */
1487       /* TODO double check if this is still the case on a4xx */
1488       if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
1489          return false;
1490       if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
1491          return false;
1492       break;
1493    case 5:
1494       if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
1495          if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
1496              (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
1497             return flags == IR3_REG_IMMED;
1498          }
1499       }
1500       /* no flags allowed */
1501       if (flags)
1502          return false;
1503       break;
1504    case 6:
1505       valid_flags = IR3_REG_IMMED;
1506 
1507       if (instr->opc == OPC_STC && n == 1)
1508          valid_flags |= IR3_REG_SHARED;
1509       if (instr->opc == OPC_SHFL) {
1510          if (n == 0)
1511             valid_flags &= ~IR3_REG_IMMED;
1512          else if (n == 1)
1513             valid_flags |= IR3_REG_SHARED;
1514       }
1515 
1516       if (flags & ~valid_flags)
1517          return false;
1518 
1519       if (flags & IR3_REG_IMMED) {
1520          /* doesn't seem like we can have immediate src for store
1521           * instructions:
1522           *
1523           * TODO this restriction could also apply to load instructions,
1524           * but for load instructions this arg is the address (and not
1525           * really sure any good way to test a hard-coded immed addr src)
1526           */
1527          if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
1528             return false;
1529 
1530          if ((instr->opc == OPC_LDL) && (n == 0))
1531             return false;
1532 
1533          if ((instr->opc == OPC_STL) && (n != 2))
1534             return false;
1535 
1536          if ((instr->opc == OPC_LDP) && (n == 0))
1537             return false;
1538 
1539          if ((instr->opc == OPC_STP) && (n != 2))
1540             return false;
1541 
1542          if (instr->opc == OPC_STLW && n == 0)
1543             return false;
1544 
1545          if (instr->opc == OPC_LDLW && n == 0)
1546             return false;
1547 
1548          /* disallow immediates in anything but the SSBO slot argument for
1549           * cat6 instructions:
1550           */
1551          if (is_global_a3xx_atomic(instr->opc) && (n != 0))
1552             return false;
1553 
1554          if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
1555              is_bindless_atomic(instr->opc))
1556             return false;
1557 
1558          if (instr->opc == OPC_STG && (n == 2))
1559             return false;
1560 
1561          if (instr->opc == OPC_STG_A && (n == 4))
1562             return false;
1563 
1564          if (instr->opc == OPC_LDG && (n == 0))
1565             return false;
1566 
1567          if (instr->opc == OPC_LDG_A && (n < 2))
1568             return false;
1569 
1570          if (instr->opc == OPC_STC && n != 0)
1571             return false;
1572 
1573          /* as with atomics, these cat6 instrs can only have an immediate
1574           * for SSBO/IBO slot argument
1575           */
1576          switch (instr->opc) {
1577          case OPC_LDIB:
1578          case OPC_STIB:
1579             if (n != 0 && n != 2)
1580                return false;
1581             break;
1582          case OPC_RESINFO:
1583             if (n != 0)
1584                return false;
1585             break;
1586          default:
1587             break;
1588          }
1589       }
1590 
1591       break;
1592    }
1593 
1594    return true;
1595 }
1596 
1597 bool
1598 ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
1599 {
1600    if (instr->opc == OPC_MOV || is_meta(instr) || instr->opc == OPC_ALIAS)
1601       return true;
1602 
1603    if (is_mem(instr)) {
1604       switch (instr->opc) {
1605       /* Some load/store instructions have a 13-bit offset and size which must
1606        * always be an immediate and the rest of the sources cannot be
1607        * immediates, so the frontend is responsible for checking the size:
1608        */
1609       case OPC_LDL:
1610       case OPC_STL:
1611       case OPC_LDP:
1612       case OPC_STP:
1613       case OPC_LDG:
1614       case OPC_STG:
1615       case OPC_SPILL_MACRO:
1616       case OPC_RELOAD_MACRO:
1617       case OPC_LDG_A:
1618       case OPC_STG_A:
1619       case OPC_LDLW:
1620       case OPC_STLW:
1621       case OPC_LDLV:
1622          return true;
1623       default:
1624          /* most cat6 src immediates can only encode 8 bits: */
1625          return !(immed & ~0xff);
1626       }
1627    }
1628 
1629    /* The alternative cat3 encoding used for sh[lr][gm]/andg uses 12 bit
1630     * immediates that won't be sign-extended.
1631     */
1632    if (is_cat3_alt(instr->opc)) {
1633       return !(immed & ~0xfff);
1634    }
1635 
1636    /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
1637    return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
1638 }
1639 
1640 struct ir3_instruction *
1641 ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr)
1642 {
1643    /* If instr is a negation (likely as a result of an nir_b2n), we can ignore
1644     * that and use its source, since the nonzero-ness stays the same.
1645     */
1646    if (instr->opc == OPC_ABSNEG_S && instr->flags == 0 &&
1647        (instr->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
1648           IR3_REG_SNEG) {
1649       return instr->srcs[0]->def->instr;
1650    }
1651 
1652    return instr;
1653 }
1654 
1655 bool
1656 ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc)
1657 {
1658    switch (opc_cat(opc)) {
1659    case 0:
1660       return opc == OPC_NOP;
1661    case 1:
1662       return opc == OPC_MOV || opc == OPC_SWZ || opc == OPC_MOVMSK;
1663    case 2:
1664       if (opc == OPC_BARY_F && !compiler->has_rpt_bary_f)
1665          return false;
1666       return true;
1667    case 3:
1668       return opc != OPC_DP2ACC && opc != OPC_DP4ACC;
1669    case 4:
1670       return opc != OPC_RCP;
1671    default:
1672       return false;
1673    }
1674 }
1675