1 /*
2  * Copyright © 2013 Rob Clark <robdclark@gmail.com>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #ifndef IR3_H_
7 #define IR3_H_
8 
9 #include <stdbool.h>
10 #include <stdint.h>
11 
12 #include "compiler/shader_enums.h"
13 
14 #include "util/bitscan.h"
15 #include "util/list.h"
16 #include "util/set.h"
17 #include "util/u_debug.h"
18 
19 #include "freedreno_common.h"
20 
21 #include "instr-a3xx.h"
22 
23 /* low level intermediate representation of an adreno shader program */
24 
25 struct ir3_compiler;
26 struct ir3;
27 struct ir3_instruction;
28 struct ir3_block;
29 
/* Post-assembly statistics / metadata describing a compiled shader binary. */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t preamble_instrs_count;
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
78 
/* A set of registers that should be allocated together (coalesced), used
 * during register allocation.  NOTE(review): field semantics inferred from
 * the matching fields in struct ir3_register (merge_set/merge_set_offset,
 * interval_start, spill_slot) — confirm against the RA pass.
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* The member registers of this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
90 
/* Modifier/kind flags carried on each ir3_register (src or dst operand). */
typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer.  The CP pass needs to know what is intended (int or
    * float) in order to do the right thing.  For this reason the
    * abs/neg flags are split out into float and int variants.  In
    * addition, .b (bitwise) operations, the negate is actually a
    * bitwise not, so split that out into a new flag to make it
    * more clear.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input?  Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),
} ir3_register_flags;
159 
/* One operand (source or destination) of an ir3_instruction.  The flags
 * determine how the union and 'num' are interpreted (GPR vs const vs
 * immediate vs relative/array access).
 */
struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   /* Virtual register name — NOTE(review): appears to be the RA-level vreg
    * name (cf. ir3_array::base "base vreg name"); confirm against RA pass.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* Spilling/liveness bookkeeping (used by RA/spilling passes): */
   unsigned spill_slot, next_use;

   /* Coalescing state: offset of this register within its merge set. */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
225 
/*
 * Stupid/simple growable array implementation:
 */

/* Declares a growable array member 'name' together with its element count
 * (name_count) and allocated capacity (name_sz), for use with array_insert().
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Appends __VA_ARGS__ to 'arr', doubling the capacity (min 16) via
 * reralloc_size() against ralloc context 'ctx' when full.
 * NOTE: 'arr' is a token, not an expression — it is expanded multiple times.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
241 
/* Reduction operation selector, stored in the cat1 payload (reduce_op).
 * Suffixes: _U unsigned int, _S signed int, _F float, _B bitwise.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
257 
/* Scope selector stored in the cat7 payload (alias_scope).  The values are
 * non-contiguous — presumably they match the hw encoding; confirm against
 * the encoder.
 */
typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;
263 
/* Shuffle mode, stored in the cat6 payload (shfl_mode, 3 bits).  Values are
 * non-contiguous — presumably hw encoding; confirm against the encoder.
 */
typedef enum {
   SHFL_XOR = 1,
   SHFL_UP = 2,
   SHFL_DOWN = 3,
   SHFL_RUP = 6,
   SHFL_RDOWN = 7,
} ir3_shfl_mode;
271 
/* Per-instruction modifier flags (sync bits, encoding modifiers, and
 * IR-internal meta flags).
 */
typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy.  So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload instructions
    * to inform validation that this is created by RA. This also may be set on
    * an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register to a
    * shared register that may be read by other invocations. This factors into
    * (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one of
    * their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;
364 
/* A single IR instruction: opcode + flags + dst/src register arrays, plus a
 * per-category payload in the anonymous union below (which member is valid
 * depends on opc's category).
 */
struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat; /* (rptN) count; see rpt_node below */
   uint8_t nop;    /* NOTE(review): presumably the (nopN) suffix count, cf.
                    * ir3_info::nops_count "including nopN" — confirm */
#if MESA_DEBUG
   /* Allocated capacity, to assert against overflow of srcs/dsts: */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      /* flow control (branches/jumps): */
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      /* mov/cov and friends: */
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      /* texture/sample instructions: */
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      /* memory instructions: */
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
         ir3_shfl_mode shfl_mode : 3;
      } cat6;
      /* barriers etc.: */
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions are
    * emitted as multiple scalar instructions that are linked together using
    * this field. After RA, the ir3_combine_rpt pass iterates these groups and,
    * if the register assignment allows it, merges them into a (rptN)
    * instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The list
    * head is stored in the first instruction of the repeat group so also refers
    * to a list entry. In order to distinguish the list's first entry, we use
    * serialno: instructions in a repeat group are always emitted consecutively
    * so the first will have the lowest serialno.
    *
    * As this is not a typical list, we have to be careful with using the
    * existing list helper. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in its
    * group.
    */
   struct list_head rpt_node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
574 
/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   /* Up to four scalar instructions forming one repeat group. */
   struct ir3_instruction *rpts[4];
};
581 
/* Top-level container for one shader program: the list of basic blocks plus
 * various instruction-tracking side tables used by later passes.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
632 
/* A register array accessed via relative (a0-indexed) addressing. */
struct ir3_array {
   /* Entry in ir3::array_list: */
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
658 
659 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
660 
/* A basic block in the shader's control-flow graph. */
struct ir3_block {
   /* Entry in ir3::block_list: */
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_block *successors[2];

   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Dominator-tree state (see ir3_calc_dominance()): */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno;
#endif
};
721 
/* Which kind of location an ir3_cursor refers to (and thus which union
 * member of struct ir3_cursor is valid).
 */
enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};
728 
/* An insertion point for new instructions: either an edge of a block or a
 * position relative to an existing instruction, selected by 'option'.
 */
struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      /* Valid for IR3_CURSOR_BEFORE_BLOCK / IR3_CURSOR_AFTER_BLOCK: */
      struct ir3_block *block;
      /* Valid for IR3_CURSOR_BEFORE_INSTR / IR3_CURSOR_AFTER_INSTR: */
      struct ir3_instruction *instr;
   };
};
736 
/* Builder state for emitting instructions at a cursor position
 * (see ir3_build_instr()).
 */
struct ir3_builder {
   struct ir3_cursor cursor;
};
740 
/* Returns an identifier for a block, for debug output.  Debug builds use the
 * block's serial number; otherwise an id is derived from the block's address
 * (truncated to 32 bits).
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   /* Convert via uintptr_t rather than unsigned long: on LLP64 targets
    * (e.g. 64-bit Windows) unsigned long is only 32 bits, so casting a
    * 64-bit pointer through it is not a well-defined round-trip type.
    * The final truncation to uint32_t is intentional either way.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
750 
/* Returns the first block of the shader (the CFG entry block). */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
756 
/* Returns the last block in the shader's block list. */
static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}
762 
763 struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);
764 
765 struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);
766 
767 struct ir3_instruction *
768 ir3_block_get_last_non_terminator(struct ir3_block *block);
769 
770 struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);
771 
772 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)773 ir3_after_preamble(struct ir3 *ir)
774 {
775    struct ir3_block *block = ir3_start_block(ir);
776    /* The preamble will have a usually-empty else branch, and we want to skip
777     * that to get to the block after the preamble.
778     */
779    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
780    if (terminator && (terminator->opc == OPC_SHPS))
781       return block->successors[1]->successors[0];
782    else
783       return block;
784 }
785 
786 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
787 void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
788 void ir3_block_remove_predecessor(struct ir3_block *block,
789                                   struct ir3_block *pred);
790 unsigned ir3_block_get_pred_index(struct ir3_block *block,
791                                   struct ir3_block *pred);
792 
793 void ir3_calc_dominance(struct ir3 *ir);
794 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
795 
796 struct ir3_shader_variant;
797 
798 struct ir3 *ir3_create(struct ir3_compiler *compiler,
799                        struct ir3_shader_variant *v);
800 void ir3_destroy(struct ir3 *shader);
801 
802 void ir3_collect_info(struct ir3_shader_variant *v);
803 void *ir3_alloc(struct ir3 *shader, int sz);
804 
805 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
806                                          unsigned reg_count,
807                                          bool double_threadsize);
808 
809 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
810                                            bool double_threadsize);
811 
812 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
813                                   unsigned regs_count);
814 
815 struct ir3_block *ir3_block_create(struct ir3 *shader);
816 
817 struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
818                                         int ndst, int nsrc);
819 struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
820                                             int ndst, int nsrc);
821 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
822                                          int ndst, int nsrc);
823 struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
824                                                 opc_t opc, int ndst, int nsrc);
825 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
826 void ir3_instr_add_dep(struct ir3_instruction *instr,
827                        struct ir3_instruction *dep);
828 const char *ir3_instr_name(struct ir3_instruction *instr);
829 void ir3_instr_remove(struct ir3_instruction *instr);
830 
831 void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
832 bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
833 bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
834 struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
835 struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
836 unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);
837 
838 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
839                                     int flags);
840 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
841                                     int flags);
842 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
843                                    struct ir3_register *reg);
844 
845 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)846 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
847 {
848    assert(!dst->tied && !src->tied);
849    dst->tied = src;
850    src->tied = dst;
851 }
852 
853 void ir3_reg_set_last_array(struct ir3_instruction *instr,
854                             struct ir3_register *reg,
855                             struct ir3_register *last_write);
856 
857 void ir3_instr_set_address(struct ir3_instruction *instr,
858                            struct ir3_instruction *addr);
859 
860 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)861 ir3_instr_check_mark(struct ir3_instruction *instr)
862 {
863    if (instr->flags & IR3_INSTR_MARK)
864       return true; /* already visited */
865    instr->flags |= IR3_INSTR_MARK;
866    return false;
867 }
868 
869 void ir3_block_clear_mark(struct ir3_block *block);
870 void ir3_clear_mark(struct ir3 *shader);
871 
872 unsigned ir3_count_instructions(struct ir3 *ir);
873 unsigned ir3_count_instructions_sched(struct ir3 *ir);
874 unsigned ir3_count_instructions_ra(struct ir3 *ir);
875 
876 /**
877  * Move 'instr' to just before 'after'
878  */
/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   /* Unlink from the current position, then insert on the tail side of
    * 'after', i.e. immediately preceding it.
    */
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}
886 
/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   /* Unlink from the current list, then insert immediately after 'before'
    * in its instruction list:
    */
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}
897 
/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   /* Unlink from the current list, then insert at the head of the block's
    * instruction list:
    */
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
908 
909 typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);
910 
911 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
912 void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);
913 
914 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
915 void ir3_fixup_src_type(struct ir3_instruction *instr);
916 
917 int ir3_flut(struct ir3_register *src_reg);
918 
919 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
920 
921 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
922 
923 /**
924  * Given an instruction whose result we want to test for nonzero, return a
925  * potentially different instruction for which the result would be the same.
926  * This might be one of its sources if instr doesn't change the nonzero-ness.
927  */
928 struct ir3_instruction *
929 ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);
930 
931 bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);
932 
933 #include "util/set.h"
/* Iterate all SSA uses of an instruction.  Requires that ir3_find_ssa_uses()
 * has populated (__instr)->uses; iterates nothing otherwise.  The outer
 * single-iteration loop only exists to scope the __use declaration.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
939 
940 static inline uint32_t
reg_num(const struct ir3_register * reg)941 reg_num(const struct ir3_register *reg)
942 {
943    return reg->num >> 2;
944 }
945 
946 static inline uint32_t
reg_comp(const struct ir3_register * reg)947 reg_comp(const struct ir3_register *reg)
948 {
949    return reg->num & 0x3;
950 }
951 
952 static inline bool
is_flow(struct ir3_instruction * instr)953 is_flow(struct ir3_instruction *instr)
954 {
955    return (opc_cat(instr->opc) == 0);
956 }
957 
/* Is this one of the cat0 ops that ends a basic block (branches, jumps,
 * and the branch-like subgroup/predication ops)?
 */
static inline bool
is_terminator(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BR:
   case OPC_JUMP:
   case OPC_BANY:
   case OPC_BALL:
   case OPC_BRAA:
   case OPC_BRAO:
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
   case OPC_PREDT:
   case OPC_PREDF:
      return true;
   default:
      return false;
   }
}
978 
979 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)980 is_kill_or_demote(struct ir3_instruction *instr)
981 {
982    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
983 }
984 
985 static inline bool
is_nop(struct ir3_instruction * instr)986 is_nop(struct ir3_instruction *instr)
987 {
988    return instr->opc == OPC_NOP;
989 }
990 
991 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)992 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
993 {
994    unsigned dst_type = (dst->flags & IR3_REG_HALF);
995    unsigned src_type = (src->flags & IR3_REG_HALF);
996 
997    /* Treat shared->normal copies and normal->shared copies as same-type. */
998    return dst_type == src_type;
999 }
1000 
/* Is it a non-transformative (ie. not type changing) mov?  This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      /* A mov that converts between src/dst types is not a plain copy: */
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      /* Saturation changes the value, so not copy-like: */
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->flags & IR3_REG_PREDICATE)
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   /* Relative/array destinations aren't treated as simple copies either: */
   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}
1047 
/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   /* Only moves whose source is a const register qualify: */
   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* Allow a narrowing move, but not a widening one.  A narrowing
    * move from full c1.x can be folded into a hc1.x use in an ALU
    * instruction because it is doing the same thing as constant-
    * demotion.  If CONSTANT_DEMOTION_ENABLE wasn't set, we'd need
    * return false in all cases.
    */
   if ((type_size(dst_type) > type_size(src_type)) ||
       (type_size(dst_type) == 8))
      return false;

   /* Src and dst must be in the same class (float/uint/sint), so the
    * conversion is only a size change, not a reinterpretation:
    */
   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}
1077 
/* Is this one of the subgroup-operation macro opcodes (pseudo-ops that get
 * expanded later in compilation)?
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_GETLAST_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      return true;
   default:
      return false;
   }
}
1096 
1097 static inline bool
is_alu(struct ir3_instruction * instr)1098 is_alu(struct ir3_instruction *instr)
1099 {
1100    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
1101 }
1102 
1103 static inline bool
is_sfu(struct ir3_instruction * instr)1104 is_sfu(struct ir3_instruction *instr)
1105 {
1106    return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
1107 }
1108 
1109 static inline bool
is_tex(struct ir3_instruction * instr)1110 is_tex(struct ir3_instruction *instr)
1111 {
1112    return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
1113 }
1114 
/* Is this one of the quad-shuffle / broadcast ops? */
static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}
1129 
1130 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)1131 is_tex_or_prefetch(struct ir3_instruction *instr)
1132 {
1133    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
1134 }
1135 
1136 static inline bool
is_mem(struct ir3_instruction * instr)1137 is_mem(struct ir3_instruction *instr)
1138 {
1139    return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
1140 }
1141 
1142 static inline bool
is_barrier(struct ir3_instruction * instr)1143 is_barrier(struct ir3_instruction *instr)
1144 {
1145    return (opc_cat(instr->opc) == 7);
1146 }
1147 
1148 static inline bool
is_half(struct ir3_instruction * instr)1149 is_half(struct ir3_instruction *instr)
1150 {
1151    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
1152 }
1153 
1154 static inline bool
is_shared(struct ir3_instruction * instr)1155 is_shared(struct ir3_instruction *instr)
1156 {
1157    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
1158 }
1159 
/* Is this a store-type memory instruction? */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1181 
/* Is this a load-type memory instruction? */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   case OPC_LDC:
      /* ldc with no dests doesn't actually load a value: */
      return instr->dsts_count > 0;
   default:
      return false;
   }
}
1203 
/* Is this an instruction that reads a shader input (varying fetch)? */
static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}
1220 
/* Whether non-helper invocations can read the value of helper invocations. We
 * cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the getone
    * in the preamble because it doesn't actually matter which fiber is
    * selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   default:
      return false;
   }
}
1272 
1273 static inline bool
is_bool(struct ir3_instruction * instr)1274 is_bool(struct ir3_instruction *instr)
1275 {
1276    switch (instr->opc) {
1277    case OPC_CMPS_F:
1278    case OPC_CMPS_S:
1279    case OPC_CMPS_U:
1280       return true;
1281    default:
1282       return false;
1283    }
1284 }
1285 
/* Map a cat3 full-precision opcode to its half-precision variant, or return
 * opc unchanged when there is no such variant.
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1304 
/* Map a cat3 half-precision opcode to its full-precision variant, or return
 * opc unchanged when there is no such variant.
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1323 
/* Map a cat4 full-precision opcode to its half-precision variant, or return
 * opc unchanged when there is no such variant.
 */
static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}
1338 
/* Map a cat4 half-precision opcode to its full-precision variant, or return
 * opc unchanged when there is no such variant.
 */
static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}
1353 
1354 static inline bool
is_meta(struct ir3_instruction * instr)1355 is_meta(struct ir3_instruction *instr)
1356 {
1357    return (opc_cat(instr->opc) == OPC_META);
1358 }
1359 
1360 static inline unsigned
reg_elems(const struct ir3_register * reg)1361 reg_elems(const struct ir3_register *reg)
1362 {
1363    if (reg->flags & IR3_REG_ARRAY)
1364       return reg->size;
1365    else
1366       return util_last_bit(reg->wrmask);
1367 }
1368 
1369 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1370 reg_elem_size(const struct ir3_register *reg)
1371 {
1372    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1373 }
1374 
/* Total size of the register in units of half-registers. */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   unsigned elems = reg_elems(reg);
   return elems * reg_elem_size(reg);
}
1380 
1381 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
1382  * and have to handle relative accesses specially.
1383  */
1384 
1385 static inline unsigned
post_ra_reg_elems(struct ir3_register * reg)1386 post_ra_reg_elems(struct ir3_register *reg)
1387 {
1388    if (reg->flags & IR3_REG_RELATIV)
1389       return reg->size;
1390    return reg_elems(reg);
1391 }
1392 
1393 static inline unsigned
post_ra_reg_num(struct ir3_register * reg)1394 post_ra_reg_num(struct ir3_register *reg)
1395 {
1396    if (reg->flags & IR3_REG_RELATIV)
1397       return reg->array.base;
1398    return reg->num;
1399 }
1400 
1401 static inline unsigned
dest_regs(struct ir3_instruction * instr)1402 dest_regs(struct ir3_instruction *instr)
1403 {
1404    if (instr->dsts_count == 0)
1405       return 0;
1406 
1407    assert(instr->dsts_count == 1);
1408    return util_last_bit(instr->dsts[0]->wrmask);
1409 }
1410 
/* Is reg an actual GPR (not const/immed/predicate and not a0)? */
static inline bool
is_reg_gpr(const struct ir3_register *reg)
{
   if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE))
      return false;
   if (reg_num(reg) == REG_A0)
      return false;
   /* A non-SSA, non-relative reg with no assigned number isn't a GPR: */
   if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
       reg->num == INVALID_REG)
      return false;
   return true;
}
1423 
1424 static inline bool
is_reg_a0(const struct ir3_register * reg)1425 is_reg_a0(const struct ir3_register *reg)
1426 {
1427    if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1428       return false;
1429    return reg->num == regid(REG_A0, 0);
1430 }
1431 
1432 /* is dst a normal temp register: */
1433 static inline bool
is_dest_gpr(const struct ir3_register * dst)1434 is_dest_gpr(const struct ir3_register *dst)
1435 {
1436    if (dst->wrmask == 0)
1437       return false;
1438    return is_reg_gpr(dst);
1439 }
1440 
1441 static inline bool
writes_gpr(struct ir3_instruction * instr)1442 writes_gpr(struct ir3_instruction *instr)
1443 {
1444    if (dest_regs(instr) == 0)
1445       return false;
1446    return is_dest_gpr(instr->dsts[0]);
1447 }
1448 
1449 static inline bool
writes_addr0(struct ir3_instruction * instr)1450 writes_addr0(struct ir3_instruction *instr)
1451 {
1452    /* Note: only the first dest can write to a0.x */
1453    if (instr->dsts_count > 0) {
1454       struct ir3_register *dst = instr->dsts[0];
1455       return dst->num == regid(REG_A0, 0);
1456    }
1457    return false;
1458 }
1459 
1460 static inline bool
writes_addr1(struct ir3_instruction * instr)1461 writes_addr1(struct ir3_instruction *instr)
1462 {
1463    /* Note: only the first dest can write to a1.x */
1464    if (instr->dsts_count > 0) {
1465       struct ir3_register *dst = instr->dsts[0];
1466       return dst->num == regid(REG_A0, 1);
1467    }
1468    return false;
1469 }
1470 
1471 static inline bool
writes_pred(struct ir3_instruction * instr)1472 writes_pred(struct ir3_instruction *instr)
1473 {
1474    /* Note: only the first dest can write to p0 */
1475    if (instr->dsts_count > 0) {
1476       struct ir3_register *dst = instr->dsts[0];
1477       return !!(dst->flags & IR3_REG_PREDICATE);
1478    }
1479    return false;
1480 }
1481 
/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that aren't
 * assigned by RA.
 */
#define GPR_REG_SIZE (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE (4 * 8)

/* Register file a register belongs to, for aliasing checks (see
 * ir3_reg_file_offset()):
 */
enum ir3_reg_file {
   IR3_FILE_FULL,   /* full (and, with mergedregs, half) GPRs */
   IR3_FILE_HALF,   /* half GPRs when they form a separate file */
   IR3_FILE_SHARED, /* shared registers */
   IR3_FILE_NONGPR, /* a0/p0/etc., not assigned by RA */
};
1498 
/* Return a file + offset that can be used for determining if two registers
 * alias. The register is only really used for its flags, the num is taken from
 * the parameter. Registers overlap if they are in the same file and have an
 * overlapping offset. The offset is multiplied by 2 for full registers to
 * handle aliasing half and full registers, that is it's in units of half-regs.
 */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      /* With merged registers, half regs live in the full-reg file: */
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      /* Separate half file; nums are already in half-reg units: */
      *file = IR3_FILE_HALF;
      return num;
   }
}
1525 
1526 /* returns defining instruction for reg */
1527 /* TODO better name */
1528 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1529 ssa(struct ir3_register *reg)
1530 {
1531    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1532       return reg->def->instr;
1533    return NULL;
1534 }
1535 
1536 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1537 conflicts(struct ir3_register *a, struct ir3_register *b)
1538 {
1539    return (a && b) && (a->def != b->def);
1540 }
1541 
1542 static inline bool
reg_is_addr1(struct ir3_register * r)1543 reg_is_addr1(struct ir3_register *r)
1544 {
1545    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1546       return false;
1547    return r->num == regid(REG_A0, 1);
1548 }
1549 
/* Narrow a type to its 16-bit counterpart.  Already-narrow types pass
 * through unchanged; asserts on types with no half equivalent.
 */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
   case TYPE_U8_32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1572 
/* Widen a type to its 32-bit counterpart.  Already-wide types pass through
 * unchanged; asserts on types with no full equivalent.
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U8_32:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1594 
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   /* bary.f/flat.b are cat2-encoded varying fetches, included here too: */
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1639 
/* map cat2 instruction to valid abs/neg flags: returns the mask of
 * IR3_REG_* src-modifier flags the opcode accepts (0 if none).
 */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops take float abs/neg modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic takes no modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops take the bitwise-not modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1700 
/* map cat3 instructions to valid abs/neg flags: returns the mask of
 * IR3_REG_* src-modifier flags valid for source index 'src_n'.
 */
static inline unsigned
ir3_cat3_absneg(opc_t opc, unsigned src_n)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* Only the second source accepts (neg): */
      return src_n == 1 ? IR3_REG_SNEG : 0;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
      /* neg *may* work on 3rd src.. */
      /* fallthrough */

   case OPC_SEL_B16:
   case OPC_SEL_B32:
      /* fallthrough */

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      /* fallthrough */

   default:
      return 0;
   }
}
1741 
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1809 
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* Otherwise the source size determines the folded-in src type: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1849 
1850 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1851 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1852 {
1853    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1854                                                  : full_type(base_type);
1855 }
1856 
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * NOTE(review): *can_swap is only written on failure — callers appear to be
 * expected to initialize it to true beforehand.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
/* Expands to the two cases mapping each opcode of the pair to the other: */
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1883 
/* Bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #:
 *
 * The single-iteration outer loop (sentinel ~0, reset to NULL) only exists
 * to scope the __srcreg declaration to the iteration.
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterate only the sources matching the __filter predicate: */
#define foreach_src_if(__srcreg, __instr, __filter)                            \
   foreach_src (__srcreg, __instr)                                             \
      if (__filter(__srcreg))

/* iterator for an instructions's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)

/* iterate only the destinations matching the __filter predicate: */
#define foreach_dst_if(__dstreg, __instr, __filter)                            \
   foreach_dst (__dstreg, __instr)                                             \
      if (__filter(__dstreg))
1917 
/* Total "source" count for SSA iteration: the real register sources followed
 * by the scheduling-only (false) dependencies in deps[].
 */
static inline unsigned
__ssa_src_cnt(struct ir3_instruction *instr)
{
   return instr->srcs_count + instr->deps_count;
}
1923 
1924 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1925 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1926 {
1927    if (n >= instr->srcs_count)
1928       return true;
1929    return false;
1930 }
1931 
/* Returns a pointer to the n'th SSA source instruction pointer, where n
 * indexes srcs[] followed by deps[]. Returns NULL for non-SSA sources
 * (immediates, consts, etc).
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
1941 
/* iterator over pointers to an instruction's SSA sources (srcs + false deps),
 * also returns src #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

/* iterator over pointers to an instruction's SSA sources (srcs + false deps): */
#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs (NULL input slots are skipped): */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1969 
/* iterators for instructions (over a block's instr_list): */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* Iterate over all instructions in a repeat group, including __instr itself
 * (which must be the first instruction of the group).
 */
#define foreach_instr_rpt(__rpt, __instr)                                      \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      for (struct ir3_instruction *__rpt = __instr, *__first = __instr;        \
           __first || __rpt != __instr;                                        \
           __first = NULL, __rpt =                                             \
                              list_entry(__rpt->rpt_node.next,                 \
                                         struct ir3_instruction, rpt_node))

/* Iterate over all instructions except the first one in a repeat group. */
#define foreach_instr_rpt_excl(__rpt, __instr)                                 \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node,  \
                           rpt_node)

/* Removal-safe variant of foreach_instr_rpt_excl. */
#define foreach_instr_rpt_excl_safe(__rpt, __instr)                            \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry_safe (struct ir3_instruction, __rpt,                 \
                                &__instr->rpt_node, rpt_node)

/* iterators for blocks (over the shader's block_list): */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
2017 
/* Run an ir3 pass; if it reports progress, debug-print and validate the
 * resulting IR. Evaluates (via GCC statement expression) to whether the pass
 * made progress.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
2027 
2028 /* validate: */
2029 void ir3_validate(struct ir3 *ir);
2030 
2031 /* dump: */
2032 void ir3_print(struct ir3 *ir);
2033 void ir3_print_instr(struct ir3_instruction *instr);
2034 
2035 struct log_stream;
2036 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
2037 
2038 /* delay calculation: */
2039 int ir3_delayslots(struct ir3_compiler *compiler,
2040                    struct ir3_instruction *assigner,
2041                    struct ir3_instruction *consumer, unsigned n, bool soft);
2042 unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
2043                                     struct ir3_instruction *assigner,
2044                                     struct ir3_instruction *consumer,
2045                                     unsigned assigner_n, unsigned consumer_n);
2046 
2047 /* estimated (ss)/(sy) delay calculation */
2048 
2049 static inline bool
is_local_mem_load(struct ir3_instruction * instr)2050 is_local_mem_load(struct ir3_instruction *instr)
2051 {
2052    return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
2053       instr->opc == OPC_LDLW;
2054 }
2055 
2056 bool is_scalar_alu(struct ir3_instruction *instr,
2057                    const struct ir3_compiler *compiler);
2058 
/* Does this instruction sometimes need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   /* Any write to a shared register is synchronized with (ss): */
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }

   /* In an early preamble, a1.x writes are also (ss)-synchronized: */
   if (instr->block->in_early_preamble && writes_addr1(instr))
      return true;

   return is_sfu(instr) || is_local_mem_load(instr) || instr->opc == OPC_SHFL;
}
2073 
/* Does 'consumer' need (ss) to wait on 'producer'? Scalar ALU results can be
 * consumed by scalar ALU instructions of matching precision (IR3_REG_HALF)
 * without synchronization.
 *
 * NOTE(review): assumes a scalar-ALU consumer has at least one src
 * (srcs[0]) — confirm against callers.
 */
static inline bool
needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
         struct ir3_instruction *consumer)
{
   if (is_scalar_alu(producer, compiler) &&
       is_scalar_alu(consumer, compiler) &&
       (producer->dsts[0]->flags & IR3_REG_HALF) ==
       (consumer->srcs[0]->flags & IR3_REG_HALF))
      return false;

   return is_ss_producer(producer);
}
2086 
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
    * using nop's instead of (ss) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share an
    * SFU unit). But 10 seems like a reasonable # to choose:
    */
   const unsigned sfu_delay = 10;

   /* The blob adds 6 nops between shared producers and consumers, and before we
    * used (ss) this was sufficient in most cases.
    */
   const unsigned shared_delay = 6;

   return (is_sfu(instr) || is_local_mem_load(instr)) ? sfu_delay
                                                      : shared_delay;
}
2109 
2110 static inline bool
is_sy_producer(struct ir3_instruction * instr)2111 is_sy_producer(struct ir3_instruction *instr)
2112 {
2113    return is_tex_or_prefetch(instr) ||
2114       (is_load(instr) && !is_local_mem_load(instr)) ||
2115       is_atomic(instr->opc);
2116 }
2117 
/* Estimated (sy) latency (in cycles) for instr's result: */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      /* Measured per-component latencies for texture fetches: */
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
2171 
2172 /* Some instructions don't immediately consume their sources so may introduce a
2173  * WAR hazard.
2174  */
2175 static inline bool
is_war_hazard_producer(struct ir3_instruction * instr)2176 is_war_hazard_producer(struct ir3_instruction *instr)
2177 {
2178    return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
2179           instr->opc == OPC_STC;
2180 }
2181 
2182 bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2183 bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2184 bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);
2185 
2186 /* unreachable block elimination: */
2187 bool ir3_remove_unreachable(struct ir3 *ir);
2188 
2189 /* calculate reconvergence information: */
2190 void ir3_calc_reconvergence(struct ir3_shader_variant *so);
2191 
2192 /* lower invalid shared phis after calculating reconvergence information: */
2193 bool ir3_lower_shared_phis(struct ir3 *ir);
2194 
2195 /* dead code elimination: */
2196 struct ir3_shader_variant;
2197 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
2198 
2199 /* fp16 conversion folding */
2200 bool ir3_cf(struct ir3 *ir);
2201 
2202 /* shared mov folding */
2203 bool ir3_shared_fold(struct ir3 *ir);
2204 
2205 /* copy-propagate: */
2206 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
2207 
2208 /* common subexpression elimination: */
2209 bool ir3_cse(struct ir3 *ir);
2210 
2211 /* Make arrays SSA */
2212 bool ir3_array_to_ssa(struct ir3 *ir);
2213 
2214 /* scheduling: */
2215 bool ir3_sched_add_deps(struct ir3 *ir);
2216 int ir3_sched(struct ir3 *ir);
2217 
2218 struct ir3_context;
2219 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
2220 
2221 /* register assignment: */
2222 int ir3_ra(struct ir3_shader_variant *v);
2223 void ir3_ra_predicates(struct ir3_shader_variant *v);
2224 
2225 /* lower subgroup ops: */
2226 bool ir3_lower_subgroups(struct ir3 *ir);
2227 
2228 /* legalize: */
2229 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
2230 bool ir3_legalize_relative(struct ir3 *ir);
2231 
/* Does this shader contain instructions with latency worth hiding by
 * co-scheduling other work?
 */
static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
   /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
    * know the nature of the fragment shader.  Just assume it will have
    * latency to hide:
    */
   if (ir->type != MESA_SHADER_FRAGMENT)
      return true;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_tex_or_prefetch(instr))
            return true;

         if (is_load(instr)) {
            switch (instr->opc) {
            case OPC_LDLV:
            case OPC_LDL:
            case OPC_LDLW:
               /* local/shared memory loads are (ss), not (sy), producers and
                * are not counted as latency to hide:
                */
               break;
            default:
               return true;
            }
         }
      }
   }

   return false;
}
2262 
/**
 * Move 'instr' to after the last phi node at the beginning of the block:
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *last_phi = ir3_block_get_last_phi(block);
   if (!last_phi)
      ir3_instr_move_before_block(instr, block);
   else
      ir3_instr_move_after(instr, last_phi);
}
2276 
2277 static inline struct ir3_cursor
ir3_before_block(struct ir3_block * block)2278 ir3_before_block(struct ir3_block *block)
2279 {
2280    assert(block);
2281    struct ir3_cursor cursor;
2282    cursor.option = IR3_CURSOR_BEFORE_BLOCK;
2283    cursor.block = block;
2284    return cursor;
2285 }
2286 
2287 static inline struct ir3_cursor
ir3_after_block(struct ir3_block * block)2288 ir3_after_block(struct ir3_block *block)
2289 {
2290    assert(block);
2291    struct ir3_cursor cursor;
2292    cursor.option = IR3_CURSOR_AFTER_BLOCK;
2293    cursor.block = block;
2294    return cursor;
2295 }
2296 
2297 static inline struct ir3_cursor
ir3_before_instr(struct ir3_instruction * instr)2298 ir3_before_instr(struct ir3_instruction *instr)
2299 {
2300    assert(instr);
2301    struct ir3_cursor cursor;
2302    cursor.option = IR3_CURSOR_BEFORE_INSTR;
2303    cursor.instr = instr;
2304    return cursor;
2305 }
2306 
2307 static inline struct ir3_cursor
ir3_after_instr(struct ir3_instruction * instr)2308 ir3_after_instr(struct ir3_instruction *instr)
2309 {
2310    assert(instr);
2311    struct ir3_cursor cursor;
2312    cursor.option = IR3_CURSOR_AFTER_INSTR;
2313    cursor.instr = instr;
2314    return cursor;
2315 }
2316 
2317 static inline struct ir3_cursor
ir3_before_terminator(struct ir3_block * block)2318 ir3_before_terminator(struct ir3_block *block)
2319 {
2320    assert(block);
2321    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
2322 
2323    if (terminator)
2324       return ir3_before_instr(terminator);
2325    return ir3_after_block(block);
2326 }
2327 
/* Cursor just after the block's leading run of phi nodes (before the first
 * non-phi instruction, or at the end of the block if all are phis).
 */
static inline struct ir3_cursor
ir3_after_phis(struct ir3_block *block)
{
   assert(block);

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI)
         return ir3_before_instr(instr);
   }

   return ir3_after_block(block);
}
2340 
2341 static inline struct ir3_cursor
ir3_after_instr_and_phis(struct ir3_instruction * instr)2342 ir3_after_instr_and_phis(struct ir3_instruction *instr)
2343 {
2344    if (instr->opc == OPC_META_PHI) {
2345       return ir3_after_phis(instr->block);
2346    } else {
2347       return ir3_after_instr(instr);
2348    }
2349 }
2350 
2351 static inline struct ir3_builder
ir3_builder_at(struct ir3_cursor cursor)2352 ir3_builder_at(struct ir3_cursor cursor)
2353 {
2354    struct ir3_builder builder;
2355    builder.cursor = cursor;
2356    return builder;
2357 }
2358 
2359 
2360 /* ************************************************************************* */
2361 /* instruction helpers */
2362 
/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   /* Inherit precision and shared-ness from the producer's dst: */
   flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}
2375 
/* creates an SSA dst for instr, linked back to its defining instruction: */
static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}
2383 
2384 static BITMASK_ENUM(ir3_register_flags)
type_flags(type_t type)2385 type_flags(type_t type)
2386 {
2387    if (type_size(type) < 32)
2388       return IR3_REG_HALF;
2389    return (ir3_register_flags)0;
2390 }
2391 
/* Create a mov loading immediate 'val' with the given type, optionally into a
 * shared register.
 */
static inline struct ir3_instruction *
create_immed_typed_shared(struct ir3_builder *build, uint32_t val, type_t type,
                          bool shared)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}
2407 
/* Create a mov loading immediate 'val' with the given type: */
static inline struct ir3_instruction *
create_immed_typed(struct ir3_builder *build, uint32_t val, type_t type)
{
   return create_immed_typed_shared(build, val, type, false);
}

/* Create a mov loading a u32 immediate, optionally into a shared register: */
static inline struct ir3_instruction *
create_immed_shared(struct ir3_builder *build, uint32_t val, bool shared)
{
   return create_immed_typed_shared(build, val, TYPE_U32, shared);
}

/* Create a mov loading a u32 immediate: */
static inline struct ir3_instruction *
create_immed(struct ir3_builder *build, uint32_t val)
{
   return create_immed_shared(build, val, false);
}
2425 
/* Create a mov reading const (uniform) register 'n' with the given type: */
static inline struct ir3_instruction *
create_uniform_typed(struct ir3_builder *build, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

/* Create a mov reading const (uniform) register 'n' as f32: */
static inline struct ir3_instruction *
create_uniform(struct ir3_builder *build, unsigned n)
{
   return create_uniform_typed(build, n, TYPE_F32);
}
2446 
/* Create a mov reading a const (uniform) register at offset 'n' relative to
 * the given address register instruction:
 */
static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_builder *build, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}
2463 
/* Create a same-type mov of 'src', preserving array-ness and shared-ness: */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_builder *build, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   ir3_register_flags flags = type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      /* Carry the array id/offset through for array registers: */
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, 0);
   }
   /* Relative (address-register indexed) sources are not expected here: */
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
2482 
/* Repeat-group variant of ir3_MOV: create nrpt movs and merge them into one
 * repeat group.
 */
static inline struct ir3_instruction_rpt
ir3_MOV_rpt(struct ir3_builder *build, unsigned nrpt,
            struct ir3_instruction_rpt src, type_t type)
{
   struct ir3_instruction_rpt dst;
   assert(nrpt <= ARRAY_SIZE(dst.rpts));

   for (unsigned rpt = 0; rpt < nrpt; ++rpt)
      dst.rpts[rpt] = ir3_MOV(build, src.rpts[rpt], type);

   ir3_instr_create_rpt(dst.rpts, nrpt);
   return dst;
}
2496 
/* Create a conversion mov (cov) from src_type to dst_type: */
static inline struct ir3_instruction *
ir3_COV(struct ir3_builder *build, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   ir3_register_flags dst_flags = type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
   ASSERTED ir3_register_flags src_flags = type_flags(src_type);

   /* src's precision must match the conversion's source type: */
   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   /* Array registers are not expected here: */
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
2514 
2515 static inline struct ir3_instruction_rpt
ir3_COV_rpt(struct ir3_builder * build,unsigned nrpt,struct ir3_instruction_rpt src,type_t src_type,type_t dst_type)2516 ir3_COV_rpt(struct ir3_builder *build, unsigned nrpt,
2517             struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
2518 {
2519    struct ir3_instruction_rpt dst;
2520 
2521    for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2522       dst.rpts[rpt] = ir3_COV(build, src.rpts[rpt], src_type, dst_type);
2523 
2524    ir3_instr_create_rpt(dst.rpts, nrpt);
2525    return dst;
2526 }
2527 
/* Create a movmsk with a 'components'-wide shared destination: */
static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_builder *build, unsigned components)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   instr->repeat = components - 1;
   return instr;
}
2539 
/* Create a ballot macro instruction with a 'components'-wide shared dst: */
static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_builder *build, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_build_instr(build, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}
2555 
/* Generators for typed builder helpers ir3_<NAME>(build, srcs...), by source
 * count. The *_rpt variants build one instruction per repeat and merge them
 * into a repeat group; 'scalar_alu' makes the dst inherit IR3_REG_SHARED from
 * the src(s).
 */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_builder *build)    \
{                                                                              \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 0);          \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0((ir3_instruction_flags)0, name, OPC_##name)

/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags)      \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 1);                               \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0;  \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++)                                   \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags);                  \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}

/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
#define INSTR1(name)      __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR1S(name)     __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2600 
2601 /* clang-format off */
/* Generate a 2-src instruction builder ir3_<name>() plus a repeated variant
 * ir3_<name>_rpt() that builds nrpt copies and groups them with
 * ir3_instr_create_rpt().
 */
#define __INSTR2(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *insn = ir3_build_instr(build, opc, dst_count, 2);   \
   /* scalar-ALU: dst is shared only when every src is shared */               \
   unsigned shared = 0;                                                        \
   if (scalar_alu)                                                             \
      shared = a->dsts[0]->flags & b->dsts[0]->flags & IR3_REG_SHARED;         \
   for (unsigned d = 0; d < dst_count; d++)                                    \
      __ssa_dst(insn)->flags |= shared;                                        \
   __ssa_src(insn, a, aflags);                                                 \
   __ssa_src(insn, b, bflags);                                                 \
   insn->flags |= flag;                                                        \
   return insn;                                                                \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags,                              \
   struct ir3_instruction_rpt b, unsigned bflags)                              \
{                                                                              \
   struct ir3_instruction_rpt out;                                             \
   assert(nrpt <= ARRAY_SIZE(out.rpts));                                       \
   for (unsigned r = 0; r < nrpt; r++)                                         \
      out.rpts[r] = ir3_##name(build, a.rpts[r], aflags, b.rpts[r], bflags);   \
   ir3_instr_create_rpt(out.rpts, nrpt);                                       \
   return out;                                                                 \
}
2631 /* clang-format on */
/* INSTR2F: builder named ir3_<name>_<f> with IR3_INSTR_<f> OR'd into flags. */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name,  \
                                    false)
/* INSTR2: plain 2-src, 1-dst builder. */
#define INSTR2(name)       __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
/* INSTR2S: scalar-ALU variant -- dst inherits IR3_REG_SHARED when all srcs
 * are shared.
 */
#define INSTR2S(name)      __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
/* INSTR2NODST: 2 srcs, no dst created (dst_count=0). */
#define INSTR2NODST(name)  __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2637 
2638 /* clang-format off */
/* Generate a 3-src instruction builder ir3_<name>() plus a repeated variant
 * ir3_<name>_rpt() that builds nrpt copies and groups them with
 * ir3_instr_create_rpt().
 */
#define __INSTR3(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *insn = ir3_build_instr(build, opc, dst_count, 3);   \
   /* scalar-ALU: dst is shared only when every src is shared */               \
   unsigned shared = 0;                                                        \
   if (scalar_alu)                                                             \
      shared = a->dsts[0]->flags & b->dsts[0]->flags &                         \
               c->dsts[0]->flags & IR3_REG_SHARED;                             \
   for (unsigned d = 0; d < dst_count; d++)                                    \
      __ssa_dst(insn)->flags |= shared;                                        \
   __ssa_src(insn, a, aflags);                                                 \
   __ssa_src(insn, b, bflags);                                                 \
   __ssa_src(insn, c, cflags);                                                 \
   insn->flags |= flag;                                                        \
   return insn;                                                                \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags,                              \
   struct ir3_instruction_rpt b, unsigned bflags,                              \
   struct ir3_instruction_rpt c, unsigned cflags)                              \
{                                                                              \
   struct ir3_instruction_rpt out;                                             \
   assert(nrpt <= ARRAY_SIZE(out.rpts));                                       \
   for (unsigned r = 0; r < nrpt; r++)                                         \
      out.rpts[r] = ir3_##name(build, a.rpts[r], aflags, b.rpts[r], bflags,    \
                               c.rpts[r], cflags);                             \
   ir3_instr_create_rpt(out.rpts, nrpt);                                       \
   return out;                                                                 \
}
2673 /* clang-format on */
/* INSTR3F: builder named ir3_<name>_<f> with IR3_INSTR_<f> OR'd into flags. */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
/* INSTR3: plain 3-src, 1-dst builder. */
#define INSTR3(name)      __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
/* INSTR3S: scalar-ALU variant -- dst inherits IR3_REG_SHARED when all srcs
 * are shared.
 */
#define INSTR3S(name)     __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
/* INSTR3NODST: 3 srcs, no dst created (dst_count=0). */
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2679 
2680 /* clang-format off */
/* Generate a 4-src instruction builder ir3_<name>(). Unlike __INSTR2/3 there
 * is no scalar-ALU or _rpt variant.
 */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *insn =                                              \
      ir3_build_instr(build, opc, dst_count, 4);                               \
   for (unsigned n = 0; n < dst_count; n++)                                    \
      __ssa_dst(insn);                                                         \
   __ssa_src(insn, a, aflags);                                                 \
   __ssa_src(insn, b, bflags);                                                 \
   __ssa_src(insn, c, cflags);                                                 \
   __ssa_src(insn, d, dflags);                                                 \
   insn->flags |= flag;                                                        \
   return insn;                                                                \
}
2698 /* clang-format on */
/* INSTR4F: builder named ir3_<name>_<f> with IR3_INSTR_<f> OR'd into flags. */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
/* INSTR4: plain 4-src, 1-dst builder. */
#define INSTR4(name)      __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
/* INSTR4NODST: 4 srcs, no dst created (dst_count=0). */
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2702 
2703 /* clang-format off */
/* Generate a 5-src instruction builder ir3_<name>(). Always creates exactly
 * one dst (no dst_count parameter, unlike __INSTR4/6).
 */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *insn = ir3_build_instr(build, opc, 1, 5);           \
   __ssa_dst(insn);                                                            \
   __ssa_src(insn, a, aflags);                                                 \
   __ssa_src(insn, b, bflags);                                                 \
   __ssa_src(insn, c, cflags);                                                 \
   __ssa_src(insn, d, dflags);                                                 \
   __ssa_src(insn, e, eflags);                                                 \
   insn->flags |= flag;                                                        \
   return insn;                                                                \
}
2721 /* clang-format on */
/* INSTR5F: builder named ir3_<name>_<f> with IR3_INSTR_<f> OR'd into flags. */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
/* INSTR5: plain 5-src, 1-dst builder. */
#define INSTR5(name)     __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2724 
2725 /* clang-format off */
/* Generate a 6-src instruction builder ir3_<name>().
 *
 * Fix: pass dst_count through to ir3_build_instr() (matching __INSTR4)
 * instead of a hard-coded 1, which over-allocated an unused dst slot for the
 * INSTR6NODST variants (dst_count=0) -- the loop below only ever created
 * dst_count dsts.
 */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 6);                               \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
2746 /* clang-format on */
/* INSTR6F: builder named ir3_<name>_<f> with IR3_INSTR_<f> OR'd into flags. */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
/* INSTR6: plain 6-src, 1-dst builder. */
#define INSTR6(name)      __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
/* INSTR6NODST: 6 srcs, no dst created (dst_count=0). */
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2750 
/* cat0 instructions: */
/* Each invocation expands to a static inline ir3_<NAME>() builder via the
 * INSTR* macros above. NODST variants take srcs but create no dst.
 */
INSTR0(NOP)
INSTR1NODST(BR)
INSTR1NODST(BALL)
INSTR1NODST(BANY)
INSTR2NODST(BRAA)
INSTR2NODST(BRAO)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR1NODST(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)
2771 
/* cat1 macros */
/* Builders for OPC_*_MACRO pseudo-instructions -- presumably expanded by a
 * later lowering pass rather than encoded directly (TODO confirm).
 */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
INSTR1(READ_GETLAST_MACRO)
2778 
2779 static inline struct ir3_instruction *
2780 ir3_ELECT_MACRO(struct ir3_builder *build)
2781 {
2782    struct ir3_instruction *instr =
2783       ir3_build_instr(build, OPC_ELECT_MACRO, 1, 0);
2784    __ssa_dst(instr);
2785    return instr;
2786 }
2787 
2788 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_builder * build)2789 ir3_SHPS_MACRO(struct ir3_builder *build)
2790 {
2791    struct ir3_instruction *instr = ir3_build_instr(build, OPC_SHPS_MACRO, 1, 0);
2792    __ssa_dst(instr);
2793    return instr;
2794 }
2795 
/* cat2 instructions, most 2 src but some 1 src: */
/* The "S" variants are generated with scalar_alu=true, so the dst inherits
 * IR3_REG_SHARED when all srcs are shared.
 */
INSTR2S(ADD_F)
INSTR2S(MIN_F)
INSTR2S(MAX_F)
INSTR2S(MUL_F)
INSTR1S(SIGN_F)
INSTR2S(CMPS_F)
INSTR1S(ABSNEG_F)
INSTR2S(CMPV_F)
INSTR1S(FLOOR_F)
INSTR1S(CEIL_F)
INSTR1S(RNDNE_F)
INSTR1S(RNDAZ_F)
INSTR1S(TRUNC_F)
INSTR2S(ADD_U)
INSTR2S(ADD_S)
INSTR2S(SUB_U)
INSTR2S(SUB_S)
INSTR2S(CMPS_U)
INSTR2S(CMPS_S)
INSTR2S(MIN_U)
INSTR2S(MIN_S)
INSTR2S(MAX_U)
INSTR2S(MAX_S)
INSTR1S(ABSNEG_S)
INSTR2S(AND_B)
INSTR2S(OR_B)
INSTR1S(NOT_B)
INSTR2S(XOR_B)
INSTR2S(CMPV_U)
INSTR2S(CMPV_S)
INSTR2S(MUL_U24)
INSTR2S(MUL_S24)
INSTR2S(MULL_U)
INSTR1S(BFREV_B)
INSTR1S(CLZ_S)
INSTR1S(CLZ_B)
INSTR2S(SHL_B)
INSTR2S(SHR_B)
INSTR2S(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2S(MGEN_B)
INSTR2S(GETBIT_B)
INSTR1(SETRM)
INSTR1S(CBITS_B)
INSTR2S(SHB)
INSTR2S(MSAD)
2844 
/* cat3 instructions: */
/* 3-src ops (multiply-add, select, shift-merge families). */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3S(SEL_B16)
INSTR3S(SEL_B32)
INSTR3S(SEL_S16)
INSTR3S(SEL_S32)
INSTR3S(SEL_F16)
INSTR3S(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)
INSTR3S(SHRM)
INSTR3S(SHLM)
INSTR3S(SHRG)
INSTR3S(SHLG)
INSTR3S(ANDG)
2870 
/* cat4 instructions: */
/* Single-src special-function ops; all use the scalar-ALU (S) builders. */
INSTR1S(RCP)
INSTR1S(RSQ)
INSTR1S(HRSQ)
INSTR1S(LOG2)
INSTR1S(HLOG2)
INSTR1S(EXP2)
INSTR1S(HEXP2)
INSTR1S(SIN)
INSTR1S(COS)
INSTR1S(SQRT)
2882 
/* cat5 instructions: */
/* Derivative ops; the 3D (F-suffixed) variants set IR3_INSTR_3D. */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2891 
2892 static inline struct ir3_instruction *
2893 ir3_SAM(struct ir3_builder *build, opc_t opc, type_t type, unsigned wrmask,
2894         ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
2895         struct ir3_instruction *src0, struct ir3_instruction *src1)
2896 {
2897    struct ir3_instruction *sam;
2898    unsigned nreg = 0;
2899 
2900    if (flags & IR3_INSTR_S2EN) {
2901       nreg++;
2902    }
2903    if (src0 || opc == OPC_SAM) {
2904       nreg++;
2905    }
2906    if (src1) {
2907       nreg++;
2908    }
2909 
2910    sam = ir3_build_instr(build, opc, 1, nreg);
2911    sam->flags |= flags;
2912    __ssa_dst(sam)->wrmask = wrmask;
2913    if (flags & IR3_INSTR_S2EN) {
2914       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2915    }
2916    if (src0) {
2917       __ssa_src(sam, src0, 0);
2918    } else if (opc == OPC_SAM) {
2919       /* Create a dummy shared source for the coordinate, for the prefetch
2920        * case. It needs to be shared so that we don't accidentally disable early
2921        * preamble, and this is what the blob does.
2922        */
2923       ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
2924    }
2925    if (src1) {
2926       __ssa_src(sam, src1, 0);
2927    }
2928    sam->cat5.type = type;
2929 
2930    return sam;
2931 }
2932 
2933 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
2934  * value or is set to ry. In order to model this in SSA form, we add an extra
2935  * argument (the initial value of rx) and tie it to the destination.
2936  */
2937 static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_builder * build,unsigned cluster_size,struct ir3_instruction * src,struct ir3_instruction * dst_default)2938 ir3_BRCST_ACTIVE(struct ir3_builder *build, unsigned cluster_size,
2939                  struct ir3_instruction *src,
2940                  struct ir3_instruction *dst_default)
2941 {
2942    struct ir3_instruction *brcst =
2943       ir3_build_instr(build, OPC_BRCST_ACTIVE, 1, 2);
2944    brcst->cat5.cluster_size = cluster_size;
2945    brcst->cat5.type = TYPE_U32;
2946    struct ir3_register *brcst_dst = __ssa_dst(brcst);
2947    __ssa_src(brcst, src, 0);
2948    struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
2949    ir3_reg_tie(brcst_dst, default_src);
2950    return brcst;
2951 }
2952 
2953 /* cat6 instructions: */
2954 INSTR0(GETFIBERID)
2955 INSTR2(LDLV)
2956 INSTR3(LDG)
2957 INSTR3(LDL)
2958 INSTR3(LDLW)
2959 INSTR3(LDP)
2960 INSTR4NODST(STG)
2961 INSTR3NODST(STL)
2962 INSTR3NODST(STLW)
2963 INSTR3NODST(STP)
2964 INSTR1(RESINFO)
2965 INSTR1(RESFMT)
2966 INSTR2(ATOMIC_ADD)
2967 INSTR2(ATOMIC_SUB)
2968 INSTR2(ATOMIC_XCHG)
2969 INSTR2(ATOMIC_INC)
2970 INSTR2(ATOMIC_DEC)
2971 INSTR2(ATOMIC_CMPXCHG)
2972 INSTR2(ATOMIC_MIN)
2973 INSTR2(ATOMIC_MAX)
2974 INSTR2(ATOMIC_AND)
2975 INSTR2(ATOMIC_OR)
2976 INSTR2(ATOMIC_XOR)
2977 INSTR2(LDC)
2978 INSTR2(QUAD_SHUFFLE_BRCST)
2979 INSTR1(QUAD_SHUFFLE_HORIZ)
2980 INSTR1(QUAD_SHUFFLE_VERT)
2981 INSTR1(QUAD_SHUFFLE_DIAG)
2982 INSTR2NODST(LDC_K)
2983 INSTR2NODST(STC)
2984 INSTR2NODST(STSC)
2985 INSTR2(SHFL)
2986 #ifndef GPU
2987 #elif GPU >= 600
2988 INSTR4NODST(STIB);
2989 INSTR3(LDIB);
2990 INSTR5(LDG_A);
2991 INSTR6NODST(STG_A);
2992 INSTR2(ATOMIC_G_ADD)
2993 INSTR2(ATOMIC_G_SUB)
2994 INSTR2(ATOMIC_G_XCHG)
2995 INSTR2(ATOMIC_G_INC)
2996 INSTR2(ATOMIC_G_DEC)
2997 INSTR2(ATOMIC_G_CMPXCHG)
2998 INSTR2(ATOMIC_G_MIN)
2999 INSTR2(ATOMIC_G_MAX)
3000 INSTR2(ATOMIC_G_AND)
3001 INSTR2(ATOMIC_G_OR)
3002 INSTR2(ATOMIC_G_XOR)
3003 INSTR3(ATOMIC_B_ADD)
3004 INSTR3(ATOMIC_B_SUB)
3005 INSTR3(ATOMIC_B_XCHG)
3006 INSTR3(ATOMIC_B_INC)
3007 INSTR3(ATOMIC_B_DEC)
3008 INSTR3(ATOMIC_B_CMPXCHG)
3009 INSTR3(ATOMIC_B_MIN)
3010 INSTR3(ATOMIC_B_MAX)
3011 INSTR3(ATOMIC_B_AND)
3012 INSTR3(ATOMIC_B_OR)
3013 INSTR3(ATOMIC_B_XOR)
3014 #elif GPU >= 400
3015 INSTR3(LDGB)
3016 #if GPU >= 500
3017 INSTR3(LDIB)
3018 #endif
3019 INSTR4NODST(STGB)
3020 INSTR4NODST(STIB)
3021 INSTR4(ATOMIC_S_ADD)
3022 INSTR4(ATOMIC_S_SUB)
3023 INSTR4(ATOMIC_S_XCHG)
3024 INSTR4(ATOMIC_S_INC)
3025 INSTR4(ATOMIC_S_DEC)
3026 INSTR4(ATOMIC_S_CMPXCHG)
3027 INSTR4(ATOMIC_S_MIN)
3028 INSTR4(ATOMIC_S_MAX)
3029 INSTR4(ATOMIC_S_AND)
3030 INSTR4(ATOMIC_S_OR)
3031 INSTR4(ATOMIC_S_XOR)
3032 #endif
3033 INSTR4NODST(LDG_K)
3034 
/* cat7 instructions: */
/* Barrier/fence/cache ops: no srcs, no computed dst beyond the builder's
 * default.
 */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)
3039 
3040 /* ************************************************************************* */
3041 #include "util/bitset.h"
3042 
#define MAX_REG 256

/* Per-register-file bitsets. The full/shared/nongpr sets are sized 2x their
 * file -- presumably one bit per half-reg slot so half/full overlap can be
 * tracked; TODO confirm against ir3_reg_file_offset().
 */
typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);

/* Set of "in use" registers, one bitset per register file. */
typedef struct {
   /* register-file layout flag; passed through to ir3_reg_file_offset() and
    * asserted to match between operands in regmask_or().
    */
   bool mergedregs;
   fullstate_t full;
   halfstate_t half;
   sharedstate_t shared;
   nongprstate_t nongpr;
} regmask_t;
3057 
3058 static inline BITSET_WORD *
__regmask_file(regmask_t * regmask,enum ir3_reg_file file)3059 __regmask_file(regmask_t *regmask, enum ir3_reg_file file)
3060 {
3061    switch (file) {
3062    case IR3_FILE_FULL:
3063       return regmask->full;
3064    case IR3_FILE_HALF:
3065       return regmask->half;
3066    case IR3_FILE_SHARED:
3067       return regmask->shared;
3068    case IR3_FILE_NONGPR:
3069       return regmask->nongpr;
3070    }
3071    unreachable("bad file");
3072 }
3073 
3074 static inline bool
__regmask_get(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3075 __regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3076 {
3077    BITSET_WORD *regs = __regmask_file(regmask, file);
3078    for (unsigned i = 0; i < size; i++) {
3079       if (BITSET_TEST(regs, n + i))
3080          return true;
3081    }
3082    return false;
3083 }
3084 
3085 static inline void
__regmask_set(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3086 __regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3087 {
3088    BITSET_WORD *regs = __regmask_file(regmask, file);
3089    for (unsigned i = 0; i < size; i++)
3090       BITSET_SET(regs, n + i);
3091 }
3092 
3093 static inline void
__regmask_clear(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3094 __regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3095 {
3096    BITSET_WORD *regs = __regmask_file(regmask, file);
3097    for (unsigned i = 0; i < size; i++)
3098       BITSET_CLEAR(regs, n + i);
3099 }
3100 
3101 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)3102 regmask_init(regmask_t *regmask, bool mergedregs)
3103 {
3104    memset(regmask, 0, sizeof(*regmask));
3105    regmask->mergedregs = mergedregs;
3106 }
3107 
3108 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)3109 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
3110 {
3111    assert(dst->mergedregs == a->mergedregs);
3112    assert(dst->mergedregs == b->mergedregs);
3113 
3114    for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
3115       dst->full[i] = a->full[i] | b->full[i];
3116    for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
3117       dst->half[i] = a->half[i] | b->half[i];
3118    for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3119       dst->shared[i] = a->shared[i] | b->shared[i];
3120    for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
3121       dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
3122 }
3123 
3124 static inline void
regmask_or_shared(regmask_t * dst,regmask_t * a,regmask_t * b)3125 regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
3126 {
3127    for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3128       dst->shared[i] = a->shared[i] | b->shared[i];
3129 }
3130 
3131 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)3132 regmask_set(regmask_t *regmask, struct ir3_register *reg)
3133 {
3134    unsigned size = reg_elem_size(reg);
3135    enum ir3_reg_file file;
3136    unsigned num = post_ra_reg_num(reg);
3137    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3138    if (reg->flags & IR3_REG_RELATIV) {
3139       __regmask_set(regmask, file, n, size * reg->size);
3140    } else {
3141       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3142          if (mask & 1)
3143             __regmask_set(regmask, file, n, size);
3144    }
3145 }
3146 
3147 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)3148 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
3149 {
3150    unsigned size = reg_elem_size(reg);
3151    enum ir3_reg_file file;
3152    unsigned num = post_ra_reg_num(reg);
3153    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3154    if (reg->flags & IR3_REG_RELATIV) {
3155       __regmask_clear(regmask, file, n, size * reg->size);
3156    } else {
3157       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3158          if (mask & 1)
3159             __regmask_clear(regmask, file, n, size);
3160    }
3161 }
3162 
3163 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)3164 regmask_get(regmask_t *regmask, struct ir3_register *reg)
3165 {
3166    unsigned size = reg_elem_size(reg);
3167    enum ir3_reg_file file;
3168    unsigned num = post_ra_reg_num(reg);
3169    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3170    if (reg->flags & IR3_REG_RELATIV) {
3171       return __regmask_get(regmask, file, n, size * reg->size);
3172    } else {
3173       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3174          if (mask & 1)
3175             if (__regmask_get(regmask, file, n, size))
3176                return true;
3177    }
3178    return false;
3179 }
3180 /* ************************************************************************* */
3181 
3182 #endif /* IR3_H_ */
3183