• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2013 Rob Clark <robdclark@gmail.com>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #ifndef IR3_H_
7 #define IR3_H_
8 
9 #include <stdbool.h>
10 #include <stdint.h>
11 
12 #include "compiler/shader_enums.h"
13 
14 #include "util/bitscan.h"
15 #include "util/list.h"
16 #include "util/set.h"
17 #include "util/u_debug.h"
18 
19 #include "freedreno_common.h"
20 
21 #include "instr-a3xx.h"
22 
23 /* low level intermediate representation of an adreno shader program */
24 
25 struct ir3_compiler;
26 struct ir3;
27 struct ir3_instruction;
28 struct ir3_block;
29 
/* Post-assembly statistics and metadata describing a compiled shader
 * binary (sizes, register usage, instruction counts, sync-flag counts).
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t preamble_instrs_count;
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;
   bool uses_ray_intersection;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
79 
/* A set of SSA registers that register allocation tries to place in one
 * contiguous run of physical registers (used for collects/splits).
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* regs[0..regs_count) are the member registers of this set. */
   unsigned regs_count;
   struct ir3_register **regs;
};
91 
/* Per-register modifier and bookkeeping flags.  The low bits mirror the
 * hardware encoding (const/immed/half/etc); the higher bits are IR-level
 * metadata used by optimization and register-allocation passes.
 */
typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer.  The CP pass needs to know what is intended (int or
    * float) in order to do the right thing.  For this reason the
    * abs/neg flags are split out into float and int variants.  In
    * addition, .b (bitwise) operations, the negate is actually a
    * bitwise not, so split that out into a new flag to make it
    * more clear.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input?  Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),

   /* Render target dst. Only used by alias.rt. */
   IR3_REG_RT = BIT(20),

   /* Register that is initialized using alias.tex (or will be once the
    * alias.tex instructions are inserted). Before alias.tex is inserted, alias
    * registers may contain things that are normally not allowed by the owning
    * instruction (e.g., consts or immediates) because they will be replaced by
    * GPRs later.
    * Note that if wrmask > 1, this will be set if any of the registers is an
    * alias, even though not all of them may be. We currently have no way to
    * tell which ones are actual aliases.
    */
   IR3_REG_ALIAS = BIT(21),

   /* Alias registers allow us to allocate non-consecutive registers and remap
    * them to consecutive ones using alias.tex. We implement this by adding the
    * sources of collects directly to the sources of their users. This way, RA
    * treats them as scalar registers and we can remap them to consecutive
    * registers afterwards. This flag is used to keep track of the scalar
    * sources that should be remapped together. Every source of such an "alias
    * group" will have the IR3_REG_ALIAS set, while the first one will also have
    * IR3_REG_FIRST_ALIAS set.
    */
   IR3_REG_FIRST_ALIAS = BIT(22),
} ir3_register_flags;
185 
/* A single source or destination operand of an ir3_instruction.  The same
 * struct represents immediates, consts, GPRs, and relative/array accesses,
 * distinguished by 'flags'.
 */
struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   /* Payload interpretation depends on flags (IMMED vs RELATIV): */
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* Spilling/liveness bookkeeping used by RA: */
   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
251 
/*
 * Stupid/simple growable array implementation:
 */

/* Declares the triplet of fields (count, allocated size, data pointer)
 * backing a growable array named 'name'.
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Append a value to a DECLARE_ARRAY array, growing it (ralloc'd under
 * 'ctx') as needed.  __VA_ARGS__ is used so the value may contain commas
 * (e.g. a compound literal).
 * NOTE: 'arr' is expanded multiple times — pass a plain lvalue, not an
 * expression with side effects.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
267 
/* Reduction operations (suffixes: _U unsigned int, _S signed int,
 * _F float, _B bitwise).
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
283 
/* Scope selector for alias instructions (tex/rt/mem variants).  The
 * explicit values match the hardware encoding — do not reorder.
 */
typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 1,
   ALIAS_MEM = 2,
} ir3_alias_scope;
289 
/* Subgroup shuffle modes (cat6 shfl).  Explicit values match the hardware
 * encoding; note 0, 4 and 5 are intentionally unused.
 */
typedef enum {
   SHFL_XOR = 1,
   SHFL_UP = 2,
   SHFL_DOWN = 3,
   SHFL_RUP = 6,
   SHFL_RDOWN = 7,
} ir3_shfl_mode;
297 
/* Per-instruction flags.  Low bits correspond to hardware modifier bits
 * ((sy), (ss), (jp), ...); higher bits are IR-level metadata.
 */
typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy.  So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload instructions
    * to inform validation that this is created by RA. This also may be set on
    * an instruction where a spill has been folded into it.
    *
    * NOTE: intentionally shares the bit with IR3_INSTR_MARK.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register to a
    * shared register that may be read by other invocations. This factors into
    * (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one of
    * their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;
390 
/* A single IR instruction.  The anonymous union holds per-category (cat0..
 * cat7) or per-meta-instruction payload; which member is valid is implied
 * by 'opc'.
 */
struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat;
   uint8_t nop;
#if MESA_DEBUG
   /* Allocated capacity of srcs/dsts, for bounds assertions: */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
         ir3_shfl_mode shfl_mode : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
         unsigned alias_table_size_minus_one;
         bool alias_type_float;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions are
    * emitted as multiple scalar instructions that are linked together using
    * this field. After RA, the ir3_combine_rpt pass iterates these groups and,
    * if the register assignment allows it, merges them into a (rptN)
    * instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The list
    * head is stored in the first instruction of the repeat group so also refers
    * to a list entry. In order to distinguish the list's first entry, we use
    * serialno: instructions in a repeat group are always emitted consecutively
    * so the first will have the lowest serialno.
    *
    * As this is not a typical list, we have to be careful with using the
    * existing list helper. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in its
    * group.
    */
   struct list_head rpt_node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
602 
/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   /* One scalar instruction per repeat; at most 4 (vec4). */
   struct ir3_instruction *rpts[4];
};
609 
/* Top-level container for one shader's IR: the block list plus various
 * side tables of instructions that later passes need to find quickly.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
660 
/* A register array accessed via relative (a0/a1) addressing. */
struct ir3_array {
   /* Entry in ir3::array_list: */
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
686 
/* Find the ir3_array with the given id in 'ir', or NULL if not present. */
struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
688 
/* A basic block: a list of instructions plus CFG edges (both the
 * per-thread and the "physical" machine-level views).
 */
struct ir3_block {
   /* Entry in ir3::block_list: */
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_block *successors[2];

   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Dominator-tree info (see ir3_calc_dominance()): */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno;
#endif
};
749 
/* Where an ir3_cursor points relative to its block or instruction. */
enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};
756 
/* An insertion point for new instructions.  'option' selects which union
 * member is valid (block for *_BLOCK, instr for *_INSTR).
 */
struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      struct ir3_block *block;
      struct ir3_instruction *instr;
   };
};
764 
/* Instruction builder state: just a cursor tracking where the next
 * instruction will be inserted.
 */
struct ir3_builder {
   struct ir3_cursor cursor;
};
768 
/* Return a stable identifier for 'block', for debug printing.  With
 * MESA_DEBUG this is the block's serial number; otherwise the pointer
 * value is truncated to 32 bits.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   /* Cast via uintptr_t rather than unsigned long: on LLP64 platforms
    * (e.g. 64-bit Windows) unsigned long is only 32 bits wide, so the
    * pointer-to-integer conversion there is implementation-specific.
    * uintptr_t is the portable round-trip type; the final truncation to
    * uint32_t is intentional.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
778 
779 static inline struct ir3_block *
ir3_start_block(struct ir3 * ir)780 ir3_start_block(struct ir3 *ir)
781 {
782    return list_first_entry(&ir->block_list, struct ir3_block, node);
783 }
784 
785 static inline struct ir3_block *
ir3_end_block(struct ir3 * ir)786 ir3_end_block(struct ir3 *ir)
787 {
788    return list_last_entry(&ir->block_list, struct ir3_block, node);
789 }
790 
/* Find the shader's end instruction. */
struct ir3_instruction *ir3_find_end(struct ir3 *ir);

/* Get a block's terminator instruction (without removing it). */
struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);

/* Remove and return a block's terminator instruction. */
struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);

struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);
801 
802 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)803 ir3_after_preamble(struct ir3 *ir)
804 {
805    struct ir3_block *block = ir3_start_block(ir);
806    /* The preamble will have a usually-empty else branch, and we want to skip
807     * that to get to the block after the preamble.
808     */
809    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
810    if (terminator && (terminator->opc == OPC_SHPS))
811       return block->successors[1]->successors[0];
812    else
813       return block;
814 }
815 
816 static inline bool
ir3_has_preamble(struct ir3 * ir)817 ir3_has_preamble(struct ir3 *ir)
818 {
819    return ir3_start_block(ir) != ir3_after_preamble(ir);
820 }
821 
struct ir3_instruction *ir3_find_shpe(struct ir3 *ir);

/* Create an empty preamble and return shpe. */
struct ir3_instruction *ir3_create_empty_preamble(struct ir3 *ir);

/* CFG edge maintenance: */
void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

/* Dominator-tree computation/queries: */
void ir3_calc_dominance(struct ir3 *ir);
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);

struct ir3_shader_variant;

/* IR lifecycle: */
struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

void ir3_collect_info(struct ir3_shader_variant *v);
void *ir3_alloc(struct ir3 *shader, int sz);

/* Occupancy (max waves) helpers: */
unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

/* Instruction construction (at a builder, cursor, or block position): */
struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
                                        int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
                                            int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
                                                opc_t opc, int ndst, int nsrc);
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);
void ir3_instr_remove(struct ir3_instruction *instr);

/* Repeat-group helpers (see ir3_instruction::rpt_node): */
void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);

/* Register construction: */
struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);
885 
/* Tie a destination register to a source register, linking the two in
 * both directions.  Both registers must currently be untied.
 */
static inline void
ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
{
   assert(!dst->tied && !src->tied);
   dst->tied = src;
   src->tied = dst;
}
893 
void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

/* Set the address-register producer used by an indirect instruction: */
void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);
900 
901 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)902 ir3_instr_check_mark(struct ir3_instruction *instr)
903 {
904    if (instr->flags & IR3_INSTR_MARK)
905       return true; /* already visited */
906    instr->flags |= IR3_INSTR_MARK;
907    return false;
908 }
909 
/* Clear IR3_INSTR_MARK on every instruction in a block / the whole shader: */
void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

/* Instruction-count variants used by different passes: */
unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_sched(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);
916 
/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   /* adding at the "tail" relative to after's node links instr
    * immediately before it:
    */
   list_addtail(&instr->node, &after->node);
}
927 
/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   /* adding at the "head" relative to before's node links instr
    * immediately after it:
    */
   list_add(&instr->node, &before->node);
}
938 
/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
949 
/* Filter callback for ir3_find_ssa_uses_for(); 'src_n' is the source slot
 * through which 'use' references the def.
 */
typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);

/* Populate instr->uses sets (allocated from mem_ctx): */
void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);

void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

/* NOTE(review): presumably returns the float-lookup-table index for an
 * immediate src, or a sentinel when it has none — confirm in ir3.c.
 */
int ir3_flut(struct ir3_register *src_reg);

/* Can src 'n' of 'instr' legally carry the given IR3_REG_* flags? */
bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

/* Can 'immed' be encoded directly as an immediate src of 'instr'? */
bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
963 
/**
 * Given an instruction whose result we want to test for nonzero, return a
 * potentially different instruction for which the result would be the same.
 * This might be one of its sources if instr doesn't change the nonzero-ness.
 */
struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);

bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);

#include "util/set.h" /* already included above; kept for self-containment */
/* Iterate the SSA uses of __instr.  Requires the uses sets built by
 * ir3_find_ssa_uses().  The outer one-trip loop scopes __use and skips
 * instructions whose uses set was never populated; the sentinel value is
 * overwritten before __use is ever dereferenced.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
980 
/* Register number (the N of rN.c); reg->num addresses individual scalar
 * components, four per register.
 */
static inline uint32_t
reg_num(const struct ir3_register *reg)
{
   return reg->num >> 2;
}
986 
/* Component within the register (the c of rN.c, 0..3 = x..w): */
static inline uint32_t
reg_comp(const struct ir3_register *reg)
{
   return reg->num & 0x3;
}
992 
/* Category 0 (flow control) instruction? */
static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}
998 
/* Is instr a block terminator, i.e. a control-flow instruction that must
 * be the last instruction in its block?
 */
static inline bool
is_terminator(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BR:
   case OPC_JUMP:
   case OPC_BANY:
   case OPC_BALL:
   case OPC_BRAA:
   case OPC_BRAO:
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
   case OPC_PREDT:
   case OPC_PREDF:
      return true;
   default:
      return false;
   }
}
1019 
1020 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)1021 is_kill_or_demote(struct ir3_instruction *instr)
1022 {
1023    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
1024 }
1025 
/* Is instr a nop? */
static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}
1031 
1032 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)1033 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
1034 {
1035    unsigned dst_type = (dst->flags & IR3_REG_HALF);
1036    unsigned src_type = (src->flags & IR3_REG_HALF);
1037 
1038    /* Treat shared->normal copies and normal->shared copies as same-type. */
1039    return dst_type == src_type;
1040 }
1041 
/* Is it a non-transformative (ie. not type changing) mov?  This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      /* a saturated absneg is value-transforming, unlike a plain one: */
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->flags & IR3_REG_PREDICATE)
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}
1088 
/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* Allow a narrowing move, but not a widening one.  A narrowing
    * move from full c1.x can be folded into a hc1.x use in an ALU
    * instruction because it is doing the same thing as constant-
    * demotion.  If CONSTANT_DEMOTION_ENABLE wasn't set, we'd need
    * return false in all cases.
    */
   if ((type_size(dst_type) > type_size(src_type)) ||
       (type_size(dst_type) == 8))
      return false;

   /* src and dst must agree in signedness/float-ness (size may differ): */
   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}
1118 
/* Is instr one of the subgroup-operation macros that get lowered to
 * conditional moves/branches later on?
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_GETLAST_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      return true;
   default:
      return false;
   }
}
1137 
1138 static inline bool
is_alu(struct ir3_instruction * instr)1139 is_alu(struct ir3_instruction *instr)
1140 {
1141    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
1142 }
1143 
/* Category 4 (SFU) instruction?  getfiberid is counted here even though
 * it is cat6-encoded (and correspondingly excluded from is_mem()).
 */
static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}
1149 
/* Category 5 (texture) instruction, excluding OPC_TCINV? */
static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
}
1155 
1156 static inline bool
is_tex_shuffle(struct ir3_instruction * instr)1157 is_tex_shuffle(struct ir3_instruction *instr)
1158 {
1159    switch (instr->opc) {
1160    case OPC_BRCST_ACTIVE:
1161    case OPC_QUAD_SHUFFLE_BRCST:
1162    case OPC_QUAD_SHUFFLE_HORIZ:
1163    case OPC_QUAD_SHUFFLE_VERT:
1164    case OPC_QUAD_SHUFFLE_DIAG:
1165       return true;
1166    default:
1167       return false;
1168    }
1169 }
1170 
/* Texture instruction or the meta tex-prefetch pseudo-instruction? */
static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}
1176 
/* Category 6 (memory) instruction?  getfiberid is cat6-encoded but is
 * treated as SFU instead (see is_sfu()).
 */
static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}
1182 
/* Category 7 (barrier/fence) instruction, excluding OPC_ALIAS? */
static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7) && instr->opc != OPC_ALIAS;
}
1188 
/* Does instr write a half (16-bit) register? */
static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}
1194 
/* Does instr write a shared register? */
static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}
1200 
/* Is instr a store?  Matters because for these the "destination"
 * register operand is really a source (the address stored to).
 */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1222 
/* Is instr a load from memory? */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDLV:
   case OPC_RAY_INTERSECTION:
      /* probably some others too.. */
      return true;
   case OPC_LDC:
      /* ldc with no dst doesn't produce a loaded value: */
      return instr->dsts_count > 0;
   default:
      return false;
   }
}
1245 
1246 static inline bool
is_input(struct ir3_instruction * instr)1247 is_input(struct ir3_instruction *instr)
1248 {
1249    /* in some cases, ldlv is used to fetch varying without
1250     * interpolation.. fortunately inloc is the first src
1251     * register in either case
1252     */
1253    switch (instr->opc) {
1254    case OPC_LDLV:
1255    case OPC_BARY_F:
1256    case OPC_FLAT_B:
1257       return true;
1258    default:
1259       return false;
1260    }
1261 }
1262 
/* Whether non-helper invocations can read the value of helper invocations. We
 * cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the getone
    * in the preamble because it doesn't actually matter which fiber is
    * selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   default:
      return false;
   }
}
1314 
1315 static inline bool
is_bool(struct ir3_instruction * instr)1316 is_bool(struct ir3_instruction *instr)
1317 {
1318    switch (instr->opc) {
1319    case OPC_CMPS_F:
1320    case OPC_CMPS_S:
1321    case OPC_CMPS_U:
1322       return true;
1323    default:
1324       return false;
1325    }
1326 }
1327 
/* Map a cat3 opcode to its half (16-bit) variant; returns the opcode
 * unchanged if no such variant exists.
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1346 
/* Inverse of cat3_half_opc(): map a cat3 opcode to its full (32-bit)
 * variant, or return it unchanged.
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1365 
/* Map a cat4 opcode to its half (16-bit) variant, or return it unchanged: */
static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}
1380 
/* Inverse of cat4_half_opc(): map to the full (32-bit) variant: */
static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}
1395 
/* Meta (OPC_META category) pseudo-instruction? */
static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == OPC_META);
}
1401 
1402 static inline unsigned
reg_elems(const struct ir3_register * reg)1403 reg_elems(const struct ir3_register *reg)
1404 {
1405    if (reg->flags & IR3_REG_ARRAY)
1406       return reg->size;
1407    else
1408       return util_last_bit(reg->wrmask);
1409 }
1410 
/* Size of one element, in units of half-registers (1 = 16-bit, 2 = 32-bit): */
static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}
1416 
/* Total size of reg in units of half-registers: */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
1422 
1423 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
1424  * and have to handle relative accesses specially.
1425  */
1426 
1427 static inline unsigned
post_ra_reg_elems(struct ir3_register * reg)1428 post_ra_reg_elems(struct ir3_register *reg)
1429 {
1430    if (reg->flags & IR3_REG_RELATIV)
1431       return reg->size;
1432    return reg_elems(reg);
1433 }
1434 
1435 static inline unsigned
post_ra_reg_num(struct ir3_register * reg)1436 post_ra_reg_num(struct ir3_register *reg)
1437 {
1438    if (reg->flags & IR3_REG_RELATIV)
1439       return reg->array.base;
1440    return reg->num;
1441 }
1442 
1443 static inline unsigned
dest_regs(struct ir3_instruction * instr)1444 dest_regs(struct ir3_instruction *instr)
1445 {
1446    if (instr->dsts_count == 0)
1447       return 0;
1448 
1449    assert(instr->dsts_count == 1);
1450    return util_last_bit(instr->dsts[0]->wrmask);
1451 }
1452 
/* Is reg a general-purpose register (as opposed to const/immed/predicate/
 * RT/address, or an unallocated non-SSA reg)?
 */
static inline bool
is_reg_gpr(const struct ir3_register *reg)
{
   if (reg->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE | IR3_REG_RT)) {
      return false;
   }
   /* address regs share the num space but aren't GPRs: */
   if (reg_num(reg) == REG_A0)
      return false;
   /* non-SSA, non-relative regs with no assigned num aren't GPRs either: */
   if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
       reg->num == INVALID_REG)
      return false;
   return true;
}
1467 
1468 static inline bool
is_reg_a0(const struct ir3_register * reg)1469 is_reg_a0(const struct ir3_register *reg)
1470 {
1471    if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1472       return false;
1473    return reg->num == regid(REG_A0, 0);
1474 }
1475 
1476 /* is dst a normal temp register: */
1477 static inline bool
is_dest_gpr(const struct ir3_register * dst)1478 is_dest_gpr(const struct ir3_register *dst)
1479 {
1480    if (dst->wrmask == 0)
1481       return false;
1482    return is_reg_gpr(dst);
1483 }
1484 
1485 static inline bool
writes_gpr(struct ir3_instruction * instr)1486 writes_gpr(struct ir3_instruction *instr)
1487 {
1488    if (dest_regs(instr) == 0)
1489       return false;
1490    return is_dest_gpr(instr->dsts[0]);
1491 }
1492 
1493 static inline bool
writes_addr0(struct ir3_instruction * instr)1494 writes_addr0(struct ir3_instruction *instr)
1495 {
1496    /* Note: only the first dest can write to a0.x */
1497    if (instr->dsts_count > 0) {
1498       struct ir3_register *dst = instr->dsts[0];
1499       return dst->num == regid(REG_A0, 0);
1500    }
1501    return false;
1502 }
1503 
1504 static inline bool
writes_addr1(struct ir3_instruction * instr)1505 writes_addr1(struct ir3_instruction *instr)
1506 {
1507    /* Note: only the first dest can write to a1.x */
1508    if (instr->dsts_count > 0) {
1509       struct ir3_register *dst = instr->dsts[0];
1510       return dst->num == regid(REG_A0, 1);
1511    }
1512    return false;
1513 }
1514 
1515 static inline bool
writes_pred(struct ir3_instruction * instr)1516 writes_pred(struct ir3_instruction *instr)
1517 {
1518    /* Note: only the first dest can write to p0 */
1519    if (instr->dsts_count > 0) {
1520       struct ir3_register *dst = instr->dsts[0];
1521       return !!(dst->flags & IR3_REG_PREDICATE);
1522    }
1523    return false;
1524 }
1525 
/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that aren't
 * assigned by RA.
 */
#define GPR_REG_SIZE (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE (4 * 8)

/* Register files used by ir3_reg_file_offset() for alias checks: */
enum ir3_reg_file {
   IR3_FILE_FULL,   /* full regs (also half regs when mergedregs) */
   IR3_FILE_HALF,   /* half regs, only when !mergedregs */
   IR3_FILE_SHARED, /* shared regs */
   IR3_FILE_NONGPR, /* a0/p0/etc., not assigned by RA */
};
1542 
/* Return a file + offset that can be used for determining if two registers
 * alias. The register is only really used for its flags, the num is taken from
 * the parameter. Registers overlap if they are in the same file and have an
 * overlapping offset. The offset is multiplied by 2 for full registers to
 * handle aliasing half and full registers, that is it's in units of half-regs.
 */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      /* with merged regs, half regs live in the full-reg file too: */
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      *file = IR3_FILE_HALF;
      return num;
   }
}
1569 
1570 /* returns defining instruction for reg */
1571 /* TODO better name */
1572 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1573 ssa(struct ir3_register *reg)
1574 {
1575    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1576       return reg->def->instr;
1577    return NULL;
1578 }
1579 
1580 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1581 conflicts(struct ir3_register *a, struct ir3_register *b)
1582 {
1583    return (a && b) && (a->def != b->def);
1584 }
1585 
1586 static inline bool
reg_is_addr1(struct ir3_register * r)1587 reg_is_addr1(struct ir3_register *r)
1588 {
1589    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1590       return false;
1591    return r->num == regid(REG_A0, 1);
1592 }
1593 
/* Map a type to its 16-bit counterpart; already-narrow types pass through: */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
   case TYPE_U8_32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1616 
/* Map a type to its 32-bit counterpart; already-wide types pass through: */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U8_32:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1638 
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1683 
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept float abs/neg modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic ops accept no modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept the bitwise-not modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1744 
/* map cat3 instructions to valid abs/neg flags (per source slot src_n): */
static inline unsigned
ir3_cat3_absneg(opc_t opc, unsigned src_n)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_SAD_S16:
   case OPC_SAD_S32:
      return src_n == 1 ? IR3_REG_SNEG : 0;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
      /* neg *may* work on 3rd src.. */
      /* fallthrough */

   case OPC_SEL_B16:
   case OPC_SEL_B32:
      /* fallthrough */

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      /* fallthrough: none of the above allow modifiers */

   default:
      return 0;
   }
}
1785 
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1853 
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* otherwise the src size follows the first source's precision: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1893 
1894 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1895 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1896 {
1897    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1898                                                  : full_type(base_type);
1899 }
1900 
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * Note: *can_swap is only written on failure; callers are expected to
 * initialize it to true before calling.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1927 
/* Bitmask with the low n bits set. */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #: */
/* The single-iteration outer loop exists only to declare __srcreg without
 * needing a braced scope at the use site; the ~0 sentinel makes it non-NULL
 * so that loop body is entered.  NULL srcs[] slots are skipped.
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* Iterate only the sources for which __filter(src) is true. */
#define foreach_src_if(__srcreg, __instr, __filter)                            \
   foreach_src (__srcreg, __instr)                                             \
      if (__filter(__srcreg))
1945 
1946 /* Is this either the first src in an alias group (see IR3_REG_FIRST_ALIAS) or a
1947  * normal src.
1948  */
1949 static inline bool
ir3_src_is_first_in_group(struct ir3_register * src)1950 ir3_src_is_first_in_group(struct ir3_register *src)
1951 {
1952    return (src->flags & IR3_REG_FIRST_ALIAS) || !(src->flags & IR3_REG_ALIAS);
1953 }
1954 
/* Iterator for an instruction's sources taking alias groups into account.
 * __src_n will hold the original source index (i.e., the index before expanding
 * collects to alias groups) while __alias_n the index within the current
 * group. Thus, the actual source index is __src_n + __alias_n.
 */
/* Both counters start at -1 (unsigned wraparound) so the first
 * first-in-group source brings __src_n to 0. */
#define foreach_src_with_alias_n(__srcreg, __src_n, __alias_n, __instr)        \
   for (unsigned __src_n = -1, __alias_n = -1, __e = 0; !__e; __e = 1)         \
      foreach_src (__srcreg, __instr)                                          \
         if (__src_n += ir3_src_is_first_in_group(__srcreg) ? 1 : 0,           \
             __alias_n =                                                       \
                ir3_src_is_first_in_group(__srcreg) ? 0 : __alias_n + 1,       \
             true)

/* Iterator for all the sources in the alias group (see IR3_REG_FIRST_ALIAS)
 * starting at source index __start. __alias_n is the offset of the source
 * from the start of the alias group.
 */
/* The outer loop iterates at most once, and only when srcs[__start] actually
 * opens an alias group; the inner loop stops at the next group start. */
#define foreach_src_in_alias_group_n(__alias, __alias_n, __instr, __start)     \
   for (struct ir3_register *__alias = __instr->srcs[__start];                 \
        __alias && (__alias->flags & IR3_REG_FIRST_ALIAS); __alias = NULL)     \
      for (unsigned __i = __start, __alias_n = 0;                              \
           __i < __instr->srcs_count &&                                        \
           (__i == __start || !ir3_src_is_first_in_group(__instr->srcs[__i])); \
           __i++, __alias_n++)                                                 \
         if ((__alias = __instr->srcs[__i]))

#define foreach_src_in_alias_group(__alias, __instr, __start)                  \
   foreach_src_in_alias_group_n (__alias, __alias_n, __instr, __start)
1983 
/* iterator for an instructions's destinations (reg), also returns dst #: */
/* Same shape as foreach_src_n: the one-iteration outer loop only declares
 * __dstreg, and NULL dsts[] slots are skipped. */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)

/* Iterate only the destinations for which __filter(dst) is true. */
#define foreach_dst_if(__dstreg, __instr, __filter)                            \
   foreach_dst (__dstreg, __instr)                                             \
      if (__filter(__dstreg))
1999 
2000 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)2001 __ssa_src_cnt(struct ir3_instruction *instr)
2002 {
2003    return instr->srcs_count + instr->deps_count;
2004 }
2005 
2006 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)2007 __is_false_dep(struct ir3_instruction *instr, unsigned n)
2008 {
2009    if (n >= instr->srcs_count)
2010       return true;
2011    return false;
2012 }
2013 
/* Return a pointer to the defining-instruction slot for the n'th SSA source
 * (or false dependency) of instr, or NULL when source n is not an SSA use
 * (e.g. an immediate or const src).
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
2023 
/* Iterate pointers to the defining instructions of instr's SSA sources
 * (including false deps), also returning the source index.
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs (NULL input slots are skipped): */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
2051 
/* iterators for instructions (thin wrappers over util/list.h): */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
/* _safe variants tolerate removal of the current instruction. */
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* Iterate over all instructions in a repeat group. */
/* __instr must be the head of the group (asserted); __first tracks whether
 * we are still on the first element so the circular rpt_node list terminates
 * once we come back around to __instr. */
#define foreach_instr_rpt(__rpt, __instr)                                      \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      for (struct ir3_instruction *__rpt = __instr, *__first = __instr;        \
           __first || __rpt != __instr;                                        \
           __first = NULL, __rpt =                                             \
                              list_entry(__rpt->rpt_node.next,                 \
                                         struct ir3_instruction, rpt_node))

/* Iterate over all instructions except the first one in a repeat group. */
#define foreach_instr_rpt_excl(__rpt, __instr)                                 \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node,  \
                           rpt_node)

#define foreach_instr_rpt_excl_safe(__rpt, __instr)                            \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry_safe (struct ir3_instruction, __rpt,                 \
                                &__instr->rpt_node, rpt_node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
2099 
/* Run an ir3 pass; if it reports progress, dump the IR (debug builds) and
 * re-validate it.  Evaluates to the pass's progress bool so it can be
 * accumulated by callers.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
2109 
2110 /* validate: */
2111 void ir3_validate(struct ir3 *ir);
2112 
2113 /* dump: */
2114 void ir3_print(struct ir3 *ir);
2115 void ir3_print_instr(struct ir3_instruction *instr);
2116 
2117 struct log_stream;
2118 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
2119 
2120 /* delay calculation: */
2121 unsigned ir3_src_read_delay(struct ir3_compiler *compiler,
2122                             struct ir3_instruction *instr, unsigned src_n);
2123 int ir3_delayslots(struct ir3_compiler *compiler,
2124                    struct ir3_instruction *assigner,
2125                    struct ir3_instruction *consumer, unsigned n, bool soft);
2126 unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
2127                                     struct ir3_instruction *assigner,
2128                                     struct ir3_instruction *consumer,
2129                                     unsigned assigner_n, unsigned consumer_n);
2130 
2131 /* estimated (ss)/(sy) delay calculation */
2132 
2133 static inline bool
is_local_mem_load(struct ir3_instruction * instr)2134 is_local_mem_load(struct ir3_instruction *instr)
2135 {
2136    return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
2137       instr->opc == OPC_LDLW;
2138 }
2139 
2140 bool is_scalar_alu(struct ir3_instruction *instr,
2141                    const struct ir3_compiler *compiler);
2142 
2143 /* Does this instruction sometimes need (ss) to wait for its result? */
2144 static inline bool
is_ss_producer(struct ir3_instruction * instr)2145 is_ss_producer(struct ir3_instruction *instr)
2146 {
2147    foreach_dst (dst, instr) {
2148       if (dst->flags & IR3_REG_SHARED)
2149          return true;
2150    }
2151 
2152    if (instr->block->in_early_preamble && writes_addr1(instr))
2153       return true;
2154 
2155    return is_sfu(instr) || is_local_mem_load(instr) || instr->opc == OPC_SHFL;
2156 }
2157 
2158 static inline bool
needs_ss(const struct ir3_compiler * compiler,struct ir3_instruction * producer,struct ir3_instruction * consumer)2159 needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
2160          struct ir3_instruction *consumer)
2161 {
2162    if (is_scalar_alu(producer, compiler) &&
2163        is_scalar_alu(consumer, compiler) &&
2164        (producer->dsts[0]->flags & IR3_REG_HALF) ==
2165        (consumer->srcs[0]->flags & IR3_REG_HALF))
2166       return false;
2167 
2168    return is_ss_producer(producer);
2169 }
2170 
2171 static inline bool
supports_ss(struct ir3_instruction * instr)2172 supports_ss(struct ir3_instruction *instr)
2173 {
2174    return opc_cat(instr->opc) < 5 || instr->opc == OPC_ALIAS;
2175 }
2176 
2177 /* The soft delay for approximating the cost of (ss). */
2178 static inline unsigned
soft_ss_delay(struct ir3_instruction * instr)2179 soft_ss_delay(struct ir3_instruction *instr)
2180 {
2181    /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
2182     * using nop's instead of (ss) is:
2183     *
2184     *     8 - single warp
2185     *     9 - two warps
2186     *    10 - four warps
2187     *
2188     * and so on. Not quite sure where it tapers out (ie. how many warps share an
2189     * SFU unit). But 10 seems like a reasonable # to choose:
2190     */
2191    if (is_sfu(instr) || is_local_mem_load(instr))
2192       return 10;
2193 
2194    /* The blob adds 6 nops between shared producers and consumers, and before we
2195     * used (ss) this was sufficient in most cases.
2196     */
2197    return 6;
2198 }
2199 
2200 static inline bool
is_sy_producer(struct ir3_instruction * instr)2201 is_sy_producer(struct ir3_instruction *instr)
2202 {
2203    return is_tex_or_prefetch(instr) ||
2204       (is_load(instr) && !is_local_mem_load(instr)) ||
2205       is_atomic(instr->opc);
2206 }
2207 
/* Estimated cost in cycles of waiting on (sy) for instr's result; a
 * scheduling heuristic only — real latency depends on cache state.
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   /* NOTE(review): assumes FS/CS execute at doubled wavesize — confirm for
    * the gens this heuristic is used on.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
2261 
2262 /* Some instructions don't immediately consume their sources so may introduce a
2263  * WAR hazard.
2264  */
2265 static inline bool
is_war_hazard_producer(struct ir3_instruction * instr)2266 is_war_hazard_producer(struct ir3_instruction *instr)
2267 {
2268    return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
2269           instr->opc == OPC_STC;
2270 }
2271 
2272 bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2273 bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2274 bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);
2275 bool ir3_create_alias_tex_regs(struct ir3 *ir);
2276 bool ir3_insert_alias_tex(struct ir3 *ir);
2277 bool ir3_create_alias_rt(struct ir3 *ir, struct ir3_shader_variant *v);
2278 
2279 /* unreachable block elimination: */
2280 bool ir3_remove_unreachable(struct ir3 *ir);
2281 
2282 /* calculate reconvergence information: */
2283 void ir3_calc_reconvergence(struct ir3_shader_variant *so);
2284 
2285 /* lower invalid shared phis after calculating reconvergence information: */
2286 bool ir3_lower_shared_phis(struct ir3 *ir);
2287 
2288 /* dead code elimination: */
2289 struct ir3_shader_variant;
2290 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
2291 
2292 /* fp16 conversion folding */
2293 bool ir3_cf(struct ir3 *ir);
2294 
2295 /* shared mov folding */
2296 bool ir3_shared_fold(struct ir3 *ir);
2297 
2298 /* copy-propagate: */
2299 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
2300 
2301 /* common subexpression elimination: */
2302 bool ir3_cse(struct ir3 *ir);
2303 
2304 /* Make arrays SSA */
2305 bool ir3_array_to_ssa(struct ir3 *ir);
2306 
2307 /* scheduling: */
2308 bool ir3_sched_add_deps(struct ir3 *ir);
2309 int ir3_sched(struct ir3 *ir);
2310 
2311 struct ir3_context;
2312 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
2313 
2314 /* register assignment: */
2315 int ir3_ra(struct ir3_shader_variant *v);
2316 void ir3_ra_predicates(struct ir3_shader_variant *v);
2317 
2318 /* lower subgroup ops: */
2319 bool ir3_lower_subgroups(struct ir3 *ir);
2320 
2321 /* legalize: */
2322 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
2323 bool ir3_legalize_relative(struct ir3 *ir);
2324 
2325 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)2326 ir3_has_latency_to_hide(struct ir3 *ir)
2327 {
2328    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
2329     * know the nature of the fragment shader.  Just assume it will have
2330     * latency to hide:
2331     */
2332    if (ir->type != MESA_SHADER_FRAGMENT)
2333       return true;
2334 
2335    foreach_block (block, &ir->block_list) {
2336       foreach_instr (instr, &block->instr_list) {
2337          if (is_tex_or_prefetch(instr))
2338             return true;
2339 
2340          if (is_load(instr)) {
2341             switch (instr->opc) {
2342             case OPC_LDLV:
2343             case OPC_LDL:
2344             case OPC_LDLW:
2345                break;
2346             default:
2347                return true;
2348             }
2349          }
2350       }
2351    }
2352 
2353    return false;
2354 }
2355 
/**
 * Move 'instr' to after the last phi node at the beginning of the block:
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *last_phi = ir3_block_get_last_phi(block);
   if (!last_phi)
      ir3_instr_move_before_block(instr, block);
   else
      ir3_instr_move_after(instr, last_phi);
}
2369 
2370 static inline struct ir3_cursor
ir3_before_block(struct ir3_block * block)2371 ir3_before_block(struct ir3_block *block)
2372 {
2373    assert(block);
2374    struct ir3_cursor cursor;
2375    cursor.option = IR3_CURSOR_BEFORE_BLOCK;
2376    cursor.block = block;
2377    return cursor;
2378 }
2379 
2380 static inline struct ir3_cursor
ir3_after_block(struct ir3_block * block)2381 ir3_after_block(struct ir3_block *block)
2382 {
2383    assert(block);
2384    struct ir3_cursor cursor;
2385    cursor.option = IR3_CURSOR_AFTER_BLOCK;
2386    cursor.block = block;
2387    return cursor;
2388 }
2389 
2390 static inline struct ir3_cursor
ir3_before_instr(struct ir3_instruction * instr)2391 ir3_before_instr(struct ir3_instruction *instr)
2392 {
2393    assert(instr);
2394    struct ir3_cursor cursor;
2395    cursor.option = IR3_CURSOR_BEFORE_INSTR;
2396    cursor.instr = instr;
2397    return cursor;
2398 }
2399 
2400 static inline struct ir3_cursor
ir3_after_instr(struct ir3_instruction * instr)2401 ir3_after_instr(struct ir3_instruction *instr)
2402 {
2403    assert(instr);
2404    struct ir3_cursor cursor;
2405    cursor.option = IR3_CURSOR_AFTER_INSTR;
2406    cursor.instr = instr;
2407    return cursor;
2408 }
2409 
2410 static inline struct ir3_cursor
ir3_before_terminator(struct ir3_block * block)2411 ir3_before_terminator(struct ir3_block *block)
2412 {
2413    assert(block);
2414    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
2415 
2416    if (terminator)
2417       return ir3_before_instr(terminator);
2418    return ir3_after_block(block);
2419 }
2420 
2421 static inline struct ir3_cursor
ir3_after_phis(struct ir3_block * block)2422 ir3_after_phis(struct ir3_block *block)
2423 {
2424    assert(block);
2425 
2426    foreach_instr (instr, &block->instr_list) {
2427       if (instr->opc != OPC_META_PHI)
2428          return ir3_before_instr(instr);
2429    }
2430 
2431    return ir3_after_block(block);
2432 }
2433 
2434 static inline struct ir3_cursor
ir3_after_instr_and_phis(struct ir3_instruction * instr)2435 ir3_after_instr_and_phis(struct ir3_instruction *instr)
2436 {
2437    if (instr->opc == OPC_META_PHI) {
2438       return ir3_after_phis(instr->block);
2439    } else {
2440       return ir3_after_instr(instr);
2441    }
2442 }
2443 
2444 static inline struct ir3_builder
ir3_builder_at(struct ir3_cursor cursor)2445 ir3_builder_at(struct ir3_cursor cursor)
2446 {
2447    struct ir3_builder builder;
2448    builder.cursor = cursor;
2449    return builder;
2450 }
2451 
2452 
2453 /* ************************************************************************* */
2454 /* instruction helpers */
2455 
/* creates SSA src of correct type (ie. half vs full precision) */
/* The HALF/SHARED flags are inherited from src's dst so the use matches the
 * def's precision and register file; additional flags may be or'd in via
 * 'flags'.  Returns the newly-created source register.
 */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   /* A vector def is consumed whole, so mirror its write-mask: */
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}
2468 
/* Create an SSA destination register for instr (register # assigned later
 * by RA) and link it back to its defining instruction.
 */
static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}
2476 
2477 static BITMASK_ENUM(ir3_register_flags)
type_flags(type_t type)2478 type_flags(type_t type)
2479 {
2480    if (type_size(type) < 32)
2481       return IR3_REG_HALF;
2482    return (ir3_register_flags)0;
2483 }
2484 
/* Create a mov of an immediate value with the given type, optionally into a
 * shared register.  Returns the mov instruction whose dst holds the value.
 */
static inline struct ir3_instruction *
create_immed_typed_shared(struct ir3_builder *build, uint32_t val, type_t type,
                          bool shared)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}
2500 
/* Typed immediate in a normal (non-shared) register. */
static inline struct ir3_instruction *
create_immed_typed(struct ir3_builder *build, uint32_t val, type_t type)
{
   return create_immed_typed_shared(build, val, type, false);
}

/* u32 immediate, optionally in a shared register. */
static inline struct ir3_instruction *
create_immed_shared(struct ir3_builder *build, uint32_t val, bool shared)
{
   return create_immed_typed_shared(build, val, TYPE_U32, shared);
}

/* u32 immediate in a normal register — the common case. */
static inline struct ir3_instruction *
create_immed(struct ir3_builder *build, uint32_t val)
{
   return create_immed_shared(build, val, false);
}
2518 
/* Create a mov from the const (uniform) file at const-reg offset n, with the
 * given type selecting half vs full precision.
 */
static inline struct ir3_instruction *
create_uniform_typed(struct ir3_builder *build, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

/* Convenience wrapper: f32 uniform load. */
static inline struct ir3_instruction *
create_uniform(struct ir3_builder *build, unsigned n)
{
   return create_uniform_typed(build, n, TYPE_F32);
}
2539 
/* Create a relative (a0-indexed) load from the const file: reads const reg
 * [address + n].
 */
static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_builder *build, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}
2556 
/* Create a same-type mov of src's SSA value.  Array sources have their array
 * info propagated; relative (a0-indexed) sources are not supported.
 */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_builder *build, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   /* Inherit sharedness from the source's destination: */
   ir3_register_flags flags = type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, 0);
   }
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
2575 
/* Create nrpt movs (one per repeat) and link them into a repeat group.
 * nrpt must not exceed the capacity of the rpts array.
 */
static inline struct ir3_instruction_rpt
ir3_MOV_rpt(struct ir3_builder *build, unsigned nrpt,
            struct ir3_instruction_rpt src, type_t type)
{
   struct ir3_instruction_rpt dst;
   assert(nrpt <= ARRAY_SIZE(dst.rpts));

   for (unsigned rpt = 0; rpt < nrpt; ++rpt)
      dst.rpts[rpt] = ir3_MOV(build, src.rpts[rpt], type);

   ir3_instr_create_rpt(dst.rpts, nrpt);
   return dst;
}
2589 
/* Create a converting mov (cov) from src_type to dst_type.  The source must
 * already be of src_type's size; array sources are not supported.
 */
static inline struct ir3_instruction *
ir3_COV(struct ir3_builder *build, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   /* Inherit sharedness from the source's destination: */
   ir3_register_flags dst_flags = type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
   ASSERTED ir3_register_flags src_flags = type_flags(src_type);

   /* The source's precision must match the declared src_type: */
   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
2607 
2608 static inline struct ir3_instruction_rpt
ir3_COV_rpt(struct ir3_builder * build,unsigned nrpt,struct ir3_instruction_rpt src,type_t src_type,type_t dst_type)2609 ir3_COV_rpt(struct ir3_builder *build, unsigned nrpt,
2610             struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
2611 {
2612    struct ir3_instruction_rpt dst;
2613 
2614    for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2615       dst.rpts[rpt] = ir3_COV(build, src.rpts[rpt], src_type, dst_type);
2616 
2617    ir3_instr_create_rpt(dst.rpts, nrpt);
2618    return dst;
2619 }
2620 
/* Create a movmsk writing a <components>-wide shared-register vector; the
 * repeat count encodes the number of additional components written.
 */
static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_builder *build, unsigned components)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   instr->repeat = components - 1;
   return instr;
}
2632 
/* Create a ballot macro instruction: the result is a <components>-wide
 * shared-register vector computed from src.
 */
static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_builder *build, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_build_instr(build, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}
2648 
/* clang-format off */
/* __INSTR0: defines ir3_<name>(build) for a zero-source instruction.
 * Destination capacity of 1 is reserved in ir3_build_instr() but no SSA
 * destination is created here; "flag" is OR'd into instr->flags.
 */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_builder *build)    \
{                                                                              \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 0);          \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
/* INSTR0F: variant carrying an IR3_INSTR_<f> flag; INSTR0: plain variant. */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0((ir3_instruction_flags)0, name, OPC_##name)
2660 
/* clang-format off */
/* __INSTR1: defines two builder helpers for a one-source instruction:
 *  - ir3_<name>(build, a, aflags): dst_count SSA destinations, one SSA
 *    source.  When scalar_alu is true, the destinations inherit
 *    IR3_REG_SHARED from the source instruction's destination.
 *  - ir3_<name>_rpt(build, nrpt, a, aflags): builds nrpt copies (one per
 *    repetition slot) and groups them via ir3_instr_create_rpt().
 */
#define __INSTR1(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags)      \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 1);                               \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0;  \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++)                                   \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags);                  \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}

/* clang-format on */
/* F = extra IR3_INSTR_<f> flag, S = scalar-alu (shared dst propagation),
 * NODST = no destination registers.
 */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
#define INSTR1(name)      __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR1S(name)     __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2693 
/* clang-format off */
/* __INSTR2: like __INSTR1 but for two-source instructions.  When
 * scalar_alu is true, the destinations are marked IR3_REG_SHARED only if
 * BOTH sources' destinations are shared (bitwise AND of their flags).
 */
#define __INSTR2(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, dst_count, 2);  \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &   \
                                     IR3_REG_SHARED) : 0;                      \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags,                              \
   struct ir3_instruction_rpt b, unsigned bflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                 \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags,                   \
                                 b.rpts[rpt], bflags);                         \
   }                                                                           \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}
/* clang-format on */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name,  \
                                    false)
#define INSTR2(name)       __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR2S(name)      __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR2NODST(name)  __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2730 
/* clang-format off */
/* __INSTR3: like __INSTR2 but for three-source instructions; the shared
 * destination flag is propagated only when all three sources' destinations
 * are shared.
 */
#define __INSTR3(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 3);                               \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &   \
                                     c->dsts[0]->flags & IR3_REG_SHARED) : 0;  \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_builder *build, unsigned nrpt,                                   \
   struct ir3_instruction_rpt a, unsigned aflags,                              \
   struct ir3_instruction_rpt b, unsigned bflags,                              \
   struct ir3_instruction_rpt c, unsigned cflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                 \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags,                   \
                                 b.rpts[rpt], bflags,                          \
                                 c.rpts[rpt], cflags);                         \
   }                                                                           \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}
/* clang-format on */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
#define INSTR3(name)      __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR3S(name)     __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2772 
/* clang-format off */
/* __INSTR4: defines ir3_<name>() for a four-source instruction with
 * dst_count SSA destinations (no scalar-alu/_rpt variants at this arity).
 */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 4);                               \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2795 
/* clang-format off */
/* __INSTR5: defines ir3_<name>() for a five-source instruction with a
 * single SSA destination (no NODST variant at this arity).
 */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 5);          \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2817 
/* clang-format off */
/* __INSTR6: defines ir3_<name>() for a six-source instruction with
 * dst_count SSA destinations.
 *
 * NOTE(review): the destination capacity passed to ir3_build_instr() was
 * previously hard-coded to 1 even though the macro takes dst_count; it now
 * uses dst_count, matching __INSTR4 (INSTR6NODST creates no destinations,
 * so reserving one was a harmless over-allocation).
 */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,      \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_build_instr(build, opc, dst_count, 6);                               \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2843 
/* Builder helper declarations, expanded from the INSTR* macros above. */

/* cat0 instructions: */
INSTR0(NOP)
INSTR1NODST(BR)
INSTR1NODST(BALL)
INSTR1NODST(BANY)
INSTR2NODST(BRAA)
INSTR2NODST(BRAO)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR1NODST(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
INSTR1(READ_GETLAST_MACRO)
2871 
2872 static inline struct ir3_instruction *
2873 ir3_ELECT_MACRO(struct ir3_builder *build)
2874 {
2875    struct ir3_instruction *instr =
2876       ir3_build_instr(build, OPC_ELECT_MACRO, 1, 0);
2877    __ssa_dst(instr);
2878    return instr;
2879 }
2880 
2881 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_builder * build)2882 ir3_SHPS_MACRO(struct ir3_builder *build)
2883 {
2884    struct ir3_instruction *instr = ir3_build_instr(build, OPC_SHPS_MACRO, 1, 0);
2885    __ssa_dst(instr);
2886    return instr;
2887 }
2888 
/* Builder helper declarations, expanded from the INSTR* macros above. */

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2S(ADD_F)
INSTR2S(MIN_F)
INSTR2S(MAX_F)
INSTR2S(MUL_F)
INSTR1S(SIGN_F)
INSTR2S(CMPS_F)
INSTR1S(ABSNEG_F)
INSTR2S(CMPV_F)
INSTR1S(FLOOR_F)
INSTR1S(CEIL_F)
INSTR1S(RNDNE_F)
INSTR1S(RNDAZ_F)
INSTR1S(TRUNC_F)
INSTR2S(ADD_U)
INSTR2S(ADD_S)
INSTR2S(SUB_U)
INSTR2S(SUB_S)
INSTR2S(CMPS_U)
INSTR2S(CMPS_S)
INSTR2S(MIN_U)
INSTR2S(MIN_S)
INSTR2S(MAX_U)
INSTR2S(MAX_S)
INSTR1S(ABSNEG_S)
INSTR2S(AND_B)
INSTR2S(OR_B)
INSTR1S(NOT_B)
INSTR2S(XOR_B)
INSTR2S(CMPV_U)
INSTR2S(CMPV_S)
INSTR2S(MUL_U24)
INSTR2S(MUL_S24)
INSTR2S(MULL_U)
INSTR1S(BFREV_B)
INSTR1S(CLZ_S)
INSTR1S(CLZ_B)
INSTR2S(SHL_B)
INSTR2S(SHR_B)
INSTR2S(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2S(MGEN_B)
INSTR2S(GETBIT_B)
INSTR1(SETRM)
INSTR1S(CBITS_B)
INSTR2S(SHB)
INSTR2S(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3S(SEL_B16)
INSTR3S(SEL_B32)
INSTR3S(SEL_S16)
INSTR3S(SEL_S32)
INSTR3S(SEL_F16)
INSTR3S(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)
INSTR3S(SHRM)
INSTR3S(SHLM)
INSTR3S(SHRG)
INSTR3S(SHLG)
INSTR3S(ANDG)

/* cat4 instructions: */
INSTR1S(RCP)
INSTR1S(RSQ)
INSTR1S(HRSQ)
INSTR1S(LOG2)
INSTR1S(HLOG2)
INSTR1S(EXP2)
INSTR1S(HEXP2)
INSTR1S(SIN)
INSTR1S(COS)
INSTR1S(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2984 
/* Build a cat5 sample instruction with a single destination (write-mask
 * set from wrmask) and up to three sources, appended in this order:
 *  - samp_tex: sampler/texture arg, only when IR3_INSTR_S2EN is set;
 *    half-reg unless IR3_INSTR_B is also set.
 *  - src0: coordinate source; for OPC_SAM with no src0 (the prefetch
 *    case) a dummy shared source is created instead — see comment below.
 *  - src1: optional additional source.
 * The source count passed to ir3_build_instr() must match, so it is
 * computed first from the same conditions.
 */
static inline struct ir3_instruction *
ir3_SAM(struct ir3_builder *build, opc_t opc, type_t type, unsigned wrmask,
        ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
        struct ir3_instruction *src0, struct ir3_instruction *src1)
{
   struct ir3_instruction *sam;
   unsigned nreg = 0;

   if (flags & IR3_INSTR_S2EN) {
      nreg++;
   }
   if (src0 || opc == OPC_SAM) {
      nreg++;
   }
   if (src1) {
      nreg++;
   }

   sam = ir3_build_instr(build, opc, 1, nreg);
   sam->flags |= flags;
   __ssa_dst(sam)->wrmask = wrmask;
   if (flags & IR3_INSTR_S2EN) {
      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
   }
   if (src0) {
      __ssa_src(sam, src0, 0);
   } else if (opc == OPC_SAM) {
      /* Create a dummy shared source for the coordinate, for the prefetch
       * case. It needs to be shared so that we don't accidentally disable early
       * preamble, and this is what the blob does.
       */
      ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
   }
   if (src1) {
      __ssa_src(sam, src1, 0);
   }
   sam->cat5.type = type;

   return sam;
}
3025 
3026 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
3027  * value or is set to ry. In order to model this in SSA form, we add an extra
3028  * argument (the initial value of rx) and tie it to the destination.
3029  */
3030 static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_builder * build,unsigned cluster_size,struct ir3_instruction * src,struct ir3_instruction * dst_default)3031 ir3_BRCST_ACTIVE(struct ir3_builder *build, unsigned cluster_size,
3032                  struct ir3_instruction *src,
3033                  struct ir3_instruction *dst_default)
3034 {
3035    struct ir3_instruction *brcst =
3036       ir3_build_instr(build, OPC_BRCST_ACTIVE, 1, 2);
3037    brcst->cat5.cluster_size = cluster_size;
3038    brcst->cat5.type = TYPE_U32;
3039    struct ir3_register *brcst_dst = __ssa_dst(brcst);
3040    __ssa_src(brcst, src, 0);
3041    struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
3042    ir3_reg_tie(brcst_dst, default_src);
3043    return brcst;
3044 }
3045 
/* Builder helper declarations, expanded from the INSTR* macros above. */

/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
INSTR2(SHFL)
/* GPU is presumably defined by the including translation unit to select a
 * generation; when it is undefined, none of the helpers below are emitted.
 */
#ifndef GPU
#elif GPU >= 600
INSTR4NODST(STIB);
INSTR3(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
INSTR4NODST(LDG_K)
INSTR5(RAY_INTERSECTION)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)
3133 
3134 /* ************************************************************************* */
3135 #include "util/bitset.h"
3136 
#define MAX_REG 256

/* Per-file register-usage bitsets.  The *_REG_SIZE constants are defined
 * elsewhere in ir3; full/shared/nongpr reserve 2x entries (presumably one
 * bit per half-register slot — TODO confirm against ir3_reg_file_offset()).
 */
typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);

/* Tracks which registers are in use, one bitset per register file. */
typedef struct {
   bool mergedregs; /* passed through to ir3_reg_file_offset() */
   fullstate_t full;
   halfstate_t half;
   sharedstate_t shared;
   nongprstate_t nongpr;
} regmask_t;
3151 
/* Return the bitset backing the given register file within regmask. */
static inline BITSET_WORD *
__regmask_file(regmask_t *regmask, enum ir3_reg_file file)
{
   switch (file) {
   case IR3_FILE_FULL:
      return regmask->full;
   case IR3_FILE_HALF:
      return regmask->half;
   case IR3_FILE_SHARED:
      return regmask->shared;
   case IR3_FILE_NONGPR:
      return regmask->nongpr;
   }
   unreachable("bad file");
}
3167 
3168 static inline bool
__regmask_get(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3169 __regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3170 {
3171    BITSET_WORD *regs = __regmask_file(regmask, file);
3172    for (unsigned i = 0; i < size; i++) {
3173       if (BITSET_TEST(regs, n + i))
3174          return true;
3175    }
3176    return false;
3177 }
3178 
3179 static inline void
__regmask_set(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3180 __regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3181 {
3182    BITSET_WORD *regs = __regmask_file(regmask, file);
3183    for (unsigned i = 0; i < size; i++)
3184       BITSET_SET(regs, n + i);
3185 }
3186 
3187 static inline void
__regmask_clear(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3188 __regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3189 {
3190    BITSET_WORD *regs = __regmask_file(regmask, file);
3191    for (unsigned i = 0; i < size; i++)
3192       BITSET_CLEAR(regs, n + i);
3193 }
3194 
/* Zero all bitsets and record the merged-register mode for later
 * offset calculations.
 */
static inline void
regmask_init(regmask_t *regmask, bool mergedregs)
{
   memset(regmask, 0, sizeof(*regmask));
   regmask->mergedregs = mergedregs;
}
3201 
/* dst = a | b across all register files.  All three masks must use the
 * same mergedregs mode, otherwise the bit offsets wouldn't line up.
 */
static inline void
regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   assert(dst->mergedregs == a->mergedregs);
   assert(dst->mergedregs == b->mergedregs);

   for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
      dst->full[i] = a->full[i] | b->full[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
      dst->half[i] = a->half[i] | b->half[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
      dst->shared[i] = a->shared[i] | b->shared[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
      dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
}
3217 
/* dst->shared = a->shared | b->shared; other files are left untouched.
 * NOTE(review): unlike regmask_or() there is no mergedregs assert here —
 * presumably shared-file offsets don't depend on mergedregs; confirm.
 */
static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
      dst->shared[i] = a->shared[i] | b->shared[i];
}
3224 
/* Mark every scalar register covered by reg as used.  For relative
 * (IR3_REG_RELATIV) accesses the whole window of reg->size elements is
 * marked; otherwise only the components selected by wrmask, each "size"
 * bits apart.
 */
static inline void
regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      __regmask_set(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            __regmask_set(regmask, file, n, size);
   }
}
3240 
/* Inverse of regmask_set(): clear every scalar register covered by reg,
 * using the same relative-window / wrmask iteration.
 */
static inline void
regmask_clear(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      __regmask_clear(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            __regmask_clear(regmask, file, n, size);
   }
}
3256 
/* Return true if any scalar register covered by reg is currently set in
 * the mask, using the same relative-window / wrmask iteration as
 * regmask_set().
 */
static inline bool
regmask_get(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      return __regmask_get(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            if (__regmask_get(regmask, file, n, size))
               return true;
   }
   return false;
}
3274 /* ************************************************************************* */
3275 
3276 #endif /* IR3_H_ */
3277