• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #ifndef IR3_H_
25 #define IR3_H_
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 
30 #include "compiler/shader_enums.h"
31 
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36 
37 #include "freedreno_common.h"
38 
39 #include "instr-a3xx.h"
40 
41 /* low level intermediate representation of an adreno shader program */
42 
43 struct ir3_compiler;
44 struct ir3;
45 struct ir3_instruction;
46 struct ir3_block;
47 
/* Statistics and metadata collected about a compiled shader binary
 * (sizes, instruction counts, register usage, estimated stalls).
 * See ir3_collect_info().
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
94 
/* A group of registers tracked together for register allocation,
 * along with their shared placement/spill state.
 * NOTE(review): appears to be RA's coalescing/merge-set structure
 * (registers placed contiguously starting at preferred_reg) -- confirm
 * against the RA pass.
 */
struct ir3_merge_set {
   uint16_t preferred_reg; /* preferred physical register for the set */
   uint16_t size;          /* total size of the set (scalar components) */
   uint16_t alignment;     /* required alignment of the base register */

   unsigned interval_start; /* start of the set's live interval */
   unsigned spill_slot;     /* assigned spill slot, if spilled */

   /* the registers belonging to this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
106 
/* Flags describing an ir3_register operand (both src and dst). */
typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0), /* operand lives in the const register file */
   IR3_REG_IMMED = BIT(1), /* operand is an immediate (see iim_val etc) */
   IR3_REG_HALF = BIT(2),  /* half-precision (16b) register */
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4), /* relative (a0-indexed) addressing */
   IR3_REG_R = BIT(5),       /* (r) flag: register increments with (rptN) */
   /* Most instructions, it seems, can do float abs/neg but not
    * integer.  The CP pass needs to know what is intended (int or
    * float) in order to do the right thing.  For this reason the
    * abs/neg flags are split out into float and int variants.  In
    * addition, .b (bitwise) operations, the negate is actually a
    * bitwise not, so split that out into a new flag to make it
    * more clear.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input?  Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),
} ir3_register_flags;
172 
/* A single source or destination operand of an ir3_instruction. */
struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   /* NOTE(review): virtual register name/id used during compilation --
    * exact assignment pass not visible from this header, confirm.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* spilling state (see also ir3_merge_set::spill_slot): */
   unsigned spill_slot, next_use;

   /* register-merging / live-interval state used during RA: */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
238 
/*
 * Stupid/simple growable array implementation:
 */

/* Declares the fields for a growable array 'name' of element type 'type':
 * name##_count (elements in use), name##_sz (allocated capacity), and the
 * backing pointer itself.
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Appends __VA_ARGS__ to 'arr', doubling capacity (min 16) via ralloc
 * reallocation when full.  'ctx' is the ralloc context owning the array.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
254 
/* Reduction operations (see ir3_instruction::cat1.reduce_op).
 * Suffixes: _U unsigned int, _S signed int, _F float, _B bitwise.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
270 
/* Scope selector for alias instructions (see ir3_instruction::cat7).
 * NOTE(review): the numeric values (0/3/4) match a hw encoding; the gap
 * is presumably intentional -- confirm against the ISA docs.
 */
typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;
276 
/* Per-instruction flags: sync bits, encoding modifiers, and IR-internal
 * meta flags.
 */
typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy.  So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   /* encoding modifier bits -- exact meaning depends on instruction
    * category (see instr-a3xx.h):
    */
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(15),

   /* Used by shared register allocation when creating spill/reload instructions
    * to inform validation that this is created by RA. This also may be set on
    * an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   /* note: BIT(16) is currently unassigned */
   IR3_INSTR_UNUSED = BIT(17),
} ir3_instruction_flags;
347 
/* A single IR instruction.  Category-specific fields live in the big
 * anonymous union below (cat0..cat7 plus meta-instruction variants).
 */
struct ir3_instruction {
   struct ir3_block *block; /* containing basic block */
   opc_t opc;               /* opcode (see instr-a3xx.h) */
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat; /* (rptN): # of times the instruction repeats */
   uint8_t nop;    /* (nopN): # of nops encoded after this instruction */
#ifdef DEBUG
   /* allocated capacity, to validate srcs/dsts array bounds: */
   unsigned srcs_max, dsts_max;
#endif
   /* destination and source operand arrays: */
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      /* cat0: flow control (branches, jumps, etc): */
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      /* cat1: mov / type conversion: */
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      /* cat2: ALU, incl. compares: */
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      /* cat3: three-src ALU: */
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      /* cat5: texture sample instructions: */
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      /* cat6: memory load/store: */
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      /* cat7: barriers / fences / alias: */
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         /* raw encoded instruction bits: */
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* unique id, for debugging/printing: */
   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
538 
/* The IR for one shader: a list of basic blocks plus various tracking
 * arrays used by individual passes.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   /* shader input instructions: */
   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all other dependencies of the
    * consuming instruction are scheduled before the instruction
    * that writes the address register.  Having a convenient list
    * of instructions that reference some address register
    * simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
592 
/* A register array accessed via relative (a0-indexed) addressing. */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;       /* # of scalar components */
   unsigned id;

   struct nir_def *r; /* NIR def this array was created for */

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

/* look up an array by its id: */
struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
620 
/* The kind of branch terminating a block with two successors: */
enum ir3_branch_type {
   IR3_BRANCH_COND,    /* condition */
   IR3_BRANCH_ANY,     /* subgroupAny(condition) */
   IR3_BRANCH_ALL,     /* subgroupAll(condition) */
   IR3_BRANCH_GETONE,  /* subgroupElect() */
   IR3_BRANCH_GETLAST, /* getlast.w8 */
   IR3_BRANCH_SHPS,    /* preamble start */
};
629 
/* A basic block in the shader's CFG. */
struct ir3_block {
   struct list_head node; /* entry in ir3::block_list */
   struct ir3 *shader;

   /* the NIR block this was created from, if any: */
   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   /* instruction-position (ip) range covered by this block: */
   uint16_t start_ip, end_ip;

   /* point where divergent control flow reconverges: */
   bool reconvergence_point;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* dominator-tree state (see ir3_calc_dominance()): */
   struct ir3_block *imm_dom; /* immediate dominator */
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   /* pre/post-order numbering of the dominator tree: */
   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno; /* unique id, for debugging/printing */
#endif
};
691 
/* Returns a stable identifier for the block, suitable for debug output. */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifndef DEBUG
   /* no serial numbers in release builds, fall back to the pointer: */
   return (uint32_t)(unsigned long)block;
#else
   return block->serialno;
#endif
}
701 
702 static inline struct ir3_block *
ir3_start_block(struct ir3 * ir)703 ir3_start_block(struct ir3 *ir)
704 {
705    return list_first_entry(&ir->block_list, struct ir3_block, node);
706 }
707 
708 static inline struct ir3_block *
ir3_end_block(struct ir3 * ir)709 ir3_end_block(struct ir3 *ir)
710 {
711    return list_last_entry(&ir->block_list, struct ir3_block, node);
712 }
713 
714 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)715 ir3_after_preamble(struct ir3 *ir)
716 {
717    struct ir3_block *block = ir3_start_block(ir);
718    /* The preamble will have a usually-empty else branch, and we want to skip
719     * that to get to the block after the preamble.
720     */
721    if (block->brtype == IR3_BRANCH_SHPS)
722       return block->successors[1]->successors[0];
723    else
724       return block;
725 }
726 
/* CFG edge maintenance: */
void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
/* link pred -> succ in the "physical" CFG (see ir3_block::successors): */
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
/* index of 'pred' within block's predecessors array: */
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

/* compute the dominator tree (ir3_block::imm_dom, dom_children, etc): */
void ir3_calc_dominance(struct ir3 *ir);
/* true if block 'a' dominates block 'b': */
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
736 
struct ir3_shader_variant;

/* IR construction/destruction: */
struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

/* fill in the variant's ir3_info statistics: */
void ir3_collect_info(struct ir3_shader_variant *v);
/* allocate memory owned by the shader (freed with ir3_destroy()): */
void *ir3_alloc(struct ir3 *shader, int sz);

/* Occupancy (max_waves) calculation: */
unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

/* Instruction construction ('ndst'/'nsrc' reserve operand capacity): */
struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
/* add an extra (false) dependency, see ir3_instruction::deps: */
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);

/* Operand (register) construction: */
struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);
771 
772 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)773 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
774 {
775    assert(!dst->tied && !src->tied);
776    dst->tied = src;
777    src->tied = dst;
778 }
779 
/* point an array access 'reg' at the most recent array write (see
 * ir3_array::last_write):
 */
void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

/* the only supported way to set ir3_instruction::address: */
void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);
786 
787 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)788 ir3_instr_check_mark(struct ir3_instruction *instr)
789 {
790    if (instr->flags & IR3_INSTR_MARK)
791       return true; /* already visited */
792    instr->flags |= IR3_INSTR_MARK;
793    return false;
794 }
795 
/* clear IR3_INSTR_MARK on all instructions (per-block / whole shader): */
void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

/* number the instructions (ir3_instruction::ip) and return the count: */
unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);
801 
/**
 * Move 'instr' to just before 'after'
 *
 * (removes 'instr' from its current position first)
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   /* adding at the tail of 'after's node places us immediately before it: */
   list_addtail(&instr->node, &after->node);
}
812 
/**
 * Move 'instr' to just after 'before':
 *
 * (removes 'instr' from its current position first)
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   /* adding at the head of 'before's node places us immediately after it: */
   list_add(&instr->node, &before->node);
}
823 
/**
 * Move 'instr' to the beginning of the block:
 *
 * (removes 'instr' from its current position first)
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
834 
/* populate ir3_instruction::uses for all instructions ('mem_ctx' owns the
 * use sets; 'falsedeps' also counts the extra 'deps' array as uses):
 */
void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);

/* fixup dst/src types after changing an instruction's precision: */
void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

/* NOTE(review): presumably maps an immediate to the hw "flut" (float lut)
 * encoding index, negative if not representable -- confirm against impl.
 */
int ir3_flut(struct ir3_register *src_reg);

/* can src 'n' of 'instr' legally carry the given register flags? */
bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

/* can 'instr' encode 'immed' as an inline immediate? */
bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
845 
/* Iterate the SSA uses of __instr, ie. the instructions consuming its
 * value(s).  Only valid if a pass has populated the use sets via
 * ir3_find_ssa_uses().  The outer for() guards against a NULL uses set.
 *
 * Note: the redundant '#include "util/set.h"' that used to sit here was
 * dropped -- it is already included at the top of this header.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
852 
853 static inline uint32_t
reg_num(const struct ir3_register * reg)854 reg_num(const struct ir3_register *reg)
855 {
856    return reg->num >> 2;
857 }
858 
859 static inline uint32_t
reg_comp(const struct ir3_register * reg)860 reg_comp(const struct ir3_register *reg)
861 {
862    return reg->num & 0x3;
863 }
864 
865 static inline bool
is_flow(struct ir3_instruction * instr)866 is_flow(struct ir3_instruction *instr)
867 {
868    return (opc_cat(instr->opc) == 0);
869 }
870 
871 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)872 is_kill_or_demote(struct ir3_instruction *instr)
873 {
874    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
875 }
876 
877 static inline bool
is_nop(struct ir3_instruction * instr)878 is_nop(struct ir3_instruction *instr)
879 {
880    return instr->opc == OPC_NOP;
881 }
882 
883 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)884 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
885 {
886    unsigned dst_type = (dst->flags & IR3_REG_HALF);
887    unsigned src_type = (src->flags & IR3_REG_HALF);
888 
889    /* Treat shared->normal copies as same-type, because they can generally be
890     * folded, but not normal->shared copies.
891     */
892    if (dst_type != src_type ||
893        ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
894       return false;
895    else
896       return true;
897 }
898 
899 /* Is it a non-transformative (ie. not type changing) mov?  This can
900  * also include absneg.s/absneg.f, which for the most part can be
901  * treated as a mov (single src argument).
902  */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      /* A type-converting mov is not a plain copy: */
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      /* A saturating absneg modifies the value, so it is not mov-like: */
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_META_PHI:
      /* A phi with a single source is effectively a copy: */
      return instr->srcs_count == 1;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->num == regid(REG_P0, 0))
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}
947 
948 /* A move from const, which changes size but not type, can also be
949  * folded into dest instruction in some cases.
950  */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* The size may differ, but src and dst must be in the same type class
    * (float/uint/sint) for the conversion to be foldable:
    */
   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}
967 
/* Is this one of the subgroup-operation macro opcodes (expanded into real
 * instructions by a later lowering pass -- NOTE(review): that pass is not
 * visible in this header)?
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      return true;
   default:
      return false;
   }
}
985 
986 static inline bool
is_alu(struct ir3_instruction * instr)987 is_alu(struct ir3_instruction *instr)
988 {
989    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
990 }
991 
992 static inline bool
is_sfu(struct ir3_instruction * instr)993 is_sfu(struct ir3_instruction *instr)
994 {
995    return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
996 }
997 
998 static inline bool
is_tex(struct ir3_instruction * instr)999 is_tex(struct ir3_instruction *instr)
1000 {
1001    return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
1002 }
1003 
/* Broadcast/quad-shuffle opcodes. */
static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}
1018 
1019 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)1020 is_tex_or_prefetch(struct ir3_instruction *instr)
1021 {
1022    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
1023 }
1024 
1025 static inline bool
is_mem(struct ir3_instruction * instr)1026 is_mem(struct ir3_instruction *instr)
1027 {
1028    return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
1029 }
1030 
1031 static inline bool
is_barrier(struct ir3_instruction * instr)1032 is_barrier(struct ir3_instruction *instr)
1033 {
1034    return (opc_cat(instr->opc) == 7);
1035 }
1036 
1037 static inline bool
is_half(struct ir3_instruction * instr)1038 is_half(struct ir3_instruction *instr)
1039 {
1040    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
1041 }
1042 
1043 static inline bool
is_shared(struct ir3_instruction * instr)1044 is_shared(struct ir3_instruction *instr)
1045 {
1046    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
1047 }
1048 
/* Is this a memory-store instruction? */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1070 
/* Is this a memory-load instruction (global/local/const/varying)? */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}
1091 
/* Does this instruction fetch a shader input (varying)? */
static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}
1108 
1109 /* Whether non-helper invocations can read the value of helper invocations. We
1110  * cannot insert (eq) before these instructions.
1111  */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAM:
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. */
   case OPC_MOV:
      /* A mov from a non-shared src into a shared dst is how a lowered
       * READ_FIRST/READ_COND appears:
       */
      return (instr->dsts[0]->flags & IR3_REG_SHARED) &&
             !(instr->srcs[0]->flags & IR3_REG_SHARED);

   default:
      return false;
   }
}
1155 
1156 static inline bool
is_bool(struct ir3_instruction * instr)1157 is_bool(struct ir3_instruction *instr)
1158 {
1159    switch (instr->opc) {
1160    case OPC_CMPS_F:
1161    case OPC_CMPS_S:
1162    case OPC_CMPS_U:
1163       return true;
1164    default:
1165       return false;
1166    }
1167 }
1168 
/* Map a full-precision cat3 opcode to its half-precision variant, or
 * return the opcode unchanged if there is none.
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1187 
/* Inverse of cat3_half_opc(): map a half-precision cat3 opcode to its
 * full-precision variant, or return the opcode unchanged if there is none.
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1206 
/* Map a full-precision cat4 opcode to its half-precision variant, or
 * return the opcode unchanged if there is none.
 */
static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}
1221 
/* Inverse of cat4_half_opc(): map a half-precision cat4 opcode to its
 * full-precision variant, or return the opcode unchanged if there is none.
 */
static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}
1236 
1237 static inline bool
is_meta(struct ir3_instruction * instr)1238 is_meta(struct ir3_instruction *instr)
1239 {
1240    return (opc_cat(instr->opc) == OPC_META);
1241 }
1242 
1243 static inline unsigned
reg_elems(const struct ir3_register * reg)1244 reg_elems(const struct ir3_register *reg)
1245 {
1246    if (reg->flags & IR3_REG_ARRAY)
1247       return reg->size;
1248    else
1249       return util_last_bit(reg->wrmask);
1250 }
1251 
1252 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1253 reg_elem_size(const struct ir3_register *reg)
1254 {
1255    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1256 }
1257 
/* Total register footprint: element count times per-element size. */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   const unsigned elems = reg_elems(reg);
   return elems * reg_elem_size(reg);
}
1263 
/* Number of destination components written, or 0 for instructions with no
 * destination.
 */
static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   /* Only single-dst instructions are expected here: */
   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}
1273 
1274 /* is dst a normal temp register: */
1275 static inline bool
is_dest_gpr(struct ir3_register * dst)1276 is_dest_gpr(struct ir3_register *dst)
1277 {
1278    if (dst->wrmask == 0)
1279       return false;
1280    if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1281       return false;
1282    return true;
1283 }
1284 
1285 static inline bool
writes_gpr(struct ir3_instruction * instr)1286 writes_gpr(struct ir3_instruction *instr)
1287 {
1288    if (dest_regs(instr) == 0)
1289       return false;
1290    return is_dest_gpr(instr->dsts[0]);
1291 }
1292 
1293 static inline bool
writes_addr0(struct ir3_instruction * instr)1294 writes_addr0(struct ir3_instruction *instr)
1295 {
1296    /* Note: only the first dest can write to a0.x */
1297    if (instr->dsts_count > 0) {
1298       struct ir3_register *dst = instr->dsts[0];
1299       return dst->num == regid(REG_A0, 0);
1300    }
1301    return false;
1302 }
1303 
1304 static inline bool
writes_addr1(struct ir3_instruction * instr)1305 writes_addr1(struct ir3_instruction *instr)
1306 {
1307    /* Note: only the first dest can write to a1.x */
1308    if (instr->dsts_count > 0) {
1309       struct ir3_register *dst = instr->dsts[0];
1310       return dst->num == regid(REG_A0, 1);
1311    }
1312    return false;
1313 }
1314 
1315 static inline bool
writes_pred(struct ir3_instruction * instr)1316 writes_pred(struct ir3_instruction *instr)
1317 {
1318    /* Note: only the first dest can write to p0.x */
1319    if (instr->dsts_count > 0) {
1320       struct ir3_register *dst = instr->dsts[0];
1321       return reg_num(dst) == REG_P0;
1322    }
1323    return false;
1324 }
1325 
1326 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
1327  * are considered special here. Special registers are always accessed with one
1328  * size and never alias normal registers, even though a naive calculation
1329  * would sometimes make it seem like e.g. r30.z aliases a0.x.
1330  */
1331 static inline bool
is_reg_special(const struct ir3_register * reg)1332 is_reg_special(const struct ir3_register *reg)
1333 {
1334    return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
1335           (reg_num(reg) == REG_P0);
1336 }
1337 
1338 /* Same as above but in cases where we don't have a register. r48.x and above
1339  * are shared/special.
1340  */
static inline bool
is_reg_num_special(unsigned num)
{
   /* r48.x is regid 48*4 = 192; everything at or above is shared/special. */
   const unsigned first_special = 48 * 4;
   return num >= first_special;
}
1346 
1347 /* returns defining instruction for reg */
1348 /* TODO better name */
1349 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1350 ssa(struct ir3_register *reg)
1351 {
1352    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1353       return reg->def->instr;
1354    return NULL;
1355 }
1356 
1357 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1358 conflicts(struct ir3_register *a, struct ir3_register *b)
1359 {
1360    return (a && b) && (a->def != b->def);
1361 }
1362 
1363 static inline bool
reg_gpr(struct ir3_register * r)1364 reg_gpr(struct ir3_register *r)
1365 {
1366    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1367       return false;
1368    if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1369       return false;
1370    return true;
1371 }
1372 
/* Narrow a type to its 16-bit equivalent; 16-bit and 8-bit types pass
 * through unchanged.
 */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
   case TYPE_S8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1395 
/* Widen a type to its 32-bit equivalent; 32-bit types pass through
 * unchanged.
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S8:
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}
1417 
1418 /* some cat2 instructions (ie. those which are not float) can embed an
1419  * immediate:
1420  */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   /* integer arithmetic / comparison: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   /* bitwise ops: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   /* varying fetch is treated as non-float here too: */
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1462 
1463 /* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept float (abs)/(neg) src modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer ops take no abs/neg modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept the (not) src modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1523 
1524 /* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

      /* FALLTHROUGH: everything below (and the cases above, conservatively)
       * reports no supported abs/neg flags:
       */
   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   default:
      return 0;
   }
}
1562 
1563 /* Return the type (float, int, or uint) the op uses when converting from the
1564  * internal result of the op (which is assumed to be the same size as the
1565  * sources) to the destination when they are not the same size. If F32 it does
1566  * a floating-point conversion, if U32 it does a truncation/zero-extension, if
1567  * S32 it does a truncation/sign-extension. "can_fold" will be false if it
1568  * doesn't do anything sensible or is unknown.
1569  */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   /* Optimistically assume foldable; the default case clears this. */
   *can_fold = true;
   switch (instr->opc) {
   /* float ops convert with a float conversion: */
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   /* unsigned ops truncate/zero-extend: */
   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   /* signed ops truncate/sign-extend: */
   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1630 
1631 /* Return the src and dst types for the conversion which is already folded
1632  * into the op. We can assume that instr has folded in a conversion from
1633  * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
1634  * to call if ir3_output_conv_type() returns can_fold = true.
1635  */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* Normal ALU ops: the first src's size determines the src type: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1670 
1671 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1672 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1673 {
1674    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1675                                                  : full_type(base_type);
1676 }
1677 
1678 /* Some instructions have signed/unsigned variants which are identical except
1679  * for whether the folded conversion sign-extends or zero-extends, and we can
1680  * fold in a mismatching move by rewriting the opcode. Return the opcode to
1681  * switch signedness, and whether one exists.
1682  */
1683 static inline opc_t
ir3_try_swap_signedness(opc_t opc,bool * can_swap)1684 ir3_try_swap_signedness(opc_t opc, bool *can_swap)
1685 {
1686    switch (opc) {
1687 #define PAIR(u, s)                                                             \
1688    case OPC_##u:                                                               \
1689       return OPC_##s;                                                          \
1690    case OPC_##s:                                                               \
1691       return OPC_##u;
1692       PAIR(ADD_U, ADD_S)
1693       PAIR(SUB_U, SUB_S)
1694       /* Note: these are only identical when the sources are half, but that's
1695        * the only case we call this function for anyway.
1696        */
1697       PAIR(MUL_U24, MUL_S24)
1698 
1699    default:
1700       *can_swap = false;
1701       return opc;
1702    }
1703 }
1704 
/* Bitmask with the low n bits set. */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instruction's sources (reg), also returns src #.
 * The outer single-iteration loop only exists to declare __srcreg in a
 * scope local to the iteration (initialized to a non-NULL dummy value).
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instruction's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instruction's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1730 
1731 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1732 __ssa_src_cnt(struct ir3_instruction *instr)
1733 {
1734    return instr->srcs_count + instr->deps_count;
1735 }
1736 
1737 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1738 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1739 {
1740    if (n >= instr->srcs_count)
1741       return true;
1742    return false;
1743 }
1744 
/* Pointer to the n'th SSA source slot (false deps are addressed past
 * srcs_count); NULL when src n is not an SSA source.
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
1754 
/* iterator over an instruction's SSA source slots (instruction **),
 * including false dependencies; also returns the slot #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)

/* Run an ir3 pass; when it reports progress, dump the IR (for debug) and
 * re-validate it.  Evaluates to the pass's progress bool.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
1820 
/* validate: */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);

/* delay calculation: */
/* NOTE(review): 'n' is the consumer src slot; 'soft' presumably selects
 * estimated (sy)/(ss) latencies vs. hard hazards — confirm in ir3.c
 */
int ir3_delayslots(struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);
unsigned ir3_delay_calc(struct ir3_block *block,
                        struct ir3_instruction *instr, bool mergedregs);
1839 
1840 /* estimated (ss)/(sy) delay calculation */
1841 
1842 static inline bool
is_local_mem_load(struct ir3_instruction * instr)1843 is_local_mem_load(struct ir3_instruction *instr)
1844 {
1845    return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
1846       instr->opc == OPC_LDLW;
1847 }
1848 
1849 /* Does this instruction need (ss) to wait for its result? */
1850 static inline bool
is_ss_producer(struct ir3_instruction * instr)1851 is_ss_producer(struct ir3_instruction *instr)
1852 {
1853    foreach_dst (dst, instr) {
1854       if (dst->flags & IR3_REG_SHARED)
1855          return true;
1856    }
1857    return is_sfu(instr) || is_local_mem_load(instr);
1858 }
1859 
1860 /* The soft delay for approximating the cost of (ss). */
1861 static inline unsigned
soft_ss_delay(struct ir3_instruction * instr)1862 soft_ss_delay(struct ir3_instruction *instr)
1863 {
1864    /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
1865     * using nop's instead of (ss) is:
1866     *
1867     *     8 - single warp
1868     *     9 - two warps
1869     *    10 - four warps
1870     *
1871     * and so on. Not quite sure where it tapers out (ie. how many warps share an
1872     * SFU unit). But 10 seems like a reasonable # to choose:
1873     */
1874    if (is_sfu(instr) || is_local_mem_load(instr))
1875       return 10;
1876 
1877    /* The blob adds 6 nops between shared producers and consumers, and before we
1878     * used (ss) this was sufficient in most cases.
1879     */
1880    return 6;
1881 }
1882 
1883 static inline bool
is_sy_producer(struct ir3_instruction * instr)1884 is_sy_producer(struct ir3_instruction *instr)
1885 {
1886    return is_tex_or_prefetch(instr) ||
1887       (is_load(instr) && !is_local_mem_load(instr)) ||
1888       is_atomic(instr->opc);
1889 }
1890 
1891 static inline unsigned
soft_sy_delay(struct ir3_instruction * instr,struct ir3 * shader)1892 soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
1893 {
1894    /* TODO: this is just an optimistic guess, we can do better post-RA.
1895     */
1896    bool double_wavesize =
1897       shader->type == MESA_SHADER_FRAGMENT ||
1898       shader->type == MESA_SHADER_COMPUTE;
1899 
1900    unsigned components = reg_elems(instr->dsts[0]);
1901 
1902    /* These numbers come from counting the number of delay slots to get
1903     * cat5/cat6 results back using nops instead of (sy). Note that these numbers
1904     * are with the result preloaded to cache by loading it before in the same
1905     * shader - uncached results are much larger.
1906     *
1907     * Note: most ALU instructions can't complete at the full doubled rate, so
1908     * they take 2 cycles. The only exception is fp16 instructions with no
1909     * built-in conversions. Therefore divide the latency by 2.
1910     *
1911     * TODO: Handle this properly in the scheduler and remove this.
1912     */
1913    if (instr->opc == OPC_LDC) {
1914       if (double_wavesize)
1915          return (21 + 8 * components) / 2;
1916       else
1917          return 18 + 4 * components;
1918    } else if (is_tex_or_prefetch(instr)) {
1919       if (double_wavesize) {
1920          switch (components) {
1921          case 1: return 58 / 2;
1922          case 2: return 60 / 2;
1923          case 3: return 77 / 2;
1924          case 4: return 79 / 2;
1925          default: unreachable("bad number of components");
1926          }
1927       } else {
1928          switch (components) {
1929          case 1: return 51;
1930          case 2: return 53;
1931          case 3: return 62;
1932          case 4: return 64;
1933          default: unreachable("bad number of components");
1934          }
1935       }
1936    } else {
1937       /* TODO: measure other cat6 opcodes like ldg */
1938       if (double_wavesize)
1939          return (172 + components) / 2;
1940       else
1941          return 109 + components;
1942    }
1943 }
1944 
/* Optimization/lowering passes; each returns whether it made progress
 * (suitable for use with IR3_PASS()):
 */

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);
1983 
1984 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)1985 ir3_has_latency_to_hide(struct ir3 *ir)
1986 {
1987    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
1988     * know the nature of the fragment shader.  Just assume it will have
1989     * latency to hide:
1990     */
1991    if (ir->type != MESA_SHADER_FRAGMENT)
1992       return true;
1993 
1994    foreach_block (block, &ir->block_list) {
1995       foreach_instr (instr, &block->instr_list) {
1996          if (is_tex_or_prefetch(instr))
1997             return true;
1998 
1999          if (is_load(instr)) {
2000             switch (instr->opc) {
2001             case OPC_LDLV:
2002             case OPC_LDL:
2003             case OPC_LDLW:
2004                break;
2005             default:
2006                return true;
2007             }
2008          }
2009       }
2010    }
2011 
2012    return false;
2013 }
2014 
2015 /* ************************************************************************* */
2016 /* instruction helpers */
2017 
2018 /* creates SSA src of correct type (ie. half vs full precision) */
2019 static inline struct ir3_register *
__ssa_src(struct ir3_instruction * instr,struct ir3_instruction * src,unsigned flags)2020 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
2021           unsigned flags)
2022 {
2023    struct ir3_register *reg;
2024    if (src->dsts[0]->flags & IR3_REG_HALF)
2025       flags |= IR3_REG_HALF;
2026    reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
2027    reg->def = src->dsts[0];
2028    reg->wrmask = src->dsts[0]->wrmask;
2029    return reg;
2030 }
2031 
2032 static inline struct ir3_register *
__ssa_dst(struct ir3_instruction * instr)2033 __ssa_dst(struct ir3_instruction *instr)
2034 {
2035    struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
2036    reg->instr = instr;
2037    return reg;
2038 }
2039 
2040 static ir3_register_flags
type_flags(type_t type)2041 type_flags(type_t type)
2042 {
2043    if (type_size(type) < 32)
2044       return IR3_REG_HALF;
2045    return (ir3_register_flags)0;
2046 }
2047 
2048 static inline struct ir3_instruction *
create_immed_typed(struct ir3_block * block,uint32_t val,type_t type)2049 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
2050 {
2051    struct ir3_instruction *mov;
2052    ir3_register_flags flags = type_flags(type);
2053 
2054    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2055    mov->cat1.src_type = type;
2056    mov->cat1.dst_type = type;
2057    __ssa_dst(mov)->flags |= flags;
2058    ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
2059 
2060    return mov;
2061 }
2062 
/* creates a mov of an immediate u32 value */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
2068 
2069 static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block * block,unsigned n,type_t type)2070 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
2071 {
2072    struct ir3_instruction *mov;
2073    ir3_register_flags flags = type_flags(type);
2074 
2075    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2076    mov->cat1.src_type = type;
2077    mov->cat1.dst_type = type;
2078    __ssa_dst(mov)->flags |= flags;
2079    ir3_src_create(mov, n, IR3_REG_CONST | flags);
2080 
2081    return mov;
2082 }
2083 
/* creates a mov from const-file offset n, as f32 */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
2089 
2090 static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block * block,int n,type_t type,struct ir3_instruction * address)2091 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
2092                         struct ir3_instruction *address)
2093 {
2094    struct ir3_instruction *mov;
2095 
2096    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2097    mov->cat1.src_type = type;
2098    mov->cat1.dst_type = type;
2099    __ssa_dst(mov);
2100    ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
2101 
2102    ir3_instr_set_address(mov, address);
2103 
2104    return mov;
2105 }
2106 
2107 static inline struct ir3_instruction *
ir3_MOV(struct ir3_block * block,struct ir3_instruction * src,type_t type)2108 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
2109 {
2110    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2111    ir3_register_flags flags = type_flags(type);
2112 
2113    __ssa_dst(instr)->flags |= flags;
2114    if (src->dsts[0]->flags & IR3_REG_ARRAY) {
2115       struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
2116       src_reg->array = src->dsts[0]->array;
2117    } else {
2118       __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
2119    }
2120    assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
2121    instr->cat1.src_type = type;
2122    instr->cat1.dst_type = type;
2123    return instr;
2124 }
2125 
2126 static inline struct ir3_instruction *
ir3_COV(struct ir3_block * block,struct ir3_instruction * src,type_t src_type,type_t dst_type)2127 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
2128         type_t dst_type)
2129 {
2130    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2131    ir3_register_flags dst_flags = type_flags(dst_type);
2132    ASSERTED ir3_register_flags src_flags = type_flags(src_type);
2133 
2134    assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
2135 
2136    __ssa_dst(instr)->flags |= dst_flags;
2137    __ssa_src(instr, src, 0);
2138    instr->cat1.src_type = src_type;
2139    instr->cat1.dst_type = dst_type;
2140    assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
2141    return instr;
2142 }
2143 
2144 static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block * block,unsigned components)2145 ir3_MOVMSK(struct ir3_block *block, unsigned components)
2146 {
2147    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
2148 
2149    struct ir3_register *dst = __ssa_dst(instr);
2150    dst->flags |= IR3_REG_SHARED;
2151    dst->wrmask = (1 << components) - 1;
2152    instr->repeat = components - 1;
2153    return instr;
2154 }
2155 
2156 static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block * block,struct ir3_instruction * src,unsigned components)2157 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
2158                  unsigned components)
2159 {
2160    struct ir3_instruction *instr =
2161       ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
2162 
2163    struct ir3_register *dst = __ssa_dst(instr);
2164    dst->flags |= IR3_REG_SHARED;
2165    dst->wrmask = (1 << components) - 1;
2166 
2167    __ssa_src(instr, src, 0);
2168 
2169    return instr;
2170 }
2171 
/* creates a nop (no dsts, no srcs) */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2177 
/* Emitter generators for 0-src instructions, eg. ir3_JUMP(block): */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0((ir3_instruction_flags)0, name, OPC_##name)
2189 
/* Emitter generators for 1-src instructions; the NODST variant creates
 * no dst register (eg. kill):
 */
/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name)      __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name)
2207 
/* Emitter generators for 2-src instructions: */
/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name)       __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR2NODST(name)  __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name)
2226 
/* Emitter generators for 3-src instructions: */
/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name)      __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name)
2248 
/* Emitter generators for 4-src instructions: */
/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2271 
/* Emitter generators for 5-src instructions; always exactly one dst (no
 * NODST variant exists):
 */
/* clang-format off */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2293 
/* Emitter generators for 6-src instructions.  Note: create the
 * instruction with dst_count (not a hard-coded 1) so that INSTR6NODST()
 * variants don't allocate an unused dst register, consistent with
 * __INSTR2/__INSTR3/__INSTR4:
 */
/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2319 
/* cat0 instructions: */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2341 
2342 static inline struct ir3_instruction *
2343 ir3_ELECT_MACRO(struct ir3_block *block)
2344 {
2345    struct ir3_instruction *instr =
2346       ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2347    __ssa_dst(instr);
2348    return instr;
2349 }
2350 
2351 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2352 ir3_SHPS_MACRO(struct ir3_block *block)
2353 {
2354    struct ir3_instruction *instr =
2355       ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2356    __ssa_dst(instr);
2357    return instr;
2358 }
2359 
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)
2429 
/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2450 
2451 static inline struct ir3_instruction *
2452 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2453         ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
2454         struct ir3_instruction *src0, struct ir3_instruction *src1)
2455 {
2456    struct ir3_instruction *sam;
2457    unsigned nreg = 0;
2458 
2459    if (flags & IR3_INSTR_S2EN) {
2460       nreg++;
2461    }
2462    if (src0) {
2463       nreg++;
2464    }
2465    if (src1) {
2466       nreg++;
2467    }
2468 
2469    sam = ir3_instr_create(block, opc, 1, nreg);
2470    sam->flags |= flags;
2471    __ssa_dst(sam)->wrmask = wrmask;
2472    if (flags & IR3_INSTR_S2EN) {
2473       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2474    }
2475    if (src0) {
2476       __ssa_src(sam, src0, 0);
2477    }
2478    if (src1) {
2479       __ssa_src(sam, src1, 0);
2480    }
2481    sam->cat5.type = type;
2482 
2483    return sam;
2484 }
2485 
2486 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
2487  * value or is set to ry. In order to model this in SSA form, we add an extra
2488  * argument (the initial value of rx) and tie it to the destination.
2489  */
2490 static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_block * block,unsigned cluster_size,struct ir3_instruction * src,struct ir3_instruction * dst_default)2491 ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
2492                  struct ir3_instruction *src,
2493                  struct ir3_instruction *dst_default)
2494 {
2495    struct ir3_instruction *brcst =
2496       ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
2497    brcst->cat5.cluster_size = cluster_size;
2498    brcst->cat5.type = TYPE_U32;
2499    struct ir3_register *brcst_dst = __ssa_dst(brcst);
2500    __ssa_src(brcst, src, 0);
2501    struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
2502    ir3_reg_tie(brcst_dst, default_src);
2503    return brcst;
2504 }
2505 
/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
/* Generation-specific emitters; GPU is defined (or not) by the per-gen
 * files that include this header:
 */
#ifndef GPU
#elif GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
INSTR4NODST(LDG_K)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)
2591 
2592 /* ************************************************************************* */
2593 #include "util/bitset.h"
2594 
2595 #define MAX_REG 256
2596 
2597 typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
2598 
2599 typedef struct {
2600    bool mergedregs;
2601    regmaskstate_t mask;
2602 } regmask_t;
2603 
2604 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2605 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2606 {
2607    if (regmask->mergedregs) {
2608       /* a6xx+ case, with merged register file, we track things in terms
2609        * of half-precision registers, with a full precisions register
2610        * using two half-precision slots.
2611        *
2612        * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2613        * avoid having them alias normal full regs.
2614        */
2615       if (half && !is_reg_num_special(n)) {
2616          return BITSET_TEST(regmask->mask, n);
2617       } else {
2618          n *= 2;
2619          return BITSET_TEST(regmask->mask, n) ||
2620                 BITSET_TEST(regmask->mask, n + 1);
2621       }
2622    } else {
2623       /* pre a6xx case, with separate register file for half and full
2624        * precision:
2625        */
2626       if (half)
2627          n += MAX_REG;
2628       return BITSET_TEST(regmask->mask, n);
2629    }
2630 }
2631 
2632 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2633 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2634 {
2635    if (regmask->mergedregs) {
2636       /* a6xx+ case, with merged register file, we track things in terms
2637        * of half-precision registers, with a full precisions register
2638        * using two half-precision slots:
2639        */
2640       if (half && !is_reg_num_special(n)) {
2641          BITSET_SET(regmask->mask, n);
2642       } else {
2643          n *= 2;
2644          BITSET_SET(regmask->mask, n);
2645          BITSET_SET(regmask->mask, n + 1);
2646       }
2647    } else {
2648       /* pre a6xx case, with separate register file for half and full
2649        * precision:
2650        */
2651       if (half)
2652          n += MAX_REG;
2653       BITSET_SET(regmask->mask, n);
2654    }
2655 }
2656 
2657 static inline void
__regmask_clear(regmask_t * regmask,bool half,unsigned n)2658 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
2659 {
2660    if (regmask->mergedregs) {
2661       /* a6xx+ case, with merged register file, we track things in terms
2662        * of half-precision registers, with a full precisions register
2663        * using two half-precision slots:
2664        */
2665       if (half && !is_reg_num_special(n)) {
2666          BITSET_CLEAR(regmask->mask, n);
2667       } else {
2668          n *= 2;
2669          BITSET_CLEAR(regmask->mask, n);
2670          BITSET_CLEAR(regmask->mask, n + 1);
2671       }
2672    } else {
2673       /* pre a6xx case, with separate register file for half and full
2674        * precision:
2675        */
2676       if (half)
2677          n += MAX_REG;
2678       BITSET_CLEAR(regmask->mask, n);
2679    }
2680 }
2681 
2682 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2683 regmask_init(regmask_t *regmask, bool mergedregs)
2684 {
2685    memset(&regmask->mask, 0, sizeof(regmask->mask));
2686    regmask->mergedregs = mergedregs;
2687 }
2688 
2689 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2690 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2691 {
2692    assert(dst->mergedregs == a->mergedregs);
2693    assert(dst->mergedregs == b->mergedregs);
2694 
2695    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2696       dst->mask[i] = a->mask[i] | b->mask[i];
2697 }
2698 
2699 static inline void
regmask_or_shared(regmask_t * dst,regmask_t * a,regmask_t * b)2700 regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
2701 {
2702    regmaskstate_t shared_mask;
2703    BITSET_ZERO(shared_mask);
2704 
2705    if (b->mergedregs) {
2706       BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
2707    } else {
2708       BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
2709    }
2710 
2711    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2712       dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
2713 }
2714 
2715 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2716 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2717 {
2718    bool half = reg->flags & IR3_REG_HALF;
2719    if (reg->flags & IR3_REG_RELATIV) {
2720       for (unsigned i = 0; i < reg->size; i++)
2721          __regmask_set(regmask, half, reg->array.base + i);
2722    } else {
2723       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2724          if (mask & 1)
2725             __regmask_set(regmask, half, n);
2726    }
2727 }
2728 
2729 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)2730 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
2731 {
2732    bool half = reg->flags & IR3_REG_HALF;
2733    if (reg->flags & IR3_REG_RELATIV) {
2734       for (unsigned i = 0; i < reg->size; i++)
2735          __regmask_clear(regmask, half, reg->array.base + i);
2736    } else {
2737       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2738          if (mask & 1)
2739             __regmask_clear(regmask, half, n);
2740    }
2741 }
2742 
2743 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2744 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2745 {
2746    bool half = reg->flags & IR3_REG_HALF;
2747    if (reg->flags & IR3_REG_RELATIV) {
2748       for (unsigned i = 0; i < reg->size; i++)
2749          if (__regmask_get(regmask, half, reg->array.base + i))
2750             return true;
2751    } else {
2752       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2753          if (mask & 1)
2754             if (__regmask_get(regmask, half, n))
2755                return true;
2756    }
2757    return false;
2758 }
2759 /* ************************************************************************* */
2760 
2761 #endif /* IR3_H_ */
2762