• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #ifndef IR3_H_
25 #define IR3_H_
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 
30 #include "compiler/shader_enums.h"
31 
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36 
37 #include "instr-a3xx.h"
38 
39 /* low level intermediate representation of an adreno shader program */
40 
41 struct ir3_compiler;
42 struct ir3;
43 struct ir3_instruction;
44 struct ir3_block;
45 
/* Statistics/metadata collected about an assembled shader binary
 * (see ir3_collect_info()), e.g. for occupancy calculation and
 * shader-db style reporting.
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
89 
/* A group of registers that register allocation tries to place in one
 * contiguous physical-register region (see ir3_register::merge_set and
 * ::merge_set_offset, which point back here).
 */
struct ir3_merge_set {
   uint16_t preferred_reg; /* preferred physical base register */
   uint16_t size;          /* total size of the merged region */
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* The member registers of this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
101 
/* A single operand (src or dst) of an instruction.  How 'num' vs the
 * union below is interpreted depends on the flags (IMMED/CONST/RELATIV/
 * SSA/ARRAY/etc).
 */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.  In
       * addition, .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       *
       * Note: Has different meaning on other instructions like add.s/u
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,

      /* "Early-clobber" on a destination means that the destination is
       * (potentially) written before any sources are read and therefore
       * interferes with the sources of the instruction.
       */
      IR3_REG_EARLY_CLOBBER = 0x80000,
   } flags;

   /* NOTE(review): value name used before register assignment — confirm
    * exact numbering scheme against the RA pass.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* Spill slot and next-use distance — presumably maintained by the RA
    * spiller; TODO confirm.
    */
   unsigned spill_slot, next_use;

   /* RA coalescing state (see struct ir3_merge_set): */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
225 
/*
 * Stupid/simple growable array implementation:
 */

/* Declare the fields backing a growable array 'name' of 'type':
 * name##_count = elements in use, name##_sz = allocated capacity.
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Append a value to a DECLARE_ARRAY array, growing the backing store
 * geometrically (min 16, doubling) via reralloc_size under 'ctx'.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
241 
/* Reduction operator, stored in ir3_instruction::cat1.reduce_op.
 * Suffixes: _U = unsigned int, _S = signed int, _F = float, _B = bitwise.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
257 
/* A single IR instruction: opcode, modifier flags, src/dst registers,
 * per-category encoding fields, plus assorted per-pass metadata.
 */
struct ir3_instruction {
   struct ir3_block *block; /* containing block */
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
       */
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy.  So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_SAT = 0x400,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x800,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x1000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x02000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x04000,
      IR3_INSTR_UNUSED = 0x08000,
   } flags;
   uint8_t repeat; /* (rptN) repeat count */
   uint8_t nop;    /* nopN encoded with the instruction */
#ifdef DEBUG
   /* capacity of the srcs/dsts arrays, for overflow asserts: */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Per-category extra encoding fields (which member is valid depends
    * on opc_cat(opc)):
    */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
499 
/* Top-level container for one shader's IR: the list of blocks plus
 * convenience arrays of instructions that various passes need to find
 * quickly.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
553 
/* A register array, accessed via relative addressing (see
 * IR3_REG_ARRAY / IR3_REG_RELATIV on ir3_register).
 */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;
   unsigned id;

   struct nir_register *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
579 
580 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
581 
/* How a block with two successors decides which one to take
 * (see ir3_block::brtype / ::condition):
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
   IR3_BRANCH_SHPS,   /* preamble start */
};
589 
/* A basic block: a list of instructions plus CFG edges (per-thread and
 * "physical"), dominance info, and per-pass scratch data.
 */
struct ir3_block {
   struct list_head node;   /* entry in ir3::block_list */
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];
   struct ir3_block *physical_successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);

   uint16_t start_ip, end_ip;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Immediate dominator and dominance-tree children
    * (see ir3_calc_dominance()):
    */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   /* Pre/post-order indices in the dominance tree: */
   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno;
#endif
};
649 
/* Return a stable identifier for a block, for debug output.
 *
 * DEBUG builds use the block's serial number; otherwise fall back to a
 * value derived from the block's address.  Cast through uintptr_t
 * rather than unsigned long: on LLP64 targets (e.g. 64-bit Windows)
 * unsigned long is only 32 bits, so pointer->unsigned long conversion
 * is not value-preserving.  (The final truncation to 32 bits is
 * intentional either way — this is only an identifier.)
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   return (uint32_t)(uintptr_t)block;
#endif
}
659 
/* Return the shader's entry block (first block in block_list). */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
665 
/* Return the first block after the preamble, or the start block itself
 * when the shader has no preamble (start block isn't an SHPS branch).
 */
static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to skip
    * that to get to the block after the preamble.
    */
   if (block->brtype == IR3_BRANCH_SHPS)
      return block->successors[1]->successors[0];
   else
      return block;
}
678 
679 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
680 void ir3_block_add_physical_predecessor(struct ir3_block *block,
681                                         struct ir3_block *pred);
682 void ir3_block_remove_predecessor(struct ir3_block *block,
683                                   struct ir3_block *pred);
684 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
685                                            struct ir3_block *pred);
686 unsigned ir3_block_get_pred_index(struct ir3_block *block,
687                                   struct ir3_block *pred);
688 
689 void ir3_calc_dominance(struct ir3 *ir);
690 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
691 
692 struct ir3_shader_variant;
693 
694 struct ir3 *ir3_create(struct ir3_compiler *compiler,
695                        struct ir3_shader_variant *v);
696 void ir3_destroy(struct ir3 *shader);
697 
698 void ir3_collect_info(struct ir3_shader_variant *v);
699 void *ir3_alloc(struct ir3 *shader, int sz);
700 
701 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
702                                          unsigned reg_count,
703                                          bool double_threadsize);
704 
705 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
706                                            bool double_threadsize);
707 
708 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
709                                   unsigned regs_count);
710 
711 struct ir3_block *ir3_block_create(struct ir3 *shader);
712 
713 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
714                                          int ndst, int nsrc);
715 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
716 void ir3_instr_add_dep(struct ir3_instruction *instr,
717                        struct ir3_instruction *dep);
718 const char *ir3_instr_name(struct ir3_instruction *instr);
719 
720 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
721                                     int flags);
722 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
723                                     int flags);
724 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
725                                    struct ir3_register *reg);
726 
727 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)728 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
729 {
730    assert(!dst->tied && !src->tied);
731    dst->tied = src;
732    src->tied = dst;
733 }
734 
735 void ir3_reg_set_last_array(struct ir3_instruction *instr,
736                             struct ir3_register *reg,
737                             struct ir3_register *last_write);
738 
739 void ir3_instr_set_address(struct ir3_instruction *instr,
740                            struct ir3_instruction *addr);
741 
742 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)743 ir3_instr_check_mark(struct ir3_instruction *instr)
744 {
745    if (instr->flags & IR3_INSTR_MARK)
746       return true; /* already visited */
747    instr->flags |= IR3_INSTR_MARK;
748    return false;
749 }
750 
751 void ir3_block_clear_mark(struct ir3_block *block);
752 void ir3_clear_mark(struct ir3 *shader);
753 
754 unsigned ir3_count_instructions(struct ir3 *ir);
755 unsigned ir3_count_instructions_ra(struct ir3 *ir);
756 
/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   /* unlink from current position, then insert ahead of 'after': */
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}
767 
/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   /* unlink from current position, then insert following 'before': */
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}
778 
/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   /* unlink from current position, then insert at list head: */
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
789 
790 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
791 
792 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
793 void ir3_fixup_src_type(struct ir3_instruction *instr);
794 
795 int ir3_flut(struct ir3_register *src_reg);
796 
797 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
798 
799 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
800 
801 #include "util/set.h"
/* Iterate the instructions that use __instr's SSA value.  Only valid
 * after a pass has populated instr->uses via ir3_find_ssa_uses().  The
 * outer for-loop both scopes __use and skips iteration entirely when
 * the uses set is NULL.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
807 
808 static inline uint32_t
reg_num(const struct ir3_register * reg)809 reg_num(const struct ir3_register *reg)
810 {
811    return reg->num >> 2;
812 }
813 
814 static inline uint32_t
reg_comp(const struct ir3_register * reg)815 reg_comp(const struct ir3_register *reg)
816 {
817    return reg->num & 0x3;
818 }
819 
820 static inline bool
is_flow(struct ir3_instruction * instr)821 is_flow(struct ir3_instruction *instr)
822 {
823    return (opc_cat(instr->opc) == 0);
824 }
825 
826 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)827 is_kill_or_demote(struct ir3_instruction *instr)
828 {
829    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
830 }
831 
832 static inline bool
is_nop(struct ir3_instruction * instr)833 is_nop(struct ir3_instruction *instr)
834 {
835    return instr->opc == OPC_NOP;
836 }
837 
838 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)839 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
840 {
841    unsigned dst_type = (dst->flags & IR3_REG_HALF);
842    unsigned src_type = (src->flags & IR3_REG_HALF);
843 
844    /* Treat shared->normal copies as same-type, because they can generally be
845     * folded, but not normal->shared copies.
846     */
847    if (dst_type != src_type ||
848        ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
849       return false;
850    else
851       return true;
852 }
853 
/* Is it a non-transformative (ie. not type changing) mov?  This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      /* saturating absneg is transformative, so not a plain mov: */
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_META_PHI:
      /* a single-src phi is effectively a mov: */
      return instr->srcs_count == 1;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->num == regid(REG_P0, 0))
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   /* relative/array dsts carry extra dependencies, don't treat as
    * a plain mov:
    */
   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}
902 
/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* src/dst must agree on the base type (float/uint/sint), even if the
    * bit-size differs:
    */
   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}
922 
/* Is this one of the subgroup-op macros that gets lowered (by
 * ir3_lower_subgroups) into conditional-mov sequences?
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      return true;
   default:
      return false;
   }
}
940 
/* Is this an ALU instruction (categories 1-3)? */
static inline bool
is_alu(struct ir3_instruction *instr)
{
   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}
946 
/* Is this a special-function-unit instruction (category 4, plus
 * getfiberid which also executes on the SFU)?
 */
static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}
952 
/* Is this a texture-sampling instruction (category 5)? */
static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5);
}
958 
/* Is this a texture instruction or the meta tex-prefetch placeholder? */
static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}
964 
/* Is this a memory instruction (category 6, excluding getfiberid which
 * is handled as SFU — see is_sfu())?
 */
static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}
970 
/* Is this a barrier instruction (category 7)? */
static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7);
}
976 
/* Does the instruction write a half-precision (16b) destination? */
static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}
982 
/* Does the instruction write a shared register? */
static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}
988 
/* Is this a store instruction?  For these, the "destination" register is
 * actually a source, the address to store to.
 */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1010 
/* Is this a load instruction (global/local/const/etc)? */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}
1031 
/* Is this an instruction that reads a shader input (varying fetch)? */
static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}
1048 
/* Does this instruction produce a boolean (0/1) result (comparison ops)? */
static inline bool
is_bool(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPS_S:
   case OPC_CMPS_U:
      return true;
   default:
      return false;
   }
}
1061 
/* Map a full-precision cat3 opcode to its half-precision variant;
 * returns the opcode unchanged if there is no half variant.
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1080 
/* Inverse of cat3_half_opc(): map a half-precision cat3 opcode to its
 * full-precision variant, or return it unchanged.
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1099 
1100 static inline opc_t
cat4_half_opc(opc_t opc)1101 cat4_half_opc(opc_t opc)
1102 {
1103    switch (opc) {
1104    case OPC_RSQ:
1105       return OPC_HRSQ;
1106    case OPC_LOG2:
1107       return OPC_HLOG2;
1108    case OPC_EXP2:
1109       return OPC_HEXP2;
1110    default:
1111       return opc;
1112    }
1113 }
1114 
1115 static inline opc_t
cat4_full_opc(opc_t opc)1116 cat4_full_opc(opc_t opc)
1117 {
1118    switch (opc) {
1119    case OPC_HRSQ:
1120       return OPC_RSQ;
1121    case OPC_HLOG2:
1122       return OPC_LOG2;
1123    case OPC_HEXP2:
1124       return OPC_EXP2;
1125    default:
1126       return opc;
1127    }
1128 }
1129 
/* Is this a meta (pseudo) instruction, which exists only in the IR and
 * has no hardware encoding?  Meta opcodes use category -1.
 */
static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == -1);
}
1135 
1136 static inline unsigned
reg_elems(const struct ir3_register * reg)1137 reg_elems(const struct ir3_register *reg)
1138 {
1139    if (reg->flags & IR3_REG_ARRAY)
1140       return reg->size;
1141    else
1142       return util_last_bit(reg->wrmask);
1143 }
1144 
/* Size of one element of the register, in units of half-registers
 * (1 for half precision, 2 for full precision).
 */
static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}
1150 
/* Total register size, in units of half-registers (elems * elem size). */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
1156 
/* Number of components written by the (single) destination, or 0 if the
 * instruction has no destination.
 */
static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   /* only expected to be used on single-dst instructions: */
   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}
1166 
1167 /* is dst a normal temp register: */
1168 static inline bool
is_dest_gpr(struct ir3_register * dst)1169 is_dest_gpr(struct ir3_register *dst)
1170 {
1171    if (dst->wrmask == 0)
1172       return false;
1173    if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1174       return false;
1175    return true;
1176 }
1177 
1178 static inline bool
writes_gpr(struct ir3_instruction * instr)1179 writes_gpr(struct ir3_instruction *instr)
1180 {
1181    if (dest_regs(instr) == 0)
1182       return false;
1183    return is_dest_gpr(instr->dsts[0]);
1184 }
1185 
1186 static inline bool
writes_addr0(struct ir3_instruction * instr)1187 writes_addr0(struct ir3_instruction *instr)
1188 {
1189    /* Note: only the first dest can write to a0.x */
1190    if (instr->dsts_count > 0) {
1191       struct ir3_register *dst = instr->dsts[0];
1192       return dst->num == regid(REG_A0, 0);
1193    }
1194    return false;
1195 }
1196 
1197 static inline bool
writes_addr1(struct ir3_instruction * instr)1198 writes_addr1(struct ir3_instruction *instr)
1199 {
1200    /* Note: only the first dest can write to a1.x */
1201    if (instr->dsts_count > 0) {
1202       struct ir3_register *dst = instr->dsts[0];
1203       return dst->num == regid(REG_A0, 1);
1204    }
1205    return false;
1206 }
1207 
1208 static inline bool
writes_pred(struct ir3_instruction * instr)1209 writes_pred(struct ir3_instruction *instr)
1210 {
1211    /* Note: only the first dest can write to p0.x */
1212    if (instr->dsts_count > 0) {
1213       struct ir3_register *dst = instr->dsts[0];
1214       return reg_num(dst) == REG_P0;
1215    }
1216    return false;
1217 }
1218 
/* Is it something other than a normal register. Shared regs, p0, and a0/a1
 * are considered special here. Special registers are always accessed with one
 * size and never alias normal registers, even though a naive calculation
 * would sometimes make it seem like e.g. r30.z aliases a0.x.
 */
static inline bool
is_reg_special(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
          (reg_num(reg) == REG_P0);
}
1230 
/* Same as above but in cases where we don't have a register. r48.x and above
 * are shared/special.
 */
static inline bool
is_reg_num_special(unsigned num)
{
   /* 4 scalar components per register, so r48.x == 48 * 4: */
   return num >= 48 * 4;
}
1239 
/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *
ssa(struct ir3_register *reg)
{
   /* Only SSA/array regs carry a def link; others (immed/const/..) have
    * no defining instruction:
    */
   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
      return reg->def->instr;
   return NULL;
}
1249 
/* Do two registers conflict, ie. are both non-NULL but have different
 * defining values?
 */
static inline bool
conflicts(struct ir3_register *a, struct ir3_register *b)
{
   return (a && b) && (a->def != b->def);
}
1255 
1256 static inline bool
reg_gpr(struct ir3_register * r)1257 reg_gpr(struct ir3_register *r)
1258 {
1259    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1260       return false;
1261    if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1262       return false;
1263    return true;
1264 }
1265 
/* Return the 16b variant of a type; 16b and 8b types pass through
 * unchanged.  Asserts on unknown types.
 */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
   case TYPE_S8:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1288 
/* Return the 32b variant of a type (8b and 16b types widen); 32b types
 * pass through unchanged.  Asserts on unknown types.
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S8:
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1310 
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1355 
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept float abs/neg src modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic ops accept no src modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept bitwise-not src modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1416 
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

      /* fallthrough */
   case OPC_SEL_B16:
   case OPC_SEL_B32:

   /* fallthrough */
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   /* fallthrough: all of the above allow no src modifiers */
   default:
      return 0;
   }
}
1455 
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   /* float-result ops: */
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   /* unsigned-result ops (truncate/zero-extend on size change): */
   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   /* signed-result ops (truncate/sign-extend on size change): */
   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1523 
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* otherwise the src size is given by the first src register: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1563 
/* Dst side of the folded conversion: sized by the dst register's
 * half/full flag (see ir3_output_conv_src_type()).
 */
static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
{
   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                 : full_type(base_type);
}
1570 
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * Note: the caller is expected to initialize *can_swap to true; this
 * function only clears it when no swap exists.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1597 
/* Bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #: */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instructions's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1623 
/* Total number of SSA "sources": real srcs plus false dependencies. */
static inline unsigned
__ssa_src_cnt(struct ir3_instruction *instr)
{
   return instr->srcs_count + instr->deps_count;
}
1629 
1630 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1631 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1632 {
1633    if (n >= instr->srcs_count)
1634       return true;
1635    return false;
1636 }
1637 
/* Return a pointer to the n'th SSA source-instruction slot (real src or
 * false dep), or NULL if that src is not an SSA value.
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
1647 
/* iterator over pointers to an instruction's SSA sources (real srcs plus
 * false deps), also returns src #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)
1667 
/* iterators for shader inputs (skips NULL entries): */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
1700 
/* Run an ir3 pass; if it reports progress, dump the IR (when debugging is
 * enabled) and re-validate it.  Evaluates to whether progress was made.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
1710 
1711 /* validate: */
1712 void ir3_validate(struct ir3 *ir);
1713 
1714 /* dump: */
1715 void ir3_print(struct ir3 *ir);
1716 void ir3_print_instr(struct ir3_instruction *instr);
1717 
1718 struct log_stream;
1719 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
1720 
1721 /* delay calculation: */
1722 int ir3_delayslots(struct ir3_instruction *assigner,
1723                    struct ir3_instruction *consumer, unsigned n, bool soft);
1724 unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
1725                                     struct ir3_instruction *consumer,
1726                                     unsigned assigner_n, unsigned consumer_n);
1727 unsigned ir3_delay_calc(struct ir3_block *block,
1728                         struct ir3_instruction *instr, bool mergedregs);
1729 
1730 /* estimated (ss)/(sy) delay calculation */
1731 
/* Is this a load from local (per-workgroup/per-wave) memory? */
static inline bool
is_local_mem_load(struct ir3_instruction *instr)
{
   return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
      instr->opc == OPC_LDLW;
}
1738 
/* Does this instruction need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   /* writes to shared registers are also synchronized via (ss): */
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }
   return is_sfu(instr) || is_local_mem_load(instr);
}
1749 
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
    * using nop's instead of (ss) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share an
    * SFU unit). But 10 seems like a reasonable # to choose:
    */
   if (is_sfu(instr) || is_local_mem_load(instr))
      return 10;

   /* The blob adds 6 nops between shared producers and consumers, and before we
    * used (ss) this was sufficient in most cases.
    */
   return 6;
}
1772 
/* Does this instruction need (sy) to wait for its result (texture fetches,
 * non-local loads, and atomics)?
 */
static inline bool
is_sy_producer(struct ir3_instruction *instr)
{
   return is_tex_or_prefetch(instr) ||
      (is_load(instr) && !is_local_mem_load(instr)) ||
      is_atomic(instr->opc);
}
1780 
/* The soft delay for approximating the cost of (sy), scaled by the number
 * of result components and the expected wave size.
 *
 * NOTE(review): dereferences instr->dsts[0] unconditionally — assumes all
 * (sy) producers write a result; confirm against callers.
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
1834 
1835 
/* IR passes, run over a whole ir3 program.  NOTE(review): the
 * bool-returning passes appear to report whether they made progress,
 * and the int-returning ones a status code — confirm at call sites.
 */

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);
1871 
1872 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)1873 ir3_has_latency_to_hide(struct ir3 *ir)
1874 {
1875    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
1876     * know the nature of the fragment shader.  Just assume it will have
1877     * latency to hide:
1878     */
1879    if (ir->type != MESA_SHADER_FRAGMENT)
1880       return true;
1881 
1882    foreach_block (block, &ir->block_list) {
1883       foreach_instr (instr, &block->instr_list) {
1884          if (is_tex_or_prefetch(instr))
1885             return true;
1886 
1887          if (is_load(instr)) {
1888             switch (instr->opc) {
1889             case OPC_LDLV:
1890             case OPC_LDL:
1891             case OPC_LDLW:
1892                break;
1893             default:
1894                return true;
1895             }
1896          }
1897       }
1898    }
1899 
1900    return false;
1901 }
1902 
1903 /* ************************************************************************* */
1904 /* instruction helpers */
1905 
1906 /* creates SSA src of correct type (ie. half vs full precision) */
1907 static inline struct ir3_register *
__ssa_src(struct ir3_instruction * instr,struct ir3_instruction * src,unsigned flags)1908 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
1909           unsigned flags)
1910 {
1911    struct ir3_register *reg;
1912    if (src->dsts[0]->flags & IR3_REG_HALF)
1913       flags |= IR3_REG_HALF;
1914    reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
1915    reg->def = src->dsts[0];
1916    reg->wrmask = src->dsts[0]->wrmask;
1917    return reg;
1918 }
1919 
1920 static inline struct ir3_register *
__ssa_dst(struct ir3_instruction * instr)1921 __ssa_dst(struct ir3_instruction *instr)
1922 {
1923    struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
1924    reg->instr = instr;
1925    return reg;
1926 }
1927 
1928 static inline struct ir3_instruction *
create_immed_typed(struct ir3_block * block,uint32_t val,type_t type)1929 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1930 {
1931    struct ir3_instruction *mov;
1932    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1933 
1934    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1935    mov->cat1.src_type = type;
1936    mov->cat1.dst_type = type;
1937    __ssa_dst(mov)->flags |= flags;
1938    ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
1939 
1940    return mov;
1941 }
1942 
/* Convenience wrapper: 32-bit unsigned immediate mov. */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
1948 
1949 static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block * block,unsigned n,type_t type)1950 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
1951 {
1952    struct ir3_instruction *mov;
1953    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1954 
1955    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1956    mov->cat1.src_type = type;
1957    mov->cat1.dst_type = type;
1958    __ssa_dst(mov)->flags |= flags;
1959    ir3_src_create(mov, n, IR3_REG_CONST | flags);
1960 
1961    return mov;
1962 }
1963 
/* Convenience wrapper: 32-bit float uniform load. */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
1969 
/* Create a relative (a0-indexed) uniform load: const reg at base offset
 * `n`, indexed by the `address` instruction's result.
 */
static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block *block, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   /* Relative const src; the array offset is the base of the access: */
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}
1986 
1987 static inline struct ir3_instruction *
ir3_MOV(struct ir3_block * block,struct ir3_instruction * src,type_t type)1988 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
1989 {
1990    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
1991    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1992 
1993    __ssa_dst(instr)->flags |= flags;
1994    if (src->dsts[0]->flags & IR3_REG_ARRAY) {
1995       struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
1996       src_reg->array = src->dsts[0]->array;
1997    } else {
1998       __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
1999    }
2000    assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
2001    instr->cat1.src_type = type;
2002    instr->cat1.dst_type = type;
2003    return instr;
2004 }
2005 
2006 static inline struct ir3_instruction *
ir3_COV(struct ir3_block * block,struct ir3_instruction * src,type_t src_type,type_t dst_type)2007 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
2008         type_t dst_type)
2009 {
2010    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2011    unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
2012    unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
2013 
2014    assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
2015 
2016    __ssa_dst(instr)->flags |= dst_flags;
2017    __ssa_src(instr, src, 0);
2018    instr->cat1.src_type = src_type;
2019    instr->cat1.dst_type = dst_type;
2020    assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
2021    return instr;
2022 }
2023 
2024 static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block * block,unsigned components)2025 ir3_MOVMSK(struct ir3_block *block, unsigned components)
2026 {
2027    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
2028 
2029    struct ir3_register *dst = __ssa_dst(instr);
2030    dst->flags |= IR3_REG_SHARED;
2031    dst->wrmask = (1 << components) - 1;
2032    instr->repeat = components - 1;
2033    return instr;
2034 }
2035 
2036 static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block * block,struct ir3_instruction * src,unsigned components)2037 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
2038                  unsigned components)
2039 {
2040    struct ir3_instruction *instr =
2041       ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
2042 
2043    struct ir3_register *dst = __ssa_dst(instr);
2044    dst->flags |= IR3_REG_SHARED;
2045    dst->wrmask = (1 << components) - 1;
2046 
2047    __ssa_src(instr, src, 0);
2048 
2049    return instr;
2050 }
2051 
/* Create a nop (no dsts, no srcs). */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2057 
/* Builder-macro for zero-src instructions.  `flag` is OR'd into
 * instr->flags.  Note one dst slot is allocated but no SSA dst is
 * created here (these are branch/end style instructions).
 */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
/* INSTR0F: flavored variant (appends the flag suffix to the name). */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0(0, name, OPC_##name)
2069 
/* Builder-macro for one-src instructions, with `dst_count` SSA dsts
 * (0 for the NODST variant).
 */
/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name)      __INSTR1(0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
2087 
/* Builder-macro for two-src instructions; see __INSTR1. */
/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name)       __INSTR2(0, 1, name, OPC_##name)
#define INSTR2NODST(name)  __INSTR2(0, 0, name, OPC_##name)
2106 
/* Builder-macro for three-src instructions; see __INSTR1. */
/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name)      __INSTR3(0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
2128 
/* Builder-macro for four-src instructions; see __INSTR1. */
/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4(0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
2151 
/* Builder-macro for five-src instructions; always exactly one dst (no
 * NODST variant exists for this arity).
 */
/* clang-format off */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5(0, name, OPC_##name)
2173 
/* Builder-macro for six-src instructions, with `dst_count` SSA dsts.
 * Fix: pass `dst_count` (not a hard-coded 1) to ir3_instr_create(), so
 * the dst allocation matches the number of __ssa_dst() calls below —
 * consistent with __INSTR1..4.  Previously INSTR6NODST expansions
 * (e.g. STG_A) allocated a dst slot that was never populated.
 */
/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6(0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
2199 
/* cat0 instructions (branches / flow control): */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros (lowered later, e.g. by the subgroup lowering pass): */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2220 
2221 static inline struct ir3_instruction *
2222 ir3_ELECT_MACRO(struct ir3_block *block)
2223 {
2224    struct ir3_instruction *instr =
2225       ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2226    __ssa_dst(instr);
2227    return instr;
2228 }
2229 
2230 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2231 ir3_SHPS_MACRO(struct ir3_block *block)
2232 {
2233    struct ir3_instruction *instr =
2234       ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2235    __ssa_dst(instr);
2236    return instr;
2237 }
2238 
/* cat2 instructions (ALU), most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)
2287 
/* cat3 instructions (three-src ALU: mad/sel/sad): */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)
2308 
/* cat4 instructions (transcendental/SFU ops): */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)
2320 
/* cat5 instructions (derivatives; sample ops are built via ir3_SAM): */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2329 
2330 static inline struct ir3_instruction *
2331 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2332         unsigned flags, struct ir3_instruction *samp_tex,
2333         struct ir3_instruction *src0, struct ir3_instruction *src1)
2334 {
2335    struct ir3_instruction *sam;
2336    unsigned nreg = 0;
2337 
2338    if (flags & IR3_INSTR_S2EN) {
2339       nreg++;
2340    }
2341    if (src0) {
2342       nreg++;
2343    }
2344    if (src1) {
2345       nreg++;
2346    }
2347 
2348    sam = ir3_instr_create(block, opc, 1, nreg);
2349    sam->flags |= flags;
2350    __ssa_dst(sam)->wrmask = wrmask;
2351    if (flags & IR3_INSTR_S2EN) {
2352       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2353    }
2354    if (src0) {
2355       __ssa_src(sam, src0, 0);
2356    }
2357    if (src1) {
2358       __ssa_src(sam, src1, 0);
2359    }
2360    sam->cat5.type = type;
2361 
2362    return sam;
2363 }
2364 
/* cat6 instructions (memory / atomics): */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
/* a6xx+ variants: */
#if GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
/* a4xx/a5xx variants: */
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
2442 
/* cat7 instructions (barriers/fences): */
INSTR0(BAR)
INSTR0(FENCE)
2446 
/* ************************************************************************* */
#include "bitset.h"

#define MAX_REG 256

/* 2 * MAX_REG bits: with merged regs (a6xx+) each full register occupies
 * two half-reg slots; without merged regs, half regs are tracked in the
 * second MAX_REG bits (see __regmask_get/set/clear below).
 */
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);

typedef struct {
   /* a6xx+ merged half/full register file? */
   bool mergedregs;
   regmaskstate_t mask;
} regmask_t;
2458 
2459 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2460 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2461 {
2462    if (regmask->mergedregs) {
2463       /* a6xx+ case, with merged register file, we track things in terms
2464        * of half-precision registers, with a full precisions register
2465        * using two half-precision slots.
2466        *
2467        * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2468        * avoid having them alias normal full regs.
2469        */
2470       if (half && !is_reg_num_special(n)) {
2471          return BITSET_TEST(regmask->mask, n);
2472       } else {
2473          n *= 2;
2474          return BITSET_TEST(regmask->mask, n) ||
2475                 BITSET_TEST(regmask->mask, n + 1);
2476       }
2477    } else {
2478       /* pre a6xx case, with separate register file for half and full
2479        * precision:
2480        */
2481       if (half)
2482          n += MAX_REG;
2483       return BITSET_TEST(regmask->mask, n);
2484    }
2485 }
2486 
2487 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2488 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2489 {
2490    if (regmask->mergedregs) {
2491       /* a6xx+ case, with merged register file, we track things in terms
2492        * of half-precision registers, with a full precisions register
2493        * using two half-precision slots:
2494        */
2495       if (half && !is_reg_num_special(n)) {
2496          BITSET_SET(regmask->mask, n);
2497       } else {
2498          n *= 2;
2499          BITSET_SET(regmask->mask, n);
2500          BITSET_SET(regmask->mask, n + 1);
2501       }
2502    } else {
2503       /* pre a6xx case, with separate register file for half and full
2504        * precision:
2505        */
2506       if (half)
2507          n += MAX_REG;
2508       BITSET_SET(regmask->mask, n);
2509    }
2510 }
2511 
2512 static inline void
__regmask_clear(regmask_t * regmask,bool half,unsigned n)2513 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
2514 {
2515    if (regmask->mergedregs) {
2516       /* a6xx+ case, with merged register file, we track things in terms
2517        * of half-precision registers, with a full precisions register
2518        * using two half-precision slots:
2519        */
2520       if (half && !is_reg_num_special(n)) {
2521          BITSET_CLEAR(regmask->mask, n);
2522       } else {
2523          n *= 2;
2524          BITSET_CLEAR(regmask->mask, n);
2525          BITSET_CLEAR(regmask->mask, n + 1);
2526       }
2527    } else {
2528       /* pre a6xx case, with separate register file for half and full
2529        * precision:
2530        */
2531       if (half)
2532          n += MAX_REG;
2533       BITSET_CLEAR(regmask->mask, n);
2534    }
2535 }
2536 
2537 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2538 regmask_init(regmask_t *regmask, bool mergedregs)
2539 {
2540    memset(&regmask->mask, 0, sizeof(regmask->mask));
2541    regmask->mergedregs = mergedregs;
2542 }
2543 
2544 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2545 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2546 {
2547    assert(dst->mergedregs == a->mergedregs);
2548    assert(dst->mergedregs == b->mergedregs);
2549 
2550    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2551       dst->mask[i] = a->mask[i] | b->mask[i];
2552 }
2553 
/* dst = a | (shared-register bits of b). */
static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   regmaskstate_t shared_mask;
   BITSET_ZERO(shared_mask);

   /* Build a mask covering only the shared registers.  NOTE(review): the
    * constants assume shared regs span r48.x..r55.w (4 components per
    * reg, doubled when b tracks merged half-reg slots) — confirm against
    * the register file layout.
    */
   if (b->mergedregs) {
      BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
   } else {
      BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
}
2569 
2570 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2571 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2572 {
2573    bool half = reg->flags & IR3_REG_HALF;
2574    if (reg->flags & IR3_REG_RELATIV) {
2575       for (unsigned i = 0; i < reg->size; i++)
2576          __regmask_set(regmask, half, reg->array.base + i);
2577    } else {
2578       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2579          if (mask & 1)
2580             __regmask_set(regmask, half, n);
2581    }
2582 }
2583 
2584 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)2585 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
2586 {
2587    bool half = reg->flags & IR3_REG_HALF;
2588    if (reg->flags & IR3_REG_RELATIV) {
2589       for (unsigned i = 0; i < reg->size; i++)
2590          __regmask_clear(regmask, half, reg->array.base + i);
2591    } else {
2592       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2593          if (mask & 1)
2594             __regmask_clear(regmask, half, n);
2595    }
2596 }
2597 
2598 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2599 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2600 {
2601    bool half = reg->flags & IR3_REG_HALF;
2602    if (reg->flags & IR3_REG_RELATIV) {
2603       for (unsigned i = 0; i < reg->size; i++)
2604          if (__regmask_get(regmask, half, reg->array.base + i))
2605             return true;
2606    } else {
2607       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2608          if (mask & 1)
2609             if (__regmask_get(regmask, half, n))
2610                return true;
2611    }
2612    return false;
2613 }
2614 /* ************************************************************************* */
2615 
2616 #endif /* IR3_H_ */
2617