/*
 * Copyright © 2013 Rob Clark <robdclark@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdbool.h>
#include <stdint.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "freedreno_common.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;

struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t preamble_instrs_count;
   uint16_t nops_count; /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once on one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};

struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;
   struct ir3_register **regs;
};

typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer. The CP pass needs to know what is intended (int or
    * float) in order to do the right thing. For this reason the
    * abs/neg flags are split out into float and int variants. In
    * addition, for .b (bitwise) operations the negate is actually a
    * bitwise not, so that is split out into its own flag to make
    * things clearer.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input? Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),
} ir3_register_flags;

struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32 bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
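   /* For example (illustrative): r2.z is encoded as (2 << 2) | 2 == 10;
    * reg_num() and reg_comp() below recover the N and component parts.
    */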
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain a pointer back to the
    * instruction containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain a pointer back to the
    * assigning instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};

/*
 * Stupid/simple growable array implementation:
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
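
/* Example usage (illustrative, 'struct foo' is hypothetical): embed the
 * array in a ralloc'd context struct and append with array_insert(), which
 * doubles the backing store as needed:
 *
 *    struct foo {
 *       DECLARE_ARRAY(struct ir3_instruction *, instrs);
 *    };
 *    ...
 *    array_insert(foo, foo->instrs, instr);
 */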

typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;

typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;

typedef enum {
   SHFL_XOR = 1,
   SHFL_UP = 2,
   SHFL_DOWN = 3,
   SHFL_RUP = 6,
   SHFL_RDOWN = 7,
} ir3_shfl_mode;

typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    * rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy. So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload
    * instructions to inform validation that this is created by RA. This
    * also may be set on an instruction where a spill has been folded into
    * it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register
    * to a shared register that may be read by other invocations. This
    * factors into (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one
    * of their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;

struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat;
   uint8_t nop;
#if MESA_DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;    /* for ldgb/stgb, # of components */
         unsigned d : 3; /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
         ir3_shfl_mode shfl_mode : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type. Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */
   /* an instruction can reference at most one address register amongst
    * its src/dst registers. Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions. Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *
    *                             shared  image  atomic  SSBO  everything
    *   barrier()/                  -      R/W    R/W     R/W      X
    *     groupMemoryBarrier()
    *   memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic()                    R/W
    *   memoryBarrierBuffer()                            R/W
    *   memoryBarrierImage()               R/W
    *   memoryBarrierShared()      R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions
    * are emitted as multiple scalar instructions that are linked together
    * using this field. After RA, the ir3_combine_rpt pass iterates these
    * groups and, if the register assignment allows it, merges them into a
    * (rptN) instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The
    * list head is stored in the first instruction of the repeat group, so
    * it also refers to a list entry. In order to distinguish the list's
    * first entry, we use serialno: instructions in a repeat group are
    * always emitted consecutively so the first will have the lowest
    * serialno.
    *
    * As this is not a typical list, we have to be careful when using the
    * existing list helpers. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in
    * its group.
    */
   struct list_head rpt_node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};

/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   struct ir3_instruction *rpts[4];
};

struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions. The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write). To avoid a
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all of an instruction's
    * dependencies other than the address register are scheduled
    * before the one that writes the address register. Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};

struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write. Any array access depends on the most
    * recent write. This way, nothing depends on writes after the
    * last read. But all the writes that happen before that have
    * something depending on them.
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow. A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors"
    * contains the per-thread successors, and "physical_successors" contains
    * the physical successors which includes the fallthrough edge from the
    * if to the else.
    */
   struct ir3_block *successors[2];

   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but otherwise
    * must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data. Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno;
#endif
};

enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};

struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      struct ir3_block *block;
      struct ir3_instruction *instr;
   };
};

struct ir3_builder {
   struct ir3_cursor cursor;
};

static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   return (uint32_t)(unsigned long)block;
#endif
}

static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}

static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}

struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);

struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);

static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to
    * skip that to get to the block after the preamble.
    */
   struct ir3_instruction *terminator = ir3_block_get_terminator(block);
   if (terminator && (terminator->opc == OPC_SHPS))
      return block->successors[1]->successors[0];
   else
      return block;
}

void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

void ir3_calc_dominance(struct ir3 *ir);
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);

struct ir3_shader_variant;

struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

void ir3_collect_info(struct ir3_shader_variant *v);
void *ir3_alloc(struct ir3 *shader, int sz);

unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
                                        int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
                                            int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
                                                opc_t opc, int ndst, int nsrc);
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);
void ir3_instr_remove(struct ir3_instruction *instr);

void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);

struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);

static inline void
ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
{
   assert(!dst->tied && !src->tied);
   dst->tied = src;
   src->tied = dst;
}
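
/* Example (sketch): tying an instruction's dst to its first source so that
 * RA assigns both to the same physical register, per the 'tied' field
 * contract documented in struct ir3_register:
 *
 *    ir3_reg_tie(instr->dsts[0], instr->srcs[0]);
 */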

void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);

static inline bool
ir3_instr_check_mark(struct ir3_instruction *instr)
{
   if (instr->flags & IR3_INSTR_MARK)
      return true; /* already visited */
   instr->flags |= IR3_INSTR_MARK;
   return false;
}
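
/* Typical visited-flag traversal (sketch; 'visit' is a hypothetical
 * callback). Clear the marks up front, then each instruction is visited
 * exactly once:
 *
 *    ir3_clear_mark(ir);
 *    foreach_instr (instr, &block->instr_list) {
 *       if (!ir3_instr_check_mark(instr))
 *          visit(instr);
 *    }
 */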

void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_sched(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);

/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}

/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}

/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}

typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);

void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);

void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

int ir3_flut(struct ir3_register *src_reg);

bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);

/**
 * Given an instruction whose result we want to test for nonzero, return a
 * potentially different instruction for which the result would be the same.
 * This might be one of its sources if instr doesn't change the nonzero-ness.
 */
struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);

bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);

#include "util/set.h"
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
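
/* Sketch of walking SSA uses; assumes ir3_find_ssa_uses() has already run
 * with the same mem_ctx so that instr->uses is populated:
 *
 *    ir3_find_ssa_uses(ir, mem_ctx, false);
 *    foreach_ssa_use (use, instr) {
 *       // 'use' is an instruction reading one of instr's defs
 *    }
 */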

static inline uint32_t
reg_num(const struct ir3_register *reg)
{
   return reg->num >> 2;
}

static inline uint32_t
reg_comp(const struct ir3_register *reg)
{
   return reg->num & 0x3;
}

static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}

static inline bool
is_terminator(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BR:
   case OPC_JUMP:
   case OPC_BANY:
   case OPC_BALL:
   case OPC_BRAA:
   case OPC_BRAO:
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
   case OPC_PREDT:
   case OPC_PREDF:
      return true;
   default:
      return false;
   }
}

static inline bool
is_kill_or_demote(struct ir3_instruction *instr)
{
   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
}

static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}

static inline bool
is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
{
   unsigned dst_type = (dst->flags & IR3_REG_HALF);
   unsigned src_type = (src->flags & IR3_REG_HALF);

   /* Treat shared->normal copies and normal->shared copies as same-type. */
   return dst_type == src_type;
}

/* Is it a non-transformative (ie. not type changing) mov? This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->flags & IR3_REG_PREDICATE)
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}

/* A move from const, which changes size but not type, can also be
 * folded into the dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* Allow a narrowing move, but not a widening one. A narrowing
    * move from full c1.x can be folded into a hc1.x use in an ALU
    * instruction because it is doing the same thing as constant-
    * demotion. If CONSTANT_DEMOTION_ENABLE wasn't set, we'd need to
    * return false in all cases.
    */
   if ((type_size(dst_type) > type_size(src_type)) ||
       (type_size(dst_type) == 8))
      return false;

   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}

static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_GETLAST_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      return true;
   default:
      return false;
   }
}

static inline bool
is_alu(struct ir3_instruction *instr)
{
   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}

static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}

static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
}

static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}

static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}

static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}

static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7);
}

static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}

static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}

static inline bool
is_store(struct ir3_instruction *instr)
{
   /* for these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}

static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   case OPC_LDC:
      return instr->dsts_count > 0;
   default:
      return false;
   }
}

static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}

/* Whether non-helper invocations can read the value of helper invocations.
 * We cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch
    * instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present,
    * but will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the
    * getone in the preamble because it doesn't actually matter which fiber
    * is selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   default:
      return false;
   }
}

static inline bool
is_bool(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPS_S:
   case OPC_CMPS_U:
      return true;
   default:
      return false;
   }
}

static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}

static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}

static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}

static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}

static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == OPC_META);
}

static inline unsigned
reg_elems(const struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return reg->size;
   else
      return util_last_bit(reg->wrmask);
}

static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}

static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
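
/* For example, a full-precision vec4 value (wrmask 0xf, !IR3_REG_HALF) has
 * reg_elems() == 4 and reg_size() == 8: sizes are counted in units of
 * half-regs, so each full component counts as 2.
 */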

/* Post-RA, we don't have arrays any more, so we have to be a bit careful
 * here and have to handle relative accesses specially.
 */

static inline unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->size;
   return reg_elems(reg);
}

static inline unsigned
post_ra_reg_num(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->array.base;
   return reg->num;
}

static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}

static inline bool
is_reg_gpr(const struct ir3_register *reg)
{
   if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE))
      return false;
   if (reg_num(reg) == REG_A0)
      return false;
   if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
       reg->num == INVALID_REG)
      return false;
   return true;
}

static inline bool
is_reg_a0(const struct ir3_register *reg)
{
   if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   return reg->num == regid(REG_A0, 0);
}

/* is dst a normal temp register: */
static inline bool
is_dest_gpr(const struct ir3_register *dst)
{
   if (dst->wrmask == 0)
      return false;
   return is_reg_gpr(dst);
}

static inline bool
writes_gpr(struct ir3_instruction *instr)
{
   if (dest_regs(instr) == 0)
      return false;
   return is_dest_gpr(instr->dsts[0]);
}

static inline bool
writes_addr0(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 0);
   }
   return false;
}

static inline bool
writes_addr1(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a1.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 1);
   }
   return false;
}

static inline bool
writes_pred(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to p0 */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return !!(dst->flags & IR3_REG_PREDICATE);
   }
   return false;
}

/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that
 * aren't assigned by RA.
 */
#define GPR_REG_SIZE     (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE  (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE  (4 * 8)

enum ir3_reg_file {
   IR3_FILE_FULL,
   IR3_FILE_HALF,
   IR3_FILE_SHARED,
   IR3_FILE_NONGPR,
};

/* Return a file + offset that can be used for determining if two registers
 * alias. The register is only really used for its flags, the num is taken
 * from the parameter. Registers overlap if they are in the same file and
 * have an overlapping offset. The offset is multiplied by 2 for full
 * registers to handle aliasing half and full registers, that is it's in
 * units of half-regs.
 */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      *file = IR3_FILE_HALF;
      return num;
   }
}
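
/* For example, full r1.y (num == (1 << 2) | 1 == 5) maps to IR3_FILE_FULL
 * at offset 10, while half hr1.y without mergedregs maps to IR3_FILE_HALF
 * at offset 5.
 */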

/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *
ssa(struct ir3_register *reg)
{
   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
      return reg->def->instr;
   return NULL;
}

static inline bool
conflicts(struct ir3_register *a, struct ir3_register *b)
{
   return (a && b) && (a->def != b->def);
}

static inline bool
reg_is_addr1(struct ir3_register *r)
{
   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   return r->num == regid(REG_A0, 1);
}

static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
   case TYPE_U8_32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U8_32:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}

/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}

/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc, unsigned src_n)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_SAD_S16:
   case OPC_SAD_S32:
      return src_n == 1 ? IR3_REG_SNEG : 0;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   default:
      return 0;
   }
}

/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it
 * does a floating-point conversion, if U32 it does a truncation/zero-
 * extension, if S32 it does a truncation/sign-extension. "can_fold" will be
 * false if it doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them
    * as unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}

/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes
 * sense to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}

static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
{
   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                 : full_type(base_type);
}

/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but
       * that's the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}

#define MASK(n) ((1 << (n)) - 1)

1886 /* iterator for an instructions's sources (reg), also returns src #: */
1887 #define foreach_src_n(__srcreg, __n, __instr) \
1888 if ((__instr)->srcs_count) \
1889 for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
1890 __srcreg = NULL) \
1891 for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt; \
1892 __n++) \
1893 if ((__srcreg = (__instr)->srcs[__n]))
1894
1895 /* iterator for an instructions's sources (reg): */
1896 #define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
1897
1898 #define foreach_src_if(__srcreg, __instr, __filter) \
1899 foreach_src (__srcreg, __instr) \
1900 if (__filter(__srcreg))
1901
1902 /* iterator for an instructions's destinations (reg), also returns dst #: */
1903 #define foreach_dst_n(__dstreg, __n, __instr) \
1904 if ((__instr)->dsts_count) \
1905 for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
1906 __dstreg = NULL) \
1907 for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt; \
1908 __n++) \
1909 if ((__dstreg = (__instr)->dsts[__n]))
1910
1911 /* iterator for an instructions's destinations (reg): */
1912 #define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1913
1914 #define foreach_dst_if(__dstreg, __instr, __filter) \
1915 foreach_dst (__dstreg, __instr) \
1916 if (__filter(__dstreg))
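
/* For example, counting an instruction's half-precision sources with the
 * iterators above (illustrative sketch only):
 *
 *    unsigned nhalf = 0;
 *    foreach_src_n (src, n, instr) {
 *       if (src->flags & IR3_REG_HALF)
 *          nhalf++;
 *    }
 */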

static inline unsigned
__ssa_src_cnt(struct ir3_instruction *instr)
{
   return instr->srcs_count + instr->deps_count;
}

static inline bool
__is_false_dep(struct ir3_instruction *instr, unsigned n)
{
   if (n >= instr->srcs_count)
      return true;
   return false;
}

static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}

#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* Iterate over all instructions in a repeat group. */
#define foreach_instr_rpt(__rpt, __instr)                                      \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      for (struct ir3_instruction *__rpt = __instr, *__first = __instr;        \
           __first || __rpt != __instr;                                        \
           __first = NULL,                                                     \
          __rpt = list_entry(__rpt->rpt_node.next, struct ir3_instruction,     \
                             rpt_node))

/* Iterate over all instructions except the first one in a repeat group. */
#define foreach_instr_rpt_excl(__rpt, __instr)                                 \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node,  \
                           rpt_node)

#define foreach_instr_rpt_excl_safe(__rpt, __instr)                            \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry_safe (struct ir3_instruction, __rpt,                 \
                                &__instr->rpt_node, rpt_node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)

#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
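
/* IR3_PASS() is meant to wrap the optimization passes declared below, e.g.
 * (hypothetical ordering, run to a fixed point):
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= IR3_PASS(ir, ir3_cp, so);
 *       progress |= IR3_PASS(ir, ir3_cse);
 *       progress |= IR3_PASS(ir, ir3_dce, so);
 *    } while (progress);
 */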

/* validate: */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream,
                            struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_compiler *compiler,
                   struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
                                    struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);

/* estimated (ss)/(sy) delay calculation */

static inline bool
is_local_mem_load(struct ir3_instruction *instr)
{
   return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
          instr->opc == OPC_LDLW;
}

bool is_scalar_alu(struct ir3_instruction *instr,
                   const struct ir3_compiler *compiler);

/* Does this instruction sometimes need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }

   if (instr->block->in_early_preamble && writes_addr1(instr))
      return true;

   return is_sfu(instr) || is_local_mem_load(instr) || instr->opc == OPC_SHFL;
}

static inline bool
needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
         struct ir3_instruction *consumer)
{
   if (is_scalar_alu(producer, compiler) &&
       is_scalar_alu(consumer, compiler) &&
       (producer->dsts[0]->flags & IR3_REG_HALF) ==
          (consumer->srcs[0]->flags & IR3_REG_HALF))
      return false;

   return is_ss_producer(producer);
}

/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, the number of delay slots needed to get an SFU result back
    * (ie. using nops instead of (ss)) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share
    * an SFU unit). But 10 seems like a reasonable # to choose:
    */
   if (is_sfu(instr) || is_local_mem_load(instr))
      return 10;

   /* The blob adds 6 nops between shared producers and consumers, and before
    * we used (ss) this was sufficient in most cases.
    */
   return 6;
}

static inline bool
is_sy_producer(struct ir3_instruction *instr)
{
   return is_tex_or_prefetch(instr) ||
          (is_load(instr) && !is_local_mem_load(instr)) ||
          is_atomic(instr->opc);
}

static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize = shader->type == MESA_SHADER_FRAGMENT ||
                          shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots needed to
    * get cat5/cat6 results back using nops instead of (sy). Note that these
    * numbers are with the result preloaded to cache by loading it earlier in
    * the same shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
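
/* For example, under the estimates above an ldc of 4 components gets
 * (21 + 8 * 4) / 2 = 26 cycles at double wavesize vs. 18 + 4 * 4 = 34
 * cycles at single wavesize.
 */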

/* Some instructions don't immediately consume their sources, so they may
 * introduce a WAR hazard.
 */
static inline bool
is_war_hazard_producer(struct ir3_instruction *instr)
{
   return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
          instr->opc == OPC_STC;
}

bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);

/* lower invalid shared phis after calculating reconvergence information: */
bool ir3_lower_shared_phis(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* shared mov folding */
bool ir3_shared_fold(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);
void ir3_ra_predicates(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);

static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
   /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
    * know the nature of the fragment shader. Just assume it will have
    * latency to hide:
    */
   if (ir->type != MESA_SHADER_FRAGMENT)
      return true;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_tex_or_prefetch(instr))
            return true;

         if (is_load(instr)) {
            switch (instr->opc) {
            case OPC_LDLV:
            case OPC_LDL:
            case OPC_LDLW:
               break;
            default:
               return true;
            }
         }
      }
   }

   return false;
}

/**
 * Move 'instr' to after the last phi node at the beginning of the block:
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *last_phi = ir3_block_get_last_phi(block);
   if (last_phi)
      ir3_instr_move_after(instr, last_phi);
   else
      ir3_instr_move_before_block(instr, block);
}

static inline struct ir3_cursor
ir3_before_block(struct ir3_block *block)
{
   assert(block);
   struct ir3_cursor cursor;
   cursor.option = IR3_CURSOR_BEFORE_BLOCK;
   cursor.block = block;
   return cursor;
}

static inline struct ir3_cursor
ir3_after_block(struct ir3_block *block)
{
   assert(block);
   struct ir3_cursor cursor;
   cursor.option = IR3_CURSOR_AFTER_BLOCK;
   cursor.block = block;
   return cursor;
}

static inline struct ir3_cursor
ir3_before_instr(struct ir3_instruction *instr)
{
   assert(instr);
   struct ir3_cursor cursor;
   cursor.option = IR3_CURSOR_BEFORE_INSTR;
   cursor.instr = instr;
   return cursor;
}

static inline struct ir3_cursor
ir3_after_instr(struct ir3_instruction *instr)
{
   assert(instr);
   struct ir3_cursor cursor;
   cursor.option = IR3_CURSOR_AFTER_INSTR;
   cursor.instr = instr;
   return cursor;
}

static inline struct ir3_cursor
ir3_before_terminator(struct ir3_block *block)
{
   assert(block);
   struct ir3_instruction *terminator = ir3_block_get_terminator(block);

   if (terminator)
      return ir3_before_instr(terminator);
   return ir3_after_block(block);
}

static inline struct ir3_cursor
ir3_after_phis(struct ir3_block *block)
{
   assert(block);

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI)
         return ir3_before_instr(instr);
   }

   return ir3_after_block(block);
}

static inline struct ir3_cursor
ir3_after_instr_and_phis(struct ir3_instruction *instr)
{
   if (instr->opc == OPC_META_PHI) {
      return ir3_after_phis(instr->block);
   } else {
      return ir3_after_instr(instr);
   }
}

static inline struct ir3_builder
ir3_builder_at(struct ir3_cursor cursor)
{
   struct ir3_builder builder;
   builder.cursor = cursor;
   return builder;
}
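
/* A cursor plus builder is the usual way to insert new instructions, e.g.
 * materializing an immediate right before a block's terminator (sketch,
 * using the helpers defined below):
 *
 *    struct ir3_builder b = ir3_builder_at(ir3_before_terminator(block));
 *    struct ir3_instruction *zero = create_immed(&b, 0);
 */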

/* ************************************************************************* */
/* instruction helpers */

/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}

static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}

static BITMASK_ENUM(ir3_register_flags)
type_flags(type_t type)
{
   if (type_size(type) < 32)
      return IR3_REG_HALF;
   return (ir3_register_flags)0;
}

static inline struct ir3_instruction *
create_immed_typed_shared(struct ir3_builder *build, uint32_t val, type_t type,
                          bool shared)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}

static inline struct ir3_instruction *
create_immed_typed(struct ir3_builder *build, uint32_t val, type_t type)
{
   return create_immed_typed_shared(build, val, type, false);
}

static inline struct ir3_instruction *
create_immed_shared(struct ir3_builder *build, uint32_t val, bool shared)
{
   return create_immed_typed_shared(build, val, TYPE_U32, shared);
}

static inline struct ir3_instruction *
create_immed(struct ir3_builder *build, uint32_t val)
{
   return create_immed_shared(build, val, false);
}
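
/* e.g. a half-float 1.0 immediate would be (sketch; _mesa_float_to_half()
 * from util/half_float.h is assumed here):
 *
 *    struct ir3_instruction *one_h =
 *       create_immed_typed(&b, _mesa_float_to_half(1.0f), TYPE_F16);
 */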

static inline struct ir3_instruction *
create_uniform_typed(struct ir3_builder *build, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

static inline struct ir3_instruction *
create_uniform(struct ir3_builder *build, unsigned n)
{
   return create_uniform_typed(build, n, TYPE_F32);
}

static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_builder *build, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_build_instr(build, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}

static inline struct ir3_instruction *
ir3_MOV(struct ir3_builder *build, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   ir3_register_flags flags =
      type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, 0);
   }
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}

static inline struct ir3_instruction_rpt
ir3_MOV_rpt(struct ir3_builder *build, unsigned nrpt,
            struct ir3_instruction_rpt src, type_t type)
{
   struct ir3_instruction_rpt dst;
   assert(nrpt <= ARRAY_SIZE(dst.rpts));

   for (unsigned rpt = 0; rpt < nrpt; ++rpt)
      dst.rpts[rpt] = ir3_MOV(build, src.rpts[rpt], type);

   ir3_instr_create_rpt(dst.rpts, nrpt);
   return dst;
}

static inline struct ir3_instruction *
ir3_COV(struct ir3_builder *build, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
   ir3_register_flags dst_flags =
      type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
   ASSERTED ir3_register_flags src_flags = type_flags(src_type);

   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}

static inline struct ir3_instruction_rpt
ir3_COV_rpt(struct ir3_builder *build, unsigned nrpt,
            struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
{
   struct ir3_instruction_rpt dst;

   for (unsigned rpt = 0; rpt < nrpt; ++rpt)
      dst.rpts[rpt] = ir3_COV(build, src.rpts[rpt], src_type, dst_type);

   ir3_instr_create_rpt(dst.rpts, nrpt);
   return dst;
}

static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_builder *build, unsigned components)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   instr->repeat = components - 1;
   return instr;
}

static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_builder *build, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_build_instr(build, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}

/* clang-format off */
#define __INSTR0(flag, name, opc)                                             \
static inline struct ir3_instruction *ir3_##name(struct ir3_builder *build)   \
{                                                                             \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 0);         \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name) __INSTR0((ir3_instruction_flags)0, name, OPC_##name)

/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc, scalar_alu)                      \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags)     \
{                                                                             \
   struct ir3_instruction *instr =                                            \
      ir3_build_instr(build, opc, dst_count, 1);                              \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0; \
   for (unsigned i = 0; i < dst_count; i++)                                   \
      __ssa_dst(instr)->flags |= dst_flag;                                    \
   __ssa_src(instr, a, aflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}                                                                             \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                    \
   struct ir3_builder *build, unsigned nrpt,                                  \
   struct ir3_instruction_rpt a, unsigned aflags)                             \
{                                                                             \
   struct ir3_instruction_rpt dst;                                            \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                      \
   for (unsigned rpt = 0; rpt < nrpt; rpt++)                                  \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags);                 \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                      \
   return dst;                                                                \
}
/* clang-format on */
#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                  false)
#define INSTR1(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR1S(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)

/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc, scalar_alu)                      \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags)                                \
{                                                                             \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, dst_count, 2); \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &  \
                                     IR3_REG_SHARED) : 0;                     \
   for (unsigned i = 0; i < dst_count; i++)                                   \
      __ssa_dst(instr)->flags |= dst_flag;                                    \
   __ssa_src(instr, a, aflags);                                               \
   __ssa_src(instr, b, bflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}                                                                             \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                    \
   struct ir3_builder *build, unsigned nrpt,                                  \
   struct ir3_instruction_rpt a, unsigned aflags,                             \
   struct ir3_instruction_rpt b, unsigned bflags)                             \
{                                                                             \
   struct ir3_instruction_rpt dst;                                            \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                      \
   for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags,                  \
                                 b.rpts[rpt], bflags);                        \
   }                                                                          \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                      \
   return dst;                                                                \
}
/* clang-format on */
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                  false)
#define INSTR2(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR2S(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR2NODST(name) __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
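
/* So, for example, INSTR2S(ADD_U) below provides:
 *
 *    struct ir3_instruction *
 *    ir3_ADD_U(struct ir3_builder *build, struct ir3_instruction *a,
 *              unsigned aflags, struct ir3_instruction *b, unsigned bflags);
 *
 * plus an ir3_ADD_U_rpt() variant, where the dst is marked IR3_REG_SHARED
 * only if both sources are shared (the scalar-ALU case).
 */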

/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc, scalar_alu)                      \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,     \
   unsigned cflags)                                                           \
{                                                                             \
   struct ir3_instruction *instr =                                            \
      ir3_build_instr(build, opc, dst_count, 3);                              \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &  \
                                     c->dsts[0]->flags & IR3_REG_SHARED) : 0; \
   for (unsigned i = 0; i < dst_count; i++)                                   \
      __ssa_dst(instr)->flags |= dst_flag;                                    \
   __ssa_src(instr, a, aflags);                                               \
   __ssa_src(instr, b, bflags);                                               \
   __ssa_src(instr, c, cflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}                                                                             \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                    \
   struct ir3_builder *build, unsigned nrpt,                                  \
   struct ir3_instruction_rpt a, unsigned aflags,                             \
   struct ir3_instruction_rpt b, unsigned bflags,                             \
   struct ir3_instruction_rpt c, unsigned cflags)                             \
{                                                                             \
   struct ir3_instruction_rpt dst;                                            \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                      \
   for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                \
      dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags,                  \
                                 b.rpts[rpt], bflags,                         \
                                 c.rpts[rpt], cflags);                        \
   }                                                                          \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                      \
   return dst;                                                                \
}
/* clang-format on */
#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                  false)
#define INSTR3(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR3S(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)

/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                  \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,     \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)               \
{                                                                             \
   struct ir3_instruction *instr =                                            \
      ir3_build_instr(build, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                   \
      __ssa_dst(instr);                                                       \
   __ssa_src(instr, a, aflags);                                               \
   __ssa_src(instr, b, bflags);                                               \
   __ssa_src(instr, c, cflags);                                               \
   __ssa_src(instr, d, dflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}
/* clang-format on */
#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name) __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR5(flag, name, opc)                                             \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,     \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,               \
   struct ir3_instruction *e, unsigned eflags)                                \
{                                                                             \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 5);         \
   __ssa_dst(instr);                                                          \
   __ssa_src(instr, a, aflags);                                               \
   __ssa_src(instr, b, bflags);                                               \
   __ssa_src(instr, c, cflags);                                               \
   __ssa_src(instr, d, dflags);                                               \
   __ssa_src(instr, e, eflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name) __INSTR5((ir3_instruction_flags)0, name, OPC_##name)

/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                  \
static inline struct ir3_instruction *ir3_##name(                             \
   struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,     \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,               \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,     \
   unsigned fflags)                                                           \
{                                                                             \
   struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 6);         \
   for (unsigned i = 0; i < dst_count; i++)                                   \
      __ssa_dst(instr);                                                       \
   __ssa_src(instr, a, aflags);                                               \
   __ssa_src(instr, b, bflags);                                               \
   __ssa_src(instr, c, cflags);                                               \
   __ssa_src(instr, d, dflags);                                               \
   __ssa_src(instr, e, eflags);                                               \
   __ssa_src(instr, f, fflags);                                               \
   instr->flags |= flag;                                                      \
   return instr;                                                              \
}
/* clang-format on */
#define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name) __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)

/* cat0 instructions: */
INSTR0(NOP)
INSTR1NODST(BR)
INSTR1NODST(BALL)
INSTR1NODST(BANY)
INSTR2NODST(BRAA)
INSTR2NODST(BRAO)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR1NODST(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
INSTR1(READ_GETLAST_MACRO)

static inline struct ir3_instruction *
ir3_ELECT_MACRO(struct ir3_builder *build)
{
   struct ir3_instruction *instr =
      ir3_build_instr(build, OPC_ELECT_MACRO, 1, 0);
   __ssa_dst(instr);
   return instr;
}

static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_builder *build)
{
   struct ir3_instruction *instr = ir3_build_instr(build, OPC_SHPS_MACRO, 1, 0);
   __ssa_dst(instr);
   return instr;
}

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2S(ADD_F)
INSTR2S(MIN_F)
INSTR2S(MAX_F)
INSTR2S(MUL_F)
INSTR1S(SIGN_F)
INSTR2S(CMPS_F)
INSTR1S(ABSNEG_F)
INSTR2S(CMPV_F)
INSTR1S(FLOOR_F)
INSTR1S(CEIL_F)
INSTR1S(RNDNE_F)
INSTR1S(RNDAZ_F)
INSTR1S(TRUNC_F)
INSTR2S(ADD_U)
INSTR2S(ADD_S)
INSTR2S(SUB_U)
INSTR2S(SUB_S)
INSTR2S(CMPS_U)
INSTR2S(CMPS_S)
INSTR2S(MIN_U)
INSTR2S(MIN_S)
INSTR2S(MAX_U)
INSTR2S(MAX_S)
INSTR1S(ABSNEG_S)
INSTR2S(AND_B)
INSTR2S(OR_B)
INSTR1S(NOT_B)
INSTR2S(XOR_B)
INSTR2S(CMPV_U)
INSTR2S(CMPV_S)
INSTR2S(MUL_U24)
INSTR2S(MUL_S24)
INSTR2S(MULL_U)
INSTR1S(BFREV_B)
INSTR1S(CLZ_S)
INSTR1S(CLZ_B)
INSTR2S(SHL_B)
INSTR2S(SHR_B)
INSTR2S(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2S(MGEN_B)
INSTR2S(GETBIT_B)
INSTR1(SETRM)
INSTR1S(CBITS_B)
INSTR2S(SHB)
INSTR2S(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3S(SEL_B16)
INSTR3S(SEL_B32)
INSTR3S(SEL_S16)
INSTR3S(SEL_S32)
INSTR3S(SEL_F16)
INSTR3S(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)
INSTR3S(SHRM)
INSTR3S(SHLM)
INSTR3S(SHRG)
INSTR3S(SHLG)
INSTR3S(ANDG)

/* cat4 instructions: */
INSTR1S(RCP)
INSTR1S(RSQ)
INSTR1S(HRSQ)
INSTR1S(LOG2)
INSTR1S(HLOG2)
INSTR1S(EXP2)
INSTR1S(HEXP2)
INSTR1S(SIN)
INSTR1S(COS)
INSTR1S(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)

static inline struct ir3_instruction *
ir3_SAM(struct ir3_builder *build, opc_t opc, type_t type, unsigned wrmask,
        ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
        struct ir3_instruction *src0, struct ir3_instruction *src1)
{
   struct ir3_instruction *sam;
   unsigned nreg = 0;

   if (flags & IR3_INSTR_S2EN) {
      nreg++;
   }
   if (src0 || opc == OPC_SAM) {
      nreg++;
   }
   if (src1) {
      nreg++;
   }

   sam = ir3_build_instr(build, opc, 1, nreg);
   sam->flags |= flags;
   __ssa_dst(sam)->wrmask = wrmask;
   if (flags & IR3_INSTR_S2EN) {
      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
   }
   if (src0) {
      __ssa_src(sam, src0, 0);
   } else if (opc == OPC_SAM) {
      /* Create a dummy shared source for the coordinate, for the prefetch
       * case. It needs to be shared so that we don't accidentally disable
       * early preamble, and this is what the blob does.
       */
      ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
   }
   if (src1) {
      __ssa_src(sam, src1, 0);
   }
   sam->cat5.type = type;

   return sam;
}
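
/* e.g. a 4-component fp32 sample with bindless (S2EN + B) sampler state
 * might look like (illustrative sketch only):
 *
 *    struct ir3_instruction *sam =
 *       ir3_SAM(&b, OPC_SAM, TYPE_F32, 0xf, IR3_INSTR_S2EN | IR3_INSTR_B,
 *               samp_tex, coords, NULL);
 */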

/* brcst.active rx, ry behaves like a conditional move: rx either keeps its
 * value or is set to ry. In order to model this in SSA form, we add an extra
 * argument (the initial value of rx) and tie it to the destination.
 */
static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_builder *build, unsigned cluster_size,
                 struct ir3_instruction *src,
                 struct ir3_instruction *dst_default)
{
   struct ir3_instruction *brcst =
      ir3_build_instr(build, OPC_BRCST_ACTIVE, 1, 2);
   brcst->cat5.cluster_size = cluster_size;
   brcst->cat5.type = TYPE_U32;
   struct ir3_register *brcst_dst = __ssa_dst(brcst);
   __ssa_src(brcst, src, 0);
   struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
   ir3_reg_tie(brcst_dst, default_src);
   return brcst;
}

/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
INSTR2(SHFL)
#ifndef GPU
#elif GPU >= 600
INSTR4NODST(STIB);
INSTR3(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
INSTR4NODST(LDG_K)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)

/* ************************************************************************* */
#include "util/bitset.h"

#define MAX_REG 256

typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);

typedef struct {
   bool mergedregs;
   fullstate_t full;
   halfstate_t half;
   sharedstate_t shared;
   nongprstate_t nongpr;
} regmask_t;

static inline BITSET_WORD *
__regmask_file(regmask_t *regmask, enum ir3_reg_file file)
{
   switch (file) {
   case IR3_FILE_FULL:
      return regmask->full;
   case IR3_FILE_HALF:
      return regmask->half;
   case IR3_FILE_SHARED:
      return regmask->shared;
   case IR3_FILE_NONGPR:
      return regmask->nongpr;
   }
   unreachable("bad file");
}

static inline bool
__regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n,
              unsigned size)
{
   BITSET_WORD *regs = __regmask_file(regmask, file);
   for (unsigned i = 0; i < size; i++) {
      if (BITSET_TEST(regs, n + i))
         return true;
   }
   return false;
}

static inline void
__regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n,
              unsigned size)
{
   BITSET_WORD *regs = __regmask_file(regmask, file);
   for (unsigned i = 0; i < size; i++)
      BITSET_SET(regs, n + i);
}

static inline void
__regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n,
                unsigned size)
{
   BITSET_WORD *regs = __regmask_file(regmask, file);
   for (unsigned i = 0; i < size; i++)
      BITSET_CLEAR(regs, n + i);
}

static inline void
regmask_init(regmask_t *regmask, bool mergedregs)
{
   memset(regmask, 0, sizeof(*regmask));
   regmask->mergedregs = mergedregs;
}

static inline void
regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   assert(dst->mergedregs == a->mergedregs);
   assert(dst->mergedregs == b->mergedregs);

   for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
      dst->full[i] = a->full[i] | b->full[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
      dst->half[i] = a->half[i] | b->half[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
      dst->shared[i] = a->shared[i] | b->shared[i];
   for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
      dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
}

static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
      dst->shared[i] = a->shared[i] | b->shared[i];
}

static inline void
regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      __regmask_set(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            __regmask_set(regmask, file, n, size);
   }
}

static inline void
regmask_clear(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      __regmask_clear(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            __regmask_clear(regmask, file, n, size);
   }
}

static inline bool
regmask_get(regmask_t *regmask, struct ir3_register *reg)
{
   unsigned size = reg_elem_size(reg);
   enum ir3_reg_file file;
   unsigned num = post_ra_reg_num(reg);
   unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
   if (reg->flags & IR3_REG_RELATIV) {
      return __regmask_get(regmask, file, n, size * reg->size);
   } else {
      for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
         if (mask & 1)
            if (__regmask_get(regmask, file, n, size))
               return true;
   }
   return false;
}
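
/* Sketch of legalize-style (sy) tracking with regmasks (assumed usage; the
 * real pass tracks more state than this):
 *
 *    regmask_t needs_sy;
 *    regmask_init(&needs_sy, mergedregs);
 *    foreach_instr (n, &block->instr_list) {
 *       foreach_src (reg, n) {
 *          if (regmask_get(&needs_sy, reg)) {
 *             n->flags |= IR3_INSTR_SY;
 *             regmask_init(&needs_sy, mergedregs);
 *          }
 *       }
 *       if (is_sy_producer(n))
 *          regmask_set(&needs_sy, n->dsts[0]);
 *    }
 */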
/* ************************************************************************* */

#endif /* IR3_H_ */