/*
 * Copyright © 2013 Rob Clark <robdclark@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdbool.h>
#include <stdint.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "freedreno_common.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;

struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t preamble_instrs_count;
   uint16_t nops_count; /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;
   bool uses_ray_intersection;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};

struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;
   struct ir3_register **regs;
};

typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer. The CP pass needs to know what is intended (int or
    * float) in order to do the right thing. For this reason the
    * abs/neg flags are split out into float and int variants. In
    * addition, for .b (bitwise) operations the negate is actually a
    * bitwise not, so split that out into a new flag to make it
    * more clear.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input? Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),

   /* Render target dst. Only used by alias.rt. */
   IR3_REG_RT = BIT(20),

   /* Register that is initialized using alias.tex (or will be once the
    * alias.tex instructions are inserted). Before alias.tex is inserted, alias
    * registers may contain things that are normally not allowed by the owning
    * instruction (e.g., consts or immediates) because they will be replaced by
    * GPRs later.
    * Note that if wrmask has more than one bit set, this will be set if any
    * of the registers is an alias, even though not all of them may be. We
    * currently have no way to tell which ones are actual aliases.
    */
   IR3_REG_ALIAS = BIT(21),

   /* Alias registers allow us to allocate non-consecutive registers and remap
    * them to consecutive ones using alias.tex. We implement this by adding the
    * sources of collects directly to the sources of their users. This way, RA
    * treats them as scalar registers and we can remap them to consecutive
    * registers afterwards. This flag is used to keep track of the scalar
    * sources that should be remapped together. Every source of such an "alias
    * group" will have IR3_REG_ALIAS set, while the first one will also have
    * IR3_REG_FIRST_ALIAS set.
    */
   IR3_REG_FIRST_ALIAS = BIT(22),
} ir3_register_flags;

struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};

/*
 * Stupid/simple growable array implementation:
 */
#define DECLARE_ARRAY(type, name) \
   unsigned name##_count, name##_sz; \
   type *name;

#define array_insert(ctx, arr, ...) \
   do { \
      if (arr##_count == arr##_sz) { \
         arr##_sz = MAX2(2 * arr##_sz, 16); \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0])); \
      } \
      arr[arr##_count++] = __VA_ARGS__; \
   } while (0)
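
/* A minimal usage sketch (names here are hypothetical, for illustration
 * only): a struct declares the count/size/pointer triple via DECLARE_ARRAY,
 * and array_insert() appends an element, doubling the allocation with
 * reralloc_size() whenever the count reaches the allocated size:
 *
 *    struct foo {
 *       DECLARE_ARRAY(struct ir3_instruction *, instrs);
 *    };
 *
 *    // 'f' must be a ralloc context (it owns the array storage)
 *    array_insert(f, f->instrs, instr);
 *    for (unsigned i = 0; i < f->instrs_count; i++)
 *       use(f->instrs[i]);
 *
 * Note the macro token-pastes arr##_count/arr##_sz, so it only works with
 * fields declared via DECLARE_ARRAY (or with matching names).
 */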

typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;

typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 1,
   ALIAS_MEM = 2,
} ir3_alias_scope;

typedef enum {
   SHFL_XOR = 1,
   SHFL_UP = 2,
   SHFL_DOWN = 3,
   SHFL_RUP = 6,
   SHFL_RDOWN = 7,
} ir3_shfl_mode;

typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy. So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload instructions
    * to inform validation that this is created by RA. This also may be set on
    * an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register to a
    * shared register that may be read by other invocations. This factors into
    * (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one of
    * their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;

struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat;
   uint8_t nop;
#if MESA_DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as an ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val; /* for ldgb/stgb, # of components */
         unsigned d : 3; /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
         ir3_shfl_mode shfl_mode : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
         unsigned alias_table_size_minus_one;
         bool alias_type_float;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type. Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * its src/dst registers. Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions. Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                             shared  image  atomic  SSBO  everything
    *   barrier()/              -  R/W     R/W    R/W     R/W      X
    *     groupMemoryBarrier()
    *   memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic()   -                 R/W
    *   memoryBarrierBuffer()   -                         R/W
    *   memoryBarrierImage()    -          R/W
    *   memoryBarrierShared()   -  R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
     barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions are
    * emitted as multiple scalar instructions that are linked together using
    * this field. After RA, the ir3_combine_rpt pass iterates these groups and,
    * if the register assignment allows it, merges them into a (rptN)
    * instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The list
    * head is stored in the first instruction of the repeat group so also refers
    * to a list entry. In order to distinguish the list's first entry, we use
    * serialno: instructions in a repeat group are always emitted consecutively
    * so the first will have the lowest serialno.
    *
    * As this is not a typical list, we have to be careful with using the
    * existing list helpers. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in its
    * group.
    */
   struct list_head rpt_node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};

/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   struct ir3_instruction *rpts[4];
};

struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions. The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write). To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies of an instruction,
    * other than the address register write itself, are scheduled before
    * the one that writes the address register. Having a convenient list
    * of instructions that reference some address register simplifies
    * this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};

struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write. Any array access depends on the most
    * recent write. This way, nothing depends on writes after the
    * last read. But all the writes that happen before that have
    * something depending on them.
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow. A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_block *successors[2];

   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data. Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno;
#endif
};

enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};

struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      struct ir3_block *block;
      struct ir3_instruction *instr;
   };
};

struct ir3_builder {
   struct ir3_cursor cursor;
};
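
/* A minimal sketch of pointing a cursor at an existing instruction and
 * creating a new instruction there (the builder API declared below is the
 * usual way to do this; the literal initializer is just for illustration):
 *
 *    struct ir3_cursor c = {
 *       .option = IR3_CURSOR_BEFORE_INSTR,
 *       .instr = some_instr,
 *    };
 *    struct ir3_instruction *mov = ir3_instr_create_at(c, OPC_MOV, 1, 1);
 *
 * Which union member is valid is implied by 'option': the *_BLOCK options
 * use 'block', the *_INSTR options use 'instr'.
 */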

static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   return (uint32_t)(unsigned long)block;
#endif
}

static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}

static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}

struct ir3_instruction *ir3_find_end(struct ir3 *ir);

struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);

struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);

static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to skip
    * that to get to the block after the preamble.
    */
   struct ir3_instruction *terminator = ir3_block_get_terminator(block);
   if (terminator && (terminator->opc == OPC_SHPS))
      return block->successors[1]->successors[0];
   else
      return block;
}

static inline bool
ir3_has_preamble(struct ir3 *ir)
{
   return ir3_start_block(ir) != ir3_after_preamble(ir);
}

struct ir3_instruction *ir3_find_shpe(struct ir3 *ir);

/* Create an empty preamble and return shpe. */
struct ir3_instruction *ir3_create_empty_preamble(struct ir3 *ir);

void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

void ir3_calc_dominance(struct ir3 *ir);
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);

struct ir3_shader_variant;

struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

void ir3_collect_info(struct ir3_shader_variant *v);
void *ir3_alloc(struct ir3 *shader, int sz);

unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
                                        int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
                                            int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
                                                opc_t opc, int ndst, int nsrc);
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);
void ir3_instr_remove(struct ir3_instruction *instr);

void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);

struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);

static inline void
ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
{
   assert(!dst->tied && !src->tied);
   dst->tied = src;
   src->tied = dst;
}

void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);

static inline bool
ir3_instr_check_mark(struct ir3_instruction *instr)
{
   if (instr->flags & IR3_INSTR_MARK)
      return true; /* already visited */
   instr->flags |= IR3_INSTR_MARK;
   return false;
}
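
/* A minimal visit-once sketch using the mark flag: clear all marks up front,
 * then skip instructions already seen, e.g. when a walk can reach the same
 * instruction through multiple paths (this assumes the foreach_block/
 * foreach_instr iterators defined elsewhere in this header):
 *
 *    ir3_clear_mark(ir);
 *    foreach_block (block, &ir->block_list) {
 *       foreach_instr (instr, &block->instr_list) {
 *          if (ir3_instr_check_mark(instr))
 *             continue;
 *          visit(instr);   // hypothetical per-pass callback
 *       }
 *    }
 */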

void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_sched(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);

/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}

/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}

/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}

typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);

void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);

void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

int ir3_flut(struct ir3_register *src_reg);

bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);

/**
 * Given an instruction whose result we want to test for nonzero, return a
 * potentially different instruction for which the result would be the same.
 * This might be one of its sources if instr doesn't change the nonzero-ness.
 */
struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);

bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);

#include "util/set.h"
#define foreach_ssa_use(__use, __instr) \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses; \
        __use = NULL) \
      set_foreach ((__instr)->uses, __entry) \
         if ((__use = (void *)__entry->key))
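
/* A minimal usage sketch: 'uses' is only valid after the pass has called
 * ir3_find_ssa_uses() with some ralloc memory context:
 *
 *    void *mem_ctx = ralloc_context(NULL);
 *    ir3_find_ssa_uses(ir, mem_ctx, false);
 *    foreach_ssa_use (use, instr) {
 *       // 'use' is an instruction consuming one of instr's SSA dsts
 *    }
 *    ralloc_free(mem_ctx);
 */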

static inline uint32_t
reg_num(const struct ir3_register *reg)
{
   return reg->num >> 2;
}

static inline uint32_t
reg_comp(const struct ir3_register *reg)
{
   return reg->num & 0x3;
}
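
/* Worked example of the encoding described for ir3_register::num (component
 * in the low two bits): r3.y is num = (3 << 2) | 1 = 13, so reg_num()
 * returns 3 and reg_comp() returns 1.
 */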

static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}

static inline bool
is_terminator(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BR:
   case OPC_JUMP:
   case OPC_BANY:
   case OPC_BALL:
   case OPC_BRAA:
   case OPC_BRAO:
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
   case OPC_PREDT:
   case OPC_PREDF:
      return true;
   default:
      return false;
   }
}

static inline bool
is_kill_or_demote(struct ir3_instruction *instr)
{
   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
}

static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}

static inline bool
is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
{
   unsigned dst_type = (dst->flags & IR3_REG_HALF);
   unsigned src_type = (src->flags & IR3_REG_HALF);

   /* Treat shared->normal copies and normal->shared copies as same-type. */
   return dst_type == src_type;
}

/* Is it a non-transformative (ie. not type changing) mov? This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->flags & IR3_REG_PREDICATE)
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}

/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   /* Allow a narrowing move, but not a widening one. A narrowing
    * move from full c1.x can be folded into a hc1.x use in an ALU
    * instruction because it is doing the same thing as constant-
    * demotion. If CONSTANT_DEMOTION_ENABLE wasn't set, we'd need to
    * return false in all cases.
    */
   if ((type_size(dst_type) > type_size(src_type)) ||
       (type_size(dst_type) == 8))
      return false;

   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}

static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_GETLAST_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      return true;
   default:
      return false;
   }
}

static inline bool
is_alu(struct ir3_instruction *instr)
{
   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}

static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}

static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
}

static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}

static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}

static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}

static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7) && instr->opc != OPC_ALIAS;
}

static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}

static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}

static inline bool
is_store(struct ir3_instruction *instr)
{
   /* for these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}

static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDLV:
   case OPC_RAY_INTERSECTION:
      /* probably some others too.. */
      return true;
   case OPC_LDC:
      return instr->dsts_count > 0;
   default:
      return false;
   }
}

static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}

/* Whether non-helper invocations can read the value of helper invocations. We
 * cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the getone
    * in the preamble because it doesn't actually matter which fiber is
    * selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   default:
      return false;
   }
}

static inline bool
is_bool(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPS_S:
   case OPC_CMPS_U:
      return true;
   default:
      return false;
   }
}

static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}

static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}

static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}

static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}

static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == OPC_META);
}

static inline unsigned
reg_elems(const struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return reg->size;
   else
      return util_last_bit(reg->wrmask);
}

static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}

static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
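
/* Worked example: a full-precision vec4 dst (wrmask = 0xf, !IR3_REG_HALF)
 * has reg_elems() = 4 and reg_elem_size() = 2, giving reg_size() = 8, while
 * the half-precision equivalent gives reg_size() = 4. I.e. reg_size() is in
 * units of half-registers.
 */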

/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
 * and have to handle relative accesses specially.
 */

static inline unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->size;
   return reg_elems(reg);
}

static inline unsigned
post_ra_reg_num(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->array.base;
   return reg->num;
}

static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}

static inline bool
is_reg_gpr(const struct ir3_register *reg)
{
   if (reg->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE | IR3_REG_RT)) {
      return false;
   }
   if (reg_num(reg) == REG_A0)
      return false;
   if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
       reg->num == INVALID_REG)
      return false;
   return true;
}

static inline bool
is_reg_a0(const struct ir3_register *reg)
{
   if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   return reg->num == regid(REG_A0, 0);
}

/* is dst a normal temp register: */
static inline bool
is_dest_gpr(const struct ir3_register *dst)
{
   if (dst->wrmask == 0)
      return false;
   return is_reg_gpr(dst);
}

static inline bool
writes_gpr(struct ir3_instruction *instr)
{
   if (dest_regs(instr) == 0)
      return false;
   return is_dest_gpr(instr->dsts[0]);
}

static inline bool
writes_addr0(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 0);
   }
   return false;
}

static inline bool
writes_addr1(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a1.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 1);
   }
   return false;
}

static inline bool
writes_pred(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to p0 */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return !!(dst->flags & IR3_REG_PREDICATE);
   }
   return false;
}

/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that aren't
 * assigned by RA.
 */
#define GPR_REG_SIZE     (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE  (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE  (4 * 8)
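
/* Worked example of the layout above: r48.x, the first shared register, has
 * flat register number 48 * 4 = 192 = SHARED_REG_START, and the non-GPR
 * range begins at 192 + 32 = 224 = NONGPR_REG_START.
 */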

enum ir3_reg_file {
   IR3_FILE_FULL,
   IR3_FILE_HALF,
   IR3_FILE_SHARED,
   IR3_FILE_NONGPR,
};

/* Return a file + offset that can be used for determining if two registers
 * alias. The register is only really used for its flags, the num is taken from
 * the parameter. Registers overlap if they are in the same file and have an
 * overlapping offset. The offset is multiplied by 2 for full registers to
 * handle aliasing half and full registers, that is it's in units of half-regs.
 */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      *file = IR3_FILE_HALF;
      return num;
   }
}
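
/* A minimal sketch of using the file + offset to test whether two GPR-like
 * registers overlap, assuming each covers reg_size() units of half-regs:
 *
 *    enum ir3_reg_file file_a, file_b;
 *    unsigned off_a = ir3_reg_file_offset(a, a->num, mergedregs, &file_a);
 *    unsigned off_b = ir3_reg_file_offset(b, b->num, mergedregs, &file_b);
 *    bool overlap = file_a == file_b &&
 *                   off_a < off_b + reg_size(b) &&
 *                   off_b < off_a + reg_size(a);
 */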
1569
1570 /* returns defining instruction for reg */
1571 /* TODO better name */
1572 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1573 ssa(struct ir3_register *reg)
1574 {
1575 if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1576 return reg->def->instr;
1577 return NULL;
1578 }
1579
1580 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1581 conflicts(struct ir3_register *a, struct ir3_register *b)
1582 {
1583 return (a && b) && (a->def != b->def);
1584 }
1585
1586 static inline bool
reg_is_addr1(struct ir3_register * r)1587 reg_is_addr1(struct ir3_register *r)
1588 {
1589 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1590 return false;
1591 return r->num == regid(REG_A0, 1);
1592 }
1593
1594 static inline type_t
half_type(type_t type)1595 half_type(type_t type)
1596 {
1597 switch (type) {
1598 case TYPE_F32:
1599 return TYPE_F16;
1600 case TYPE_U32:
1601 case TYPE_U8_32:
1602 return TYPE_U16;
1603 case TYPE_S32:
1604 return TYPE_S16;
1605 case TYPE_F16:
1606 case TYPE_U16:
1607 case TYPE_S16:
1608 return type;
1609 case TYPE_U8:
1610 return type;
1611 default:
1612 assert(0);
1613 return (type_t)~0;
1614 }
1615 }
1616
1617 static inline type_t
full_type(type_t type)1618 full_type(type_t type)
1619 {
1620 switch (type) {
1621 case TYPE_F16:
1622 return TYPE_F32;
1623 case TYPE_U8:
1624 case TYPE_U8_32:
1625 case TYPE_U16:
1626 return TYPE_U32;
1627 case TYPE_S16:
1628 return TYPE_S32;
1629 case TYPE_F32:
1630 case TYPE_U32:
1631 case TYPE_S32:
1632 return type;
1633 default:
1634 assert(0);
1635 return (type_t)~0;
1636 }
1637 }
1638
1639 /* some cat2 instructions (ie. those which are not float) can embed an
1640 * immediate:
1641 */
1642 static inline bool
ir3_cat2_int(opc_t opc)1643 ir3_cat2_int(opc_t opc)
1644 {
1645 switch (opc) {
1646 case OPC_ADD_U:
1647 case OPC_ADD_S:
1648 case OPC_SUB_U:
1649 case OPC_SUB_S:
1650 case OPC_CMPS_U:
1651 case OPC_CMPS_S:
1652 case OPC_MIN_U:
1653 case OPC_MIN_S:
1654 case OPC_MAX_U:
1655 case OPC_MAX_S:
1656 case OPC_CMPV_U:
1657 case OPC_CMPV_S:
1658 case OPC_MUL_U24:
1659 case OPC_MUL_S24:
1660 case OPC_MULL_U:
1661 case OPC_CLZ_S:
1662 case OPC_ABSNEG_S:
1663 case OPC_AND_B:
1664 case OPC_OR_B:
1665 case OPC_NOT_B:
1666 case OPC_XOR_B:
1667 case OPC_BFREV_B:
1668 case OPC_CLZ_B:
1669 case OPC_SHL_B:
1670 case OPC_SHR_B:
1671 case OPC_ASHR_B:
1672 case OPC_MGEN_B:
1673 case OPC_GETBIT_B:
1674 case OPC_CBITS_B:
1675 case OPC_BARY_F:
1676 case OPC_FLAT_B:
1677 return true;
1678
1679 default:
1680 return false;
1681 }
1682 }
1683
1684 /* map cat2 instruction to valid abs/neg flags: */
1685 static inline unsigned
ir3_cat2_absneg(opc_t opc)1686 ir3_cat2_absneg(opc_t opc)
1687 {
1688 switch (opc) {
1689 case OPC_ADD_F:
1690 case OPC_MIN_F:
1691 case OPC_MAX_F:
1692 case OPC_MUL_F:
1693 case OPC_SIGN_F:
1694 case OPC_CMPS_F:
1695 case OPC_ABSNEG_F:
1696 case OPC_CMPV_F:
1697 case OPC_FLOOR_F:
1698 case OPC_CEIL_F:
1699 case OPC_RNDNE_F:
1700 case OPC_RNDAZ_F:
1701 case OPC_TRUNC_F:
1702 case OPC_BARY_F:
1703 return IR3_REG_FABS | IR3_REG_FNEG;
1704
1705 case OPC_ADD_U:
1706 case OPC_ADD_S:
1707 case OPC_SUB_U:
1708 case OPC_SUB_S:
1709 case OPC_CMPS_U:
1710 case OPC_CMPS_S:
1711 case OPC_MIN_U:
1712 case OPC_MIN_S:
1713 case OPC_MAX_U:
1714 case OPC_MAX_S:
1715 case OPC_CMPV_U:
1716 case OPC_CMPV_S:
1717 case OPC_MUL_U24:
1718 case OPC_MUL_S24:
1719 case OPC_MULL_U:
1720 case OPC_CLZ_S:
1721 return 0;
1722
1723 case OPC_ABSNEG_S:
1724 return IR3_REG_SABS | IR3_REG_SNEG;
1725
1726 case OPC_AND_B:
1727 case OPC_OR_B:
1728 case OPC_NOT_B:
1729 case OPC_XOR_B:
1730 case OPC_BFREV_B:
1731 case OPC_CLZ_B:
1732 case OPC_SHL_B:
1733 case OPC_SHR_B:
1734 case OPC_ASHR_B:
1735 case OPC_MGEN_B:
1736 case OPC_GETBIT_B:
1737 case OPC_CBITS_B:
1738 return IR3_REG_BNOT;
1739
1740 default:
1741 return 0;
1742 }
1743 }
1744
1745 /* map cat3 instructions to valid abs/neg flags: */
1746 static inline unsigned
ir3_cat3_absneg(opc_t opc,unsigned src_n)1747 ir3_cat3_absneg(opc_t opc, unsigned src_n)
1748 {
1749 switch (opc) {
1750 case OPC_MAD_F16:
1751 case OPC_MAD_F32:
1752 case OPC_SEL_F16:
1753 case OPC_SEL_F32:
1754 return IR3_REG_FNEG;
1755
1756 case OPC_SAD_S16:
1757 case OPC_SAD_S32:
1758 return src_n == 1 ? IR3_REG_SNEG : 0;
1759
1760 case OPC_MAD_U16:
1761 case OPC_MADSH_U16:
1762 case OPC_MAD_S16:
1763 case OPC_MADSH_M16:
1764 case OPC_MAD_U24:
1765 case OPC_MAD_S24:
1766 case OPC_SEL_S16:
1767 case OPC_SEL_S32:
1768 /* neg *may* work on 3rd src.. */
1769
1770 case OPC_SEL_B16:
1771 case OPC_SEL_B32:
1772
1773 case OPC_SHRM:
1774 case OPC_SHLM:
1775 case OPC_SHRG:
1776 case OPC_SHLG:
1777 case OPC_ANDG:
1778 case OPC_WMM:
1779 case OPC_WMM_ACCU:
1780
1781 default:
1782 return 0;
1783 }
1784 }
1785
1786 /* Return the type (float, int, or uint) the op uses when converting from the
1787 * internal result of the op (which is assumed to be the same size as the
1788 * sources) to the destination when they are not the same size. If F32 it does
1789 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
1790 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
1791 * doesn't do anything sensible or is unknown.
1792 */
1793 static inline type_t
ir3_output_conv_type(struct ir3_instruction * instr,bool * can_fold)1794 ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
1795 {
1796 *can_fold = true;
1797 switch (instr->opc) {
1798 case OPC_ADD_F:
1799 case OPC_MUL_F:
1800 case OPC_BARY_F:
1801 case OPC_MAD_F32:
1802 case OPC_MAD_F16:
1803 case OPC_WMM:
1804 case OPC_WMM_ACCU:
1805 return TYPE_F32;
1806
1807 case OPC_ADD_U:
1808 case OPC_SUB_U:
1809 case OPC_MIN_U:
1810 case OPC_MAX_U:
1811 case OPC_AND_B:
1812 case OPC_OR_B:
1813 case OPC_NOT_B:
1814 case OPC_XOR_B:
1815 case OPC_MUL_U24:
1816 case OPC_MULL_U:
1817 case OPC_SHL_B:
1818 case OPC_SHR_B:
1819 case OPC_ASHR_B:
1820 case OPC_MAD_U24:
1821 case OPC_SHRM:
1822 case OPC_SHLM:
1823 case OPC_SHRG:
1824 case OPC_SHLG:
1825 case OPC_ANDG:
1826 /* Comparison ops zero-extend/truncate their results, so consider them as
1827 * unsigned here.
1828 */
1829 case OPC_CMPS_F:
1830 case OPC_CMPV_F:
1831 case OPC_CMPS_U:
1832 case OPC_CMPS_S:
1833 return TYPE_U32;
1834
1835 case OPC_ADD_S:
1836 case OPC_SUB_S:
1837 case OPC_MIN_S:
1838 case OPC_MAX_S:
1839 case OPC_ABSNEG_S:
1840 case OPC_MUL_S24:
1841 case OPC_MAD_S24:
1842 return TYPE_S32;
1843
1844 /* We assume that any move->move folding that could be done was done by
1845 * NIR.
1846 */
1847 case OPC_MOV:
1848 default:
1849 *can_fold = false;
1850 return TYPE_U32;
1851 }
1852 }
1853
1854 /* Return the src and dst types for the conversion which is already folded
1855 * into the op. We can assume that instr has folded in a conversion from
1856 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
1857 * to call if ir3_output_conv_type() returns can_fold = true.
1858 */
1859 static inline type_t
ir3_output_conv_src_type(struct ir3_instruction * instr,type_t base_type)1860 ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
1861 {
1862 switch (instr->opc) {
1863 case OPC_CMPS_F:
1864 case OPC_CMPV_F:
1865 case OPC_CMPS_U:
1866 case OPC_CMPS_S:
1867 /* Comparisons only return 0/1 and the size of the comparison sources
1868 * is irrelevant, never consider them as having an output conversion
1869 * by returning a type with the dest size here:
1870 */
1871 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1872 : full_type(base_type);
1873
1874 case OPC_BARY_F:
1875 /* bary.f doesn't have an explicit source, but we can assume here that
1876 * the varying data it reads is in fp32.
1877 *
1878 * This may be fp16 on older gen's depending on some register
1879 * settings, but it's probably not worth plumbing that through for a
1880 * small improvement that NIR would hopefully handle for us anyway.
1881 */
1882 return TYPE_F32;
1883
1884 case OPC_FLAT_B:
1885 /* Treat the input data as u32 if not interpolating. */
1886 return TYPE_U32;
1887
1888 default:
1889 return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1890 : full_type(base_type);
1891 }
1892 }
1893
1894 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1895 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1896 {
1897 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1898 : full_type(base_type);
1899 }
1900
1901 /* Some instructions have signed/unsigned variants which are identical except
1902 * for whether the folded conversion sign-extends or zero-extends, and we can
1903 * fold in a mismatching move by rewriting the opcode. Return the opcode to
1904 * switch signedness, and whether one exists.
1905 */
1906 static inline opc_t
1907 ir3_try_swap_signedness(opc_t opc, bool *can_swap)
1908 {
1909 switch (opc) {
1910 #define PAIR(u, s) \
1911 case OPC_##u: \
1912 return OPC_##s; \
1913 case OPC_##s: \
1914 return OPC_##u;
1915 PAIR(ADD_U, ADD_S)
1916 PAIR(SUB_U, SUB_S)
1917 /* Note: these are only identical when the sources are half, but that's
1918 * the only case we call this function for anyway.
1919 */
1920 PAIR(MUL_U24, MUL_S24)
1921
1922 default:
1923 *can_swap = false;
1924 return opc;
1925 }
1926 }
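
/* Example (a sketch): a pass that wants to fold a conversion whose signedness
 * doesn't match can try flipping the opcode. Note the caller initializes
 * can_swap; the helper only ever clears it:
 *
 *    bool can_swap = true;
 *    opc_t new_opc = ir3_try_swap_signedness(instr->opc, &can_swap);
 *    if (can_swap)
 *       instr->opc = new_opc;   // e.g. OPC_ADD_U <-> OPC_ADD_S
 */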
1927
1928 #define MASK(n) ((1 << (n)) - 1)
1929
1930 /* iterator for an instruction's sources (reg), also returns src #: */
1931 #define foreach_src_n(__srcreg, __n, __instr) \
1932 if ((__instr)->srcs_count) \
1933 for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
1934 __srcreg = NULL) \
1935 for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt; \
1936 __n++) \
1937 if ((__srcreg = (__instr)->srcs[__n]))
1938
1939 /* iterator for an instruction's sources (reg): */
1940 #define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
1941
1942 #define foreach_src_if(__srcreg, __instr, __filter) \
1943 foreach_src (__srcreg, __instr) \
1944 if (__filter(__srcreg))
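
/* Example (a sketch): counting immediate sources with the indexed iterator;
 * `n` receives the source index on each iteration:
 *
 *    unsigned immed_count = 0;
 *    foreach_src_n (src, n, instr) {
 *       if (src->flags & IR3_REG_IMMED)
 *          immed_count++;
 *    }
 */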
1945
1946 /* Is this either the first src in an alias group (see IR3_REG_FIRST_ALIAS) or a
1947 * normal src?
1948 */
1949 static inline bool
1950 ir3_src_is_first_in_group(struct ir3_register *src)
1951 {
1952 return (src->flags & IR3_REG_FIRST_ALIAS) || !(src->flags & IR3_REG_ALIAS);
1953 }
1954
1955 /* Iterator for an instruction's sources taking alias groups into account.
1956 * __src_n will hold the original source index (i.e., the index before expanding
1957 * collects to alias groups) while __alias_n the index within the current
1958 * group. Thus, the actual source index is __src_n + __alias_n.
1959 */
1960 #define foreach_src_with_alias_n(__srcreg, __src_n, __alias_n, __instr) \
1961 for (unsigned __src_n = -1, __alias_n = -1, __e = 0; !__e; __e = 1) \
1962 foreach_src (__srcreg, __instr) \
1963 if (__src_n += ir3_src_is_first_in_group(__srcreg) ? 1 : 0, \
1964 __alias_n = \
1965 ir3_src_is_first_in_group(__srcreg) ? 0 : __alias_n + 1, \
1966 true)
1967
1968 /* Iterator for all the sources in the alias group (see IR3_REG_FIRST_ALIAS)
1969 * starting at source index __start. __alias_n is the offset of the source
1970 * from the start of the alias group.
1971 */
1972 #define foreach_src_in_alias_group_n(__alias, __alias_n, __instr, __start) \
1973 for (struct ir3_register *__alias = __instr->srcs[__start]; \
1974 __alias && (__alias->flags & IR3_REG_FIRST_ALIAS); __alias = NULL) \
1975 for (unsigned __i = __start, __alias_n = 0; \
1976 __i < __instr->srcs_count && \
1977 (__i == __start || !ir3_src_is_first_in_group(__instr->srcs[__i])); \
1978 __i++, __alias_n++) \
1979 if ((__alias = __instr->srcs[__i]))
1980
1981 #define foreach_src_in_alias_group(__alias, __instr, __start) \
1982 foreach_src_in_alias_group_n (__alias, __alias_n, __instr, __start)
1983
1984 /* iterator for an instruction's destinations (reg), also returns dst #: */
1985 #define foreach_dst_n(__dstreg, __n, __instr) \
1986 if ((__instr)->dsts_count) \
1987 for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
1988 __dstreg = NULL) \
1989 for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt; \
1990 __n++) \
1991 if ((__dstreg = (__instr)->dsts[__n]))
1992
1993 /* iterator for an instruction's destinations (reg): */
1994 #define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1995
1996 #define foreach_dst_if(__dstreg, __instr, __filter) \
1997 foreach_dst (__dstreg, __instr) \
1998 if (__filter(__dstreg))
1999
2000 static inline unsigned
2001 __ssa_src_cnt(struct ir3_instruction *instr)
2002 {
2003 return instr->srcs_count + instr->deps_count;
2004 }
2005
2006 static inline bool
2007 __is_false_dep(struct ir3_instruction *instr, unsigned n)
2008 {
2009 if (n >= instr->srcs_count)
2010 return true;
2011 return false;
2012 }
2013
2014 static inline struct ir3_instruction **
2015 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
2016 {
2017 if (__is_false_dep(instr, n))
2018 return &instr->deps[n - instr->srcs_count];
2019 if (ssa(instr->srcs[n]))
2020 return &instr->srcs[n]->def->instr;
2021 return NULL;
2022 }
2023
2024 #define foreach_ssa_srcp_n(__srcp, __n, __instr) \
2025 for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \
2026 for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; \
2027 __n++) \
2028 if ((__srcp = __ssa_srcp_n(__instr, __n)))
2029
2030 #define foreach_ssa_srcp(__srcp, __instr) \
2031 foreach_ssa_srcp_n (__srcp, __i, __instr)
2032
2033 /* iterator for an instruction's SSA sources (instr), also returns src #: */
2034 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
2035 for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst; \
2036 __srcinst = NULL) \
2037 foreach_ssa_srcp_n (__srcp, __n, __instr) \
2038 if ((__srcinst = *__srcp))
2039
2040 /* iterator for an instruction's SSA sources (instr): */
2041 #define foreach_ssa_src(__srcinst, __instr) \
2042 foreach_ssa_src_n (__srcinst, __i, __instr)
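
/* Example (a sketch): walking up the SSA graph one level, including false
 * dependencies, to check whether any producer is a texture fetch:
 *
 *    foreach_ssa_src (src_instr, instr) {
 *       if (is_tex_or_prefetch(src_instr))
 *          return true;
 *    }
 */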
2043
2044 /* iterators for shader inputs: */
2045 #define foreach_input_n(__ininstr, __cnt, __ir) \
2046 for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr; \
2047 __ininstr = NULL) \
2048 for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
2049 if ((__ininstr = (__ir)->inputs[__cnt]))
2050 #define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
2051
2052 /* iterators for instructions: */
2053 #define foreach_instr(__instr, __list) \
2054 list_for_each_entry (struct ir3_instruction, __instr, __list, node)
2055 #define foreach_instr_from(__instr, __start, __list) \
2056 list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
2057 __list, node)
2058 #define foreach_instr_rev(__instr, __list) \
2059 list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
2060 #define foreach_instr_safe(__instr, __list) \
2061 list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
2062 #define foreach_instr_from_safe(__instr, __start, __list) \
2063 list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, \
2064 __list, node)
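
/* Example (a sketch): deleting nops from a block. The _safe variant is
 * required since the current instruction is unlinked while iterating:
 *
 *    foreach_instr_safe (instr, &block->instr_list) {
 *       if (instr->opc == OPC_NOP)
 *          list_delinit(&instr->node);
 *    }
 */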
2065
2066 /* Iterate over all instructions in a repeat group. */
2067 #define foreach_instr_rpt(__rpt, __instr) \
2068 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
2069 for (struct ir3_instruction *__rpt = __instr, *__first = __instr; \
2070 __first || __rpt != __instr; \
2071 __first = NULL, __rpt = \
2072 list_entry(__rpt->rpt_node.next, \
2073 struct ir3_instruction, rpt_node))
2074
2075 /* Iterate over all instructions except the first one in a repeat group. */
2076 #define foreach_instr_rpt_excl(__rpt, __instr) \
2077 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
2078 list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node, \
2079 rpt_node)
2080
2081 #define foreach_instr_rpt_excl_safe(__rpt, __instr) \
2082 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
2083 list_for_each_entry_safe (struct ir3_instruction, __rpt, \
2084 &__instr->rpt_node, rpt_node)
2085
2086 /* iterators for blocks: */
2087 #define foreach_block(__block, __list) \
2088 list_for_each_entry (struct ir3_block, __block, __list, node)
2089 #define foreach_block_safe(__block, __list) \
2090 list_for_each_entry_safe (struct ir3_block, __block, __list, node)
2091 #define foreach_block_rev(__block, __list) \
2092 list_for_each_entry_rev (struct ir3_block, __block, __list, node)
2093
2094 /* iterators for arrays: */
2095 #define foreach_array(__array, __list) \
2096 list_for_each_entry (struct ir3_array, __array, __list, node)
2097 #define foreach_array_safe(__array, __list) \
2098 list_for_each_entry_safe (struct ir3_array, __array, __list, node)
2099
2100 #define IR3_PASS(ir, pass, ...) \
2101 ({ \
2102 bool progress = pass(ir, ##__VA_ARGS__); \
2103 if (progress) { \
2104 ir3_debug_print(ir, "AFTER: " #pass); \
2105 ir3_validate(ir); \
2106 } \
2107 progress; \
2108 })
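
/* Example (a sketch of typical use, with `ir` and its shader variant `so` in
 * scope; the pass declarations appear further below): each pass gets a debug
 * print and IR validation only when it reports progress:
 *
 *    bool progress = false;
 *    progress |= IR3_PASS(ir, ir3_cp, so);
 *    progress |= IR3_PASS(ir, ir3_dce, so);
 */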
2109
2110 /* validate: */
2111 void ir3_validate(struct ir3 *ir);
2112
2113 /* dump: */
2114 void ir3_print(struct ir3 *ir);
2115 void ir3_print_instr(struct ir3_instruction *instr);
2116
2117 struct log_stream;
2118 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
2119
2120 /* delay calculation: */
2121 unsigned ir3_src_read_delay(struct ir3_compiler *compiler,
2122 struct ir3_instruction *instr, unsigned src_n);
2123 int ir3_delayslots(struct ir3_compiler *compiler,
2124 struct ir3_instruction *assigner,
2125 struct ir3_instruction *consumer, unsigned n, bool soft);
2126 unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
2127 struct ir3_instruction *assigner,
2128 struct ir3_instruction *consumer,
2129 unsigned assigner_n, unsigned consumer_n);
2130
2131 /* estimated (ss)/(sy) delay calculation */
2132
2133 static inline bool
2134 is_local_mem_load(struct ir3_instruction *instr)
2135 {
2136 return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
2137 instr->opc == OPC_LDLW;
2138 }
2139
2140 bool is_scalar_alu(struct ir3_instruction *instr,
2141 const struct ir3_compiler *compiler);
2142
2143 /* Does this instruction sometimes need (ss) to wait for its result? */
2144 static inline bool
2145 is_ss_producer(struct ir3_instruction *instr)
2146 {
2147 foreach_dst (dst, instr) {
2148 if (dst->flags & IR3_REG_SHARED)
2149 return true;
2150 }
2151
2152 if (instr->block->in_early_preamble && writes_addr1(instr))
2153 return true;
2154
2155 return is_sfu(instr) || is_local_mem_load(instr) || instr->opc == OPC_SHFL;
2156 }
2157
2158 static inline bool
2159 needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
2160 struct ir3_instruction *consumer)
2161 {
2162 if (is_scalar_alu(producer, compiler) &&
2163 is_scalar_alu(consumer, compiler) &&
2164 (producer->dsts[0]->flags & IR3_REG_HALF) ==
2165 (consumer->srcs[0]->flags & IR3_REG_HALF))
2166 return false;
2167
2168 return is_ss_producer(producer);
2169 }
2170
2171 static inline bool
2172 supports_ss(struct ir3_instruction *instr)
2173 {
2174 return opc_cat(instr->opc) < 5 || instr->opc == OPC_ALIAS;
2175 }
2176
2177 /* The soft delay for approximating the cost of (ss). */
2178 static inline unsigned
2179 soft_ss_delay(struct ir3_instruction *instr)
2180 {
2181 /* On a6xx, the number of delay slots it takes to get an SFU result back (ie.
2182 * using nops instead of (ss)) is:
2183 *
2184 * 8 - single warp
2185 * 9 - two warps
2186 * 10 - four warps
2187 *
2188 * and so on. Not quite sure where it tapers out (ie. how many warps share an
2189 * SFU unit). But 10 seems like a reasonable # to choose:
2190 */
2191 if (is_sfu(instr) || is_local_mem_load(instr))
2192 return 10;
2193
2194 /* The blob adds 6 nops between shared producers and consumers, and before we
2195 * used (ss) this was sufficient in most cases.
2196 */
2197 return 6;
2198 }
2199
2200 static inline bool
2201 is_sy_producer(struct ir3_instruction *instr)
2202 {
2203 return is_tex_or_prefetch(instr) ||
2204 (is_load(instr) && !is_local_mem_load(instr)) ||
2205 is_atomic(instr->opc);
2206 }
2207
2208 static inline unsigned
2209 soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
2210 {
2211 /* TODO: this is just an optimistic guess, we can do better post-RA.
2212 */
2213 bool double_wavesize =
2214 shader->type == MESA_SHADER_FRAGMENT ||
2215 shader->type == MESA_SHADER_COMPUTE;
2216
2217 unsigned components = reg_elems(instr->dsts[0]);
2218
2219 /* These numbers come from counting the number of delay slots to get
2220 * cat5/cat6 results back using nops instead of (sy). Note that these numbers
2221 * are with the result preloaded to cache by loading it before in the same
2222 * shader - uncached results are much larger.
2223 *
2224 * Note: most ALU instructions can't complete at the full doubled rate, so
2225 * they take 2 cycles. The only exception is fp16 instructions with no
2226 * built-in conversions. Therefore divide the latency by 2.
2227 *
2228 * TODO: Handle this properly in the scheduler and remove this.
2229 */
2230 if (instr->opc == OPC_LDC) {
2231 if (double_wavesize)
2232 return (21 + 8 * components) / 2;
2233 else
2234 return 18 + 4 * components;
2235 } else if (is_tex_or_prefetch(instr)) {
2236 if (double_wavesize) {
2237 switch (components) {
2238 case 1: return 58 / 2;
2239 case 2: return 60 / 2;
2240 case 3: return 77 / 2;
2241 case 4: return 79 / 2;
2242 default: unreachable("bad number of components");
2243 }
2244 } else {
2245 switch (components) {
2246 case 1: return 51;
2247 case 2: return 53;
2248 case 3: return 62;
2249 case 4: return 64;
2250 default: unreachable("bad number of components");
2251 }
2252 }
2253 } else {
2254 /* TODO: measure other cat6 opcodes like ldg */
2255 if (double_wavesize)
2256 return (172 + components) / 2;
2257 else
2258 return 109 + components;
2259 }
2260 }
2261
2262 /* Some instructions don't immediately consume their sources, so they may
2263 * introduce a WAR hazard.
2264 */
2265 static inline bool
2266 is_war_hazard_producer(struct ir3_instruction *instr)
2267 {
2268 return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
2269 instr->opc == OPC_STC;
2270 }
2271
2272 bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2273 bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2274 bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);
2275 bool ir3_create_alias_tex_regs(struct ir3 *ir);
2276 bool ir3_insert_alias_tex(struct ir3 *ir);
2277 bool ir3_create_alias_rt(struct ir3 *ir, struct ir3_shader_variant *v);
2278
2279 /* unreachable block elimination: */
2280 bool ir3_remove_unreachable(struct ir3 *ir);
2281
2282 /* calculate reconvergence information: */
2283 void ir3_calc_reconvergence(struct ir3_shader_variant *so);
2284
2285 /* lower invalid shared phis after calculating reconvergence information: */
2286 bool ir3_lower_shared_phis(struct ir3 *ir);
2287
2288 /* dead code elimination: */
2289 struct ir3_shader_variant;
2290 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
2291
2292 /* fp16 conversion folding */
2293 bool ir3_cf(struct ir3 *ir);
2294
2295 /* shared mov folding */
2296 bool ir3_shared_fold(struct ir3 *ir);
2297
2298 /* copy-propagate: */
2299 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
2300
2301 /* common subexpression elimination: */
2302 bool ir3_cse(struct ir3 *ir);
2303
2304 /* Make arrays SSA */
2305 bool ir3_array_to_ssa(struct ir3 *ir);
2306
2307 /* scheduling: */
2308 bool ir3_sched_add_deps(struct ir3 *ir);
2309 int ir3_sched(struct ir3 *ir);
2310
2311 struct ir3_context;
2312 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
2313
2314 /* register assignment: */
2315 int ir3_ra(struct ir3_shader_variant *v);
2316 void ir3_ra_predicates(struct ir3_shader_variant *v);
2317
2318 /* lower subgroup ops: */
2319 bool ir3_lower_subgroups(struct ir3 *ir);
2320
2321 /* legalize: */
2322 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
2323 bool ir3_legalize_relative(struct ir3 *ir);
2324
2325 static inline bool
2326 ir3_has_latency_to_hide(struct ir3 *ir)
2327 {
2328 /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
2329 * know the nature of the fragment shader. Just assume it will have
2330 * latency to hide:
2331 */
2332 if (ir->type != MESA_SHADER_FRAGMENT)
2333 return true;
2334
2335 foreach_block (block, &ir->block_list) {
2336 foreach_instr (instr, &block->instr_list) {
2337 if (is_tex_or_prefetch(instr))
2338 return true;
2339
2340 if (is_load(instr)) {
2341 switch (instr->opc) {
2342 case OPC_LDLV:
2343 case OPC_LDL:
2344 case OPC_LDLW:
2345 break;
2346 default:
2347 return true;
2348 }
2349 }
2350 }
2351 }
2352
2353 return false;
2354 }
2355
2356 /**
2357 * Move 'instr' to after the last phi node at the beginning of the block:
2358 */
2359 static inline void
2360 ir3_instr_move_after_phis(struct ir3_instruction *instr,
2361 struct ir3_block *block)
2362 {
2363 struct ir3_instruction *last_phi = ir3_block_get_last_phi(block);
2364 if (last_phi)
2365 ir3_instr_move_after(instr, last_phi);
2366 else
2367 ir3_instr_move_before_block(instr, block);
2368 }
2369
2370 static inline struct ir3_cursor
2371 ir3_before_block(struct ir3_block *block)
2372 {
2373 assert(block);
2374 struct ir3_cursor cursor;
2375 cursor.option = IR3_CURSOR_BEFORE_BLOCK;
2376 cursor.block = block;
2377 return cursor;
2378 }
2379
2380 static inline struct ir3_cursor
2381 ir3_after_block(struct ir3_block *block)
2382 {
2383 assert(block);
2384 struct ir3_cursor cursor;
2385 cursor.option = IR3_CURSOR_AFTER_BLOCK;
2386 cursor.block = block;
2387 return cursor;
2388 }
2389
2390 static inline struct ir3_cursor
2391 ir3_before_instr(struct ir3_instruction *instr)
2392 {
2393 assert(instr);
2394 struct ir3_cursor cursor;
2395 cursor.option = IR3_CURSOR_BEFORE_INSTR;
2396 cursor.instr = instr;
2397 return cursor;
2398 }
2399
2400 static inline struct ir3_cursor
2401 ir3_after_instr(struct ir3_instruction *instr)
2402 {
2403 assert(instr);
2404 struct ir3_cursor cursor;
2405 cursor.option = IR3_CURSOR_AFTER_INSTR;
2406 cursor.instr = instr;
2407 return cursor;
2408 }
2409
2410 static inline struct ir3_cursor
2411 ir3_before_terminator(struct ir3_block *block)
2412 {
2413 assert(block);
2414 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
2415
2416 if (terminator)
2417 return ir3_before_instr(terminator);
2418 return ir3_after_block(block);
2419 }
2420
2421 static inline struct ir3_cursor
2422 ir3_after_phis(struct ir3_block *block)
2423 {
2424 assert(block);
2425
2426 foreach_instr (instr, &block->instr_list) {
2427 if (instr->opc != OPC_META_PHI)
2428 return ir3_before_instr(instr);
2429 }
2430
2431 return ir3_after_block(block);
2432 }
2433
2434 static inline struct ir3_cursor
2435 ir3_after_instr_and_phis(struct ir3_instruction *instr)
2436 {
2437 if (instr->opc == OPC_META_PHI) {
2438 return ir3_after_phis(instr->block);
2439 } else {
2440 return ir3_after_instr(instr);
2441 }
2442 }
2443
2444 static inline struct ir3_builder
2445 ir3_builder_at(struct ir3_cursor cursor)
2446 {
2447 struct ir3_builder builder;
2448 builder.cursor = cursor;
2449 return builder;
2450 }
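
/* Example (a sketch; `block` is some hypothetical ir3_block): building a new
 * instruction at the top of a block, after any phi nodes, using the immediate
 * helper declared below:
 *
 *    struct ir3_builder b = ir3_builder_at(ir3_after_phis(block));
 *    struct ir3_instruction *zero = create_immed(&b, 0);
 */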
2451
2452
2453 /* ************************************************************************* */
2454 /* instruction helpers */
2455
2456 /* creates SSA src of correct type (ie. half vs full precision) */
2457 static inline struct ir3_register *
2458 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
2459 unsigned flags)
2460 {
2461 struct ir3_register *reg;
2462 flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
2463 reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
2464 reg->def = src->dsts[0];
2465 reg->wrmask = src->dsts[0]->wrmask;
2466 return reg;
2467 }
2468
2469 static inline struct ir3_register *
2470 __ssa_dst(struct ir3_instruction *instr)
2471 {
2472 struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
2473 reg->instr = instr;
2474 return reg;
2475 }
2476
2477 static BITMASK_ENUM(ir3_register_flags)
2478 type_flags(type_t type)
2479 {
2480 if (type_size(type) < 32)
2481 return IR3_REG_HALF;
2482 return (ir3_register_flags)0;
2483 }
2484
2485 static inline struct ir3_instruction *
2486 create_immed_typed_shared(struct ir3_builder *build, uint32_t val, type_t type,
2487 bool shared)
2488 {
2489 struct ir3_instruction *mov;
2490 ir3_register_flags flags = type_flags(type);
2491
2492 mov = ir3_build_instr(build, OPC_MOV, 1, 1);
2493 mov->cat1.src_type = type;
2494 mov->cat1.dst_type = type;
2495 __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
2496 ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
2497
2498 return mov;
2499 }
2500
2501 static inline struct ir3_instruction *
2502 create_immed_typed(struct ir3_builder *build, uint32_t val, type_t type)
2503 {
2504 return create_immed_typed_shared(build, val, type, false);
2505 }
2506
2507 static inline struct ir3_instruction *
2508 create_immed_shared(struct ir3_builder *build, uint32_t val, bool shared)
2509 {
2510 return create_immed_typed_shared(build, val, TYPE_U32, shared);
2511 }
2512
2513 static inline struct ir3_instruction *
2514 create_immed(struct ir3_builder *build, uint32_t val)
2515 {
2516 return create_immed_shared(build, val, false);
2517 }
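
/* Example (a sketch; `b` is an ir3_builder): materializing typed immediates.
 * 0x3c00 is the raw fp16 bit pattern for 1.0:
 *
 *    struct ir3_instruction *one_u32 = create_immed(&b, 1);
 *    struct ir3_instruction *one_f16 = create_immed_typed(&b, 0x3c00, TYPE_F16);
 */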
2518
2519 static inline struct ir3_instruction *
2520 create_uniform_typed(struct ir3_builder *build, unsigned n, type_t type)
2521 {
2522 struct ir3_instruction *mov;
2523 ir3_register_flags flags = type_flags(type);
2524
2525 mov = ir3_build_instr(build, OPC_MOV, 1, 1);
2526 mov->cat1.src_type = type;
2527 mov->cat1.dst_type = type;
2528 __ssa_dst(mov)->flags |= flags;
2529 ir3_src_create(mov, n, IR3_REG_CONST | flags);
2530
2531 return mov;
2532 }
2533
2534 static inline struct ir3_instruction *
2535 create_uniform(struct ir3_builder *build, unsigned n)
2536 {
2537 return create_uniform_typed(build, n, TYPE_F32);
2538 }
2539
2540 static inline struct ir3_instruction *
2541 create_uniform_indirect(struct ir3_builder *build, int n, type_t type,
2542 struct ir3_instruction *address)
2543 {
2544 struct ir3_instruction *mov;
2545
2546 mov = ir3_build_instr(build, OPC_MOV, 1, 1);
2547 mov->cat1.src_type = type;
2548 mov->cat1.dst_type = type;
2549 __ssa_dst(mov);
2550 ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
2551
2552 ir3_instr_set_address(mov, address);
2553
2554 return mov;
2555 }
2556
2557 static inline struct ir3_instruction *
2558 ir3_MOV(struct ir3_builder *build, struct ir3_instruction *src, type_t type)
2559 {
2560 struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
2561 ir3_register_flags flags = type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);
2562
2563 __ssa_dst(instr)->flags |= flags;
2564 if (src->dsts[0]->flags & IR3_REG_ARRAY) {
2565 struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
2566 src_reg->array = src->dsts[0]->array;
2567 } else {
2568 __ssa_src(instr, src, 0);
2569 }
2570 assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
2571 instr->cat1.src_type = type;
2572 instr->cat1.dst_type = type;
2573 return instr;
2574 }
2575
2576 static inline struct ir3_instruction_rpt
2577 ir3_MOV_rpt(struct ir3_builder *build, unsigned nrpt,
2578 struct ir3_instruction_rpt src, type_t type)
2579 {
2580 struct ir3_instruction_rpt dst;
2581 assert(nrpt <= ARRAY_SIZE(dst.rpts));
2582
2583 for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2584 dst.rpts[rpt] = ir3_MOV(build, src.rpts[rpt], type);
2585
2586 ir3_instr_create_rpt(dst.rpts, nrpt);
2587 return dst;
2588 }
2589
2590 static inline struct ir3_instruction *
2591 ir3_COV(struct ir3_builder *build, struct ir3_instruction *src, type_t src_type,
2592 type_t dst_type)
2593 {
2594 struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOV, 1, 1);
2595 ir3_register_flags dst_flags = type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
2596 ASSERTED ir3_register_flags src_flags = type_flags(src_type);
2597
2598 assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
2599
2600 __ssa_dst(instr)->flags |= dst_flags;
2601 __ssa_src(instr, src, 0);
2602 instr->cat1.src_type = src_type;
2603 instr->cat1.dst_type = dst_type;
2604 assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
2605 return instr;
2606 }
2607
2608 static inline struct ir3_instruction_rpt
2609 ir3_COV_rpt(struct ir3_builder *build, unsigned nrpt,
2610 struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
2611 {
2612 struct ir3_instruction_rpt dst;
2613
2614 for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2615 dst.rpts[rpt] = ir3_COV(build, src.rpts[rpt], src_type, dst_type);
2616
2617 ir3_instr_create_rpt(dst.rpts, nrpt);
2618 return dst;
2619 }
2620
2621 static inline struct ir3_instruction *
2622 ir3_MOVMSK(struct ir3_builder *build, unsigned components)
2623 {
2624 struct ir3_instruction *instr = ir3_build_instr(build, OPC_MOVMSK, 1, 0);
2625
2626 struct ir3_register *dst = __ssa_dst(instr);
2627 dst->flags |= IR3_REG_SHARED;
2628 dst->wrmask = (1 << components) - 1;
2629 instr->repeat = components - 1;
2630 return instr;
2631 }
2632
2633 static inline struct ir3_instruction *
2634 ir3_BALLOT_MACRO(struct ir3_builder *build, struct ir3_instruction *src,
2635 unsigned components)
2636 {
2637 struct ir3_instruction *instr =
2638 ir3_build_instr(build, OPC_BALLOT_MACRO, 1, 1);
2639
2640 struct ir3_register *dst = __ssa_dst(instr);
2641 dst->flags |= IR3_REG_SHARED;
2642 dst->wrmask = (1 << components) - 1;
2643
2644 __ssa_src(instr, src, 0);
2645
2646 return instr;
2647 }
2648
2649 /* clang-format off */
2650 #define __INSTR0(flag, name, opc) \
2651 static inline struct ir3_instruction *ir3_##name(struct ir3_builder *build) \
2652 { \
2653 struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 0); \
2654 instr->flags |= flag; \
2655 return instr; \
2656 }
2657 /* clang-format on */
2658 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
2659 #define INSTR0(name) __INSTR0((ir3_instruction_flags)0, name, OPC_##name)
2660
2661 /* clang-format off */
2662 #define __INSTR1(flag, dst_count, name, opc, scalar_alu) \
2663 static inline struct ir3_instruction *ir3_##name( \
2664 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags) \
2665 { \
2666 struct ir3_instruction *instr = \
2667 ir3_build_instr(build, opc, dst_count, 1); \
2668 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0; \
2669 for (unsigned i = 0; i < dst_count; i++) \
2670 __ssa_dst(instr)->flags |= dst_flag; \
2671 __ssa_src(instr, a, aflags); \
2672 instr->flags |= flag; \
2673 return instr; \
2674 } \
2675 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2676 struct ir3_builder *build, unsigned nrpt, \
2677 struct ir3_instruction_rpt a, unsigned aflags) \
2678 { \
2679 struct ir3_instruction_rpt dst; \
2680 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2681 for (unsigned rpt = 0; rpt < nrpt; rpt++) \
2682 dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags); \
2683 ir3_instr_create_rpt(dst.rpts, nrpt); \
2684 return dst; \
2685 }
2686
2687 /* clang-format on */
2688 #define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2689 false)
2690 #define INSTR1(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2691 #define INSTR1S(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2692 #define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2693
2694 /* clang-format off */
2695 #define __INSTR2(flag, dst_count, name, opc, scalar_alu) \
2696 static inline struct ir3_instruction *ir3_##name( \
2697 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags, \
2698 struct ir3_instruction *b, unsigned bflags) \
2699 { \
2700 struct ir3_instruction *instr = ir3_build_instr(build, opc, dst_count, 2); \
2701 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags & \
2702 IR3_REG_SHARED) : 0; \
2703 for (unsigned i = 0; i < dst_count; i++) \
2704 __ssa_dst(instr)->flags |= dst_flag; \
2705 __ssa_src(instr, a, aflags); \
2706 __ssa_src(instr, b, bflags); \
2707 instr->flags |= flag; \
2708 return instr; \
2709 } \
2710 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2711 struct ir3_builder *build, unsigned nrpt, \
2712 struct ir3_instruction_rpt a, unsigned aflags, \
2713 struct ir3_instruction_rpt b, unsigned bflags) \
2714 { \
2715 struct ir3_instruction_rpt dst; \
2716 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2717 for (unsigned rpt = 0; rpt < nrpt; rpt++) { \
2718 dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags, \
2719 b.rpts[rpt], bflags); \
2720 } \
2721 ir3_instr_create_rpt(dst.rpts, nrpt); \
2722 return dst; \
2723 }
2724 /* clang-format on */
2725 #define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2726 false)
2727 #define INSTR2(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2728 #define INSTR2S(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2729 #define INSTR2NODST(name) __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
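
/* For example, INSTR2S(ADD_F) further below expands to an ir3_ADD_F()
 * builder (plus an ir3_ADD_F_rpt() variant). A sketch of use, with `b` an
 * ir3_builder and a/c hypothetical SSA values, computing a - c by applying
 * the float-negate source flag:
 *
 *    struct ir3_instruction *diff =
 *       ir3_ADD_F(&b, a, 0, c, IR3_REG_FNEG);
 */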
2730
2731 /* clang-format off */
2732 #define __INSTR3(flag, dst_count, name, opc, scalar_alu) \
2733 static inline struct ir3_instruction *ir3_##name( \
2734 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags, \
2735 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2736 unsigned cflags) \
2737 { \
2738 struct ir3_instruction *instr = \
2739 ir3_build_instr(build, opc, dst_count, 3); \
2740 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags & \
2741 c->dsts[0]->flags & IR3_REG_SHARED) : 0; \
2742 for (unsigned i = 0; i < dst_count; i++) \
2743 __ssa_dst(instr)->flags |= dst_flag; \
2744 __ssa_src(instr, a, aflags); \
2745 __ssa_src(instr, b, bflags); \
2746 __ssa_src(instr, c, cflags); \
2747 instr->flags |= flag; \
2748 return instr; \
2749 } \
2750 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2751 struct ir3_builder *build, unsigned nrpt, \
2752 struct ir3_instruction_rpt a, unsigned aflags, \
2753 struct ir3_instruction_rpt b, unsigned bflags, \
2754 struct ir3_instruction_rpt c, unsigned cflags) \
2755 { \
2756 struct ir3_instruction_rpt dst; \
2757 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2758 for (unsigned rpt = 0; rpt < nrpt; rpt++) { \
2759 dst.rpts[rpt] = ir3_##name(build, a.rpts[rpt], aflags, \
2760 b.rpts[rpt], bflags, \
2761 c.rpts[rpt], cflags); \
2762 } \
2763 ir3_instr_create_rpt(dst.rpts, nrpt); \
2764 return dst; \
2765 }
2766 /* clang-format on */
2767 #define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2768 false)
2769 #define INSTR3(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2770 #define INSTR3S(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2771 #define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2772
2773 /* clang-format off */
2774 #define __INSTR4(flag, dst_count, name, opc) \
2775 static inline struct ir3_instruction *ir3_##name( \
2776 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags, \
2777 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2778 unsigned cflags, struct ir3_instruction *d, unsigned dflags) \
2779 { \
2780 struct ir3_instruction *instr = \
2781 ir3_build_instr(build, opc, dst_count, 4); \
2782 for (unsigned i = 0; i < dst_count; i++) \
2783 __ssa_dst(instr); \
2784 __ssa_src(instr, a, aflags); \
2785 __ssa_src(instr, b, bflags); \
2786 __ssa_src(instr, c, cflags); \
2787 __ssa_src(instr, d, dflags); \
2788 instr->flags |= flag; \
2789 return instr; \
2790 }
2791 /* clang-format on */
2792 #define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
2793 #define INSTR4(name) __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
2794 #define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2795
2796 /* clang-format off */
2797 #define __INSTR5(flag, name, opc) \
2798 static inline struct ir3_instruction *ir3_##name( \
2799 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags, \
2800 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2801 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
2802 struct ir3_instruction *e, unsigned eflags) \
2803 { \
2804 struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 5); \
2805 __ssa_dst(instr); \
2806 __ssa_src(instr, a, aflags); \
2807 __ssa_src(instr, b, bflags); \
2808 __ssa_src(instr, c, cflags); \
2809 __ssa_src(instr, d, dflags); \
2810 __ssa_src(instr, e, eflags); \
2811 instr->flags |= flag; \
2812 return instr; \
2813 }
2814 /* clang-format on */
2815 #define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
2816 #define INSTR5(name) __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2817
2818 /* clang-format off */
2819 #define __INSTR6(flag, dst_count, name, opc) \
2820 static inline struct ir3_instruction *ir3_##name( \
2821 struct ir3_builder *build, struct ir3_instruction *a, unsigned aflags, \
2822 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2823 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
2824 struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f, \
2825 unsigned fflags) \
2826 { \
2827 struct ir3_instruction *instr = ir3_build_instr(build, opc, 1, 6); \
2828 for (unsigned i = 0; i < dst_count; i++) \
2829 __ssa_dst(instr); \
2830 __ssa_src(instr, a, aflags); \
2831 __ssa_src(instr, b, bflags); \
2832 __ssa_src(instr, c, cflags); \
2833 __ssa_src(instr, d, dflags); \
2834 __ssa_src(instr, e, eflags); \
2835 __ssa_src(instr, f, fflags); \
2836 instr->flags |= flag; \
2837 return instr; \
2838 }
2839 /* clang-format on */
2840 #define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
2841 #define INSTR6(name) __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
2842 #define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2843
2844 /* cat0 instructions: */
2845 INSTR0(NOP)
2846 INSTR1NODST(BR)
2847 INSTR1NODST(BALL)
2848 INSTR1NODST(BANY)
2849 INSTR2NODST(BRAA)
2850 INSTR2NODST(BRAO)
2851 INSTR0(JUMP)
2852 INSTR1NODST(KILL)
2853 INSTR1NODST(DEMOTE)
2854 INSTR0(END)
2855 INSTR0(CHSH)
2856 INSTR0(CHMASK)
2857 INSTR1NODST(PREDT)
2858 INSTR1NODST(PREDF)
2859 INSTR0(PREDE)
2860 INSTR0(GETONE)
2861 INSTR0(GETLAST)
2862 INSTR0(SHPS)
2863 INSTR0(SHPE)
2864
2865 /* cat1 macros */
2866 INSTR1(ANY_MACRO)
2867 INSTR1(ALL_MACRO)
2868 INSTR1(READ_FIRST_MACRO)
2869 INSTR2(READ_COND_MACRO)
2870 INSTR1(READ_GETLAST_MACRO)
2871
2872 static inline struct ir3_instruction *
2873 ir3_ELECT_MACRO(struct ir3_builder *build)
2874 {
2875 struct ir3_instruction *instr =
2876 ir3_build_instr(build, OPC_ELECT_MACRO, 1, 0);
2877 __ssa_dst(instr);
2878 return instr;
2879 }
2880
2881 static inline struct ir3_instruction *
2882 ir3_SHPS_MACRO(struct ir3_builder *build)
2883 {
2884 struct ir3_instruction *instr = ir3_build_instr(build, OPC_SHPS_MACRO, 1, 0);
2885 __ssa_dst(instr);
2886 return instr;
2887 }
2888
2889 /* cat2 instructions, most 2 src but some 1 src: */
2890 INSTR2S(ADD_F)
2891 INSTR2S(MIN_F)
2892 INSTR2S(MAX_F)
2893 INSTR2S(MUL_F)
2894 INSTR1S(SIGN_F)
2895 INSTR2S(CMPS_F)
2896 INSTR1S(ABSNEG_F)
2897 INSTR2S(CMPV_F)
2898 INSTR1S(FLOOR_F)
2899 INSTR1S(CEIL_F)
2900 INSTR1S(RNDNE_F)
2901 INSTR1S(RNDAZ_F)
2902 INSTR1S(TRUNC_F)
2903 INSTR2S(ADD_U)
2904 INSTR2S(ADD_S)
2905 INSTR2S(SUB_U)
2906 INSTR2S(SUB_S)
2907 INSTR2S(CMPS_U)
2908 INSTR2S(CMPS_S)
2909 INSTR2S(MIN_U)
2910 INSTR2S(MIN_S)
2911 INSTR2S(MAX_U)
2912 INSTR2S(MAX_S)
2913 INSTR1S(ABSNEG_S)
2914 INSTR2S(AND_B)
2915 INSTR2S(OR_B)
2916 INSTR1S(NOT_B)
2917 INSTR2S(XOR_B)
2918 INSTR2S(CMPV_U)
2919 INSTR2S(CMPV_S)
2920 INSTR2S(MUL_U24)
2921 INSTR2S(MUL_S24)
2922 INSTR2S(MULL_U)
2923 INSTR1S(BFREV_B)
2924 INSTR1S(CLZ_S)
2925 INSTR1S(CLZ_B)
2926 INSTR2S(SHL_B)
2927 INSTR2S(SHR_B)
2928 INSTR2S(ASHR_B)
2929 INSTR2(BARY_F)
2930 INSTR2(FLAT_B)
2931 INSTR2S(MGEN_B)
2932 INSTR2S(GETBIT_B)
2933 INSTR1(SETRM)
2934 INSTR1S(CBITS_B)
2935 INSTR2S(SHB)
2936 INSTR2S(MSAD)
2937
2938 /* cat3 instructions: */
2939 INSTR3(MAD_U16)
2940 INSTR3(MADSH_U16)
2941 INSTR3(MAD_S16)
2942 INSTR3(MADSH_M16)
2943 INSTR3(MAD_U24)
2944 INSTR3(MAD_S24)
2945 INSTR3(MAD_F16)
2946 INSTR3(MAD_F32)
2947 INSTR3(DP2ACC)
2948 INSTR3(DP4ACC)
2949 /* NOTE: SEL_B32 checks for zero vs nonzero */
2950 INSTR3S(SEL_B16)
2951 INSTR3S(SEL_B32)
2952 INSTR3S(SEL_S16)
2953 INSTR3S(SEL_S32)
2954 INSTR3S(SEL_F16)
2955 INSTR3S(SEL_F32)
2956 INSTR3(SAD_S16)
2957 INSTR3(SAD_S32)
2958 INSTR3S(SHRM)
2959 INSTR3S(SHLM)
2960 INSTR3S(SHRG)
2961 INSTR3S(SHLG)
2962 INSTR3S(ANDG)
2963
2964 /* cat4 instructions: */
2965 INSTR1S(RCP)
2966 INSTR1S(RSQ)
2967 INSTR1S(HRSQ)
2968 INSTR1S(LOG2)
2969 INSTR1S(HLOG2)
2970 INSTR1S(EXP2)
2971 INSTR1S(HEXP2)
2972 INSTR1S(SIN)
2973 INSTR1S(COS)
2974 INSTR1S(SQRT)
2975
2976 /* cat5 instructions: */
2977 INSTR1(DSX)
2978 INSTR1(DSXPP_MACRO)
2979 INSTR1(DSY)
2980 INSTR1(DSYPP_MACRO)
2981 INSTR1F(3D, DSX)
2982 INSTR1F(3D, DSY)
2983 INSTR1(RGETPOS)
2984
2985 static inline struct ir3_instruction *
2986 ir3_SAM(struct ir3_builder *build, opc_t opc, type_t type, unsigned wrmask,
2987 ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
2988 struct ir3_instruction *src0, struct ir3_instruction *src1)
2989 {
2990 struct ir3_instruction *sam;
2991 unsigned nreg = 0;
2992
2993 if (flags & IR3_INSTR_S2EN) {
2994 nreg++;
2995 }
2996 if (src0 || opc == OPC_SAM) {
2997 nreg++;
2998 }
2999 if (src1) {
3000 nreg++;
3001 }
3002
3003 sam = ir3_build_instr(build, opc, 1, nreg);
3004 sam->flags |= flags;
3005 __ssa_dst(sam)->wrmask = wrmask;
3006 if (flags & IR3_INSTR_S2EN) {
3007 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
3008 }
3009 if (src0) {
3010 __ssa_src(sam, src0, 0);
3011 } else if (opc == OPC_SAM) {
3012 /* Create a dummy shared source for the coordinate, for the prefetch
3013 * case. It needs to be shared so that we don't accidentally disable early
3014 * preamble, and this is what the blob does.
3015 */
3016 ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
3017 }
3018 if (src1) {
3019 __ssa_src(sam, src1, 0);
3020 }
3021 sam->cat5.type = type;
3022
3023 return sam;
3024 }
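
/* Example (a sketch; samp_tex and coords are hypothetical SSA values holding
 * the sampler/texture handle and packed texture coordinates): a 4-component
 * fp32 sample using the S2EN path:
 *
 *    struct ir3_instruction *sam =
 *       ir3_SAM(&b, OPC_SAM, TYPE_F32, 0xf, IR3_INSTR_S2EN,
 *               samp_tex, coords, NULL);
 */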
3025
3026 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
3027 * value or is set to ry. In order to model this in SSA form, we add an extra
3028 * argument (the initial value of rx) and tie it to the destination.
3029 */
3030 static inline struct ir3_instruction *
3031 ir3_BRCST_ACTIVE(struct ir3_builder *build, unsigned cluster_size,
3032 struct ir3_instruction *src,
3033 struct ir3_instruction *dst_default)
3034 {
3035 struct ir3_instruction *brcst =
3036 ir3_build_instr(build, OPC_BRCST_ACTIVE, 1, 2);
3037 brcst->cat5.cluster_size = cluster_size;
3038 brcst->cat5.type = TYPE_U32;
3039 struct ir3_register *brcst_dst = __ssa_dst(brcst);
3040 __ssa_src(brcst, src, 0);
3041 struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
3042 ir3_reg_tie(brcst_dst, default_src);
3043 return brcst;
3044 }
3045
3046 /* cat6 instructions: */
3047 INSTR0(GETFIBERID)
3048 INSTR2(LDLV)
3049 INSTR3(LDG)
3050 INSTR3(LDL)
3051 INSTR3(LDLW)
3052 INSTR3(LDP)
3053 INSTR4NODST(STG)
3054 INSTR3NODST(STL)
3055 INSTR3NODST(STLW)
3056 INSTR3NODST(STP)
3057 INSTR1(RESINFO)
3058 INSTR1(RESFMT)
3059 INSTR2(ATOMIC_ADD)
3060 INSTR2(ATOMIC_SUB)
3061 INSTR2(ATOMIC_XCHG)
3062 INSTR2(ATOMIC_INC)
3063 INSTR2(ATOMIC_DEC)
3064 INSTR2(ATOMIC_CMPXCHG)
3065 INSTR2(ATOMIC_MIN)
3066 INSTR2(ATOMIC_MAX)
3067 INSTR2(ATOMIC_AND)
3068 INSTR2(ATOMIC_OR)
3069 INSTR2(ATOMIC_XOR)
3070 INSTR2(LDC)
3071 INSTR2(QUAD_SHUFFLE_BRCST)
3072 INSTR1(QUAD_SHUFFLE_HORIZ)
3073 INSTR1(QUAD_SHUFFLE_VERT)
3074 INSTR1(QUAD_SHUFFLE_DIAG)
3075 INSTR2NODST(LDC_K)
3076 INSTR2NODST(STC)
3077 INSTR2NODST(STSC)
3078 INSTR2(SHFL)
3079 #ifndef GPU
3080 #elif GPU >= 600
3081 INSTR4NODST(STIB);
3082 INSTR3(LDIB);
3083 INSTR5(LDG_A);
3084 INSTR6NODST(STG_A);
3085 INSTR2(ATOMIC_G_ADD)
3086 INSTR2(ATOMIC_G_SUB)
3087 INSTR2(ATOMIC_G_XCHG)
3088 INSTR2(ATOMIC_G_INC)
3089 INSTR2(ATOMIC_G_DEC)
3090 INSTR2(ATOMIC_G_CMPXCHG)
3091 INSTR2(ATOMIC_G_MIN)
3092 INSTR2(ATOMIC_G_MAX)
3093 INSTR2(ATOMIC_G_AND)
3094 INSTR2(ATOMIC_G_OR)
3095 INSTR2(ATOMIC_G_XOR)
3096 INSTR3(ATOMIC_B_ADD)
3097 INSTR3(ATOMIC_B_SUB)
3098 INSTR3(ATOMIC_B_XCHG)
3099 INSTR3(ATOMIC_B_INC)
3100 INSTR3(ATOMIC_B_DEC)
3101 INSTR3(ATOMIC_B_CMPXCHG)
3102 INSTR3(ATOMIC_B_MIN)
3103 INSTR3(ATOMIC_B_MAX)
3104 INSTR3(ATOMIC_B_AND)
3105 INSTR3(ATOMIC_B_OR)
3106 INSTR3(ATOMIC_B_XOR)
3107 #elif GPU >= 400
3108 INSTR3(LDGB)
3109 #if GPU >= 500
3110 INSTR3(LDIB)
3111 #endif
3112 INSTR4NODST(STGB)
3113 INSTR4NODST(STIB)
3114 INSTR4(ATOMIC_S_ADD)
3115 INSTR4(ATOMIC_S_SUB)
3116 INSTR4(ATOMIC_S_XCHG)
3117 INSTR4(ATOMIC_S_INC)
3118 INSTR4(ATOMIC_S_DEC)
3119 INSTR4(ATOMIC_S_CMPXCHG)
3120 INSTR4(ATOMIC_S_MIN)
3121 INSTR4(ATOMIC_S_MAX)
3122 INSTR4(ATOMIC_S_AND)
3123 INSTR4(ATOMIC_S_OR)
3124 INSTR4(ATOMIC_S_XOR)
3125 #endif
3126 INSTR4NODST(LDG_K)
3127 INSTR5(RAY_INTERSECTION)
3128
3129 /* cat7 instructions: */
3130 INSTR0(BAR)
3131 INSTR0(FENCE)
3132 INSTR0(CCINV)
3133
3134 /* ************************************************************************* */
3135 #include "util/bitset.h"
3136
3137 #define MAX_REG 256
3138
3139 typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
3140 typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
3141 typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
3142 typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);
3143
3144 typedef struct {
3145 bool mergedregs;
3146 fullstate_t full;
3147 halfstate_t half;
3148 sharedstate_t shared;
3149 nongprstate_t nongpr;
3150 } regmask_t;
3151
3152 static inline BITSET_WORD *
3153 __regmask_file(regmask_t *regmask, enum ir3_reg_file file)
3154 {
3155 switch (file) {
3156 case IR3_FILE_FULL:
3157 return regmask->full;
3158 case IR3_FILE_HALF:
3159 return regmask->half;
3160 case IR3_FILE_SHARED:
3161 return regmask->shared;
3162 case IR3_FILE_NONGPR:
3163 return regmask->nongpr;
3164 }
3165 unreachable("bad file");
3166 }
3167
3168 static inline bool
3169 __regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3170 {
3171 BITSET_WORD *regs = __regmask_file(regmask, file);
3172 for (unsigned i = 0; i < size; i++) {
3173 if (BITSET_TEST(regs, n + i))
3174 return true;
3175 }
3176 return false;
3177 }
3178
3179 static inline void
3180 __regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3181 {
3182 BITSET_WORD *regs = __regmask_file(regmask, file);
3183 for (unsigned i = 0; i < size; i++)
3184 BITSET_SET(regs, n + i);
3185 }
3186
3187 static inline void
3188 __regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3189 {
3190 BITSET_WORD *regs = __regmask_file(regmask, file);
3191 for (unsigned i = 0; i < size; i++)
3192 BITSET_CLEAR(regs, n + i);
3193 }
3194
3195 static inline void
3196 regmask_init(regmask_t *regmask, bool mergedregs)
3197 {
3198 memset(regmask, 0, sizeof(*regmask));
3199 regmask->mergedregs = mergedregs;
3200 }
3201
3202 static inline void
3203 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
3204 {
3205 assert(dst->mergedregs == a->mergedregs);
3206 assert(dst->mergedregs == b->mergedregs);
3207
3208 for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
3209 dst->full[i] = a->full[i] | b->full[i];
3210 for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
3211 dst->half[i] = a->half[i] | b->half[i];
3212 for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3213 dst->shared[i] = a->shared[i] | b->shared[i];
3214 for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
3215 dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
3216 }
3217
3218 static inline void
3219 regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
3220 {
3221 for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3222 dst->shared[i] = a->shared[i] | b->shared[i];
3223 }
3224
3225 static inline void
3226 regmask_set(regmask_t *regmask, struct ir3_register *reg)
3227 {
3228 unsigned size = reg_elem_size(reg);
3229 enum ir3_reg_file file;
3230 unsigned num = post_ra_reg_num(reg);
3231 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3232 if (reg->flags & IR3_REG_RELATIV) {
3233 __regmask_set(regmask, file, n, size * reg->size);
3234 } else {
3235 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3236 if (mask & 1)
3237 __regmask_set(regmask, file, n, size);
3238 }
3239 }
3240
3241 static inline void
3242 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
3243 {
3244 unsigned size = reg_elem_size(reg);
3245 enum ir3_reg_file file;
3246 unsigned num = post_ra_reg_num(reg);
3247 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3248 if (reg->flags & IR3_REG_RELATIV) {
3249 __regmask_clear(regmask, file, n, size * reg->size);
3250 } else {
3251 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3252 if (mask & 1)
3253 __regmask_clear(regmask, file, n, size);
3254 }
3255 }
3256
3257 static inline bool
3258 regmask_get(regmask_t *regmask, struct ir3_register *reg)
3259 {
3260 unsigned size = reg_elem_size(reg);
3261 enum ir3_reg_file file;
3262 unsigned num = post_ra_reg_num(reg);
3263 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3264 if (reg->flags & IR3_REG_RELATIV) {
3265 return __regmask_get(regmask, file, n, size * reg->size);
3266 } else {
3267 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3268 if (mask & 1)
3269 if (__regmask_get(regmask, file, n, size))
3270 return true;
3271 }
3272 return false;
3273 }
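
/* Example (a sketch of legalize-style hazard tracking, assuming a shader
 * variant `v` with a mergedregs flag and in-flight producer/consumer
 * instructions): record the dsts of pending (sy) producers and flag a
 * consumer that reads one of them:
 *
 *    regmask_t needs_sy;
 *    regmask_init(&needs_sy, v->mergedregs);
 *
 *    regmask_set(&needs_sy, producer->dsts[0]);
 *    ...
 *    if (regmask_get(&needs_sy, consumer->srcs[0]))
 *       consumer->flags |= IR3_INSTR_SY;
 */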
3274 /* ************************************************************************* */
3275
3276 #endif /* IR3_H_ */
3277