/*
 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdbool.h>
#include <stdint.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "freedreno_common.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;

struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};

struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;
   struct ir3_register **regs;
};

typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer. The CP pass needs to know what is intended (int or
    * float) in order to do the right thing. For this reason the
    * abs/neg flags are split out into float and int variants. In
    * addition, for .b (bitwise) operations the negate is actually a
    * bitwise not, so that is split out into a new flag to make it
    * clearer.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input? Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),
} ir3_register_flags;

struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
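   /* Worked example of the encoding above: r2.z is (2 << 2) | 2 == 10, and
    * hr5.w (ie. with IR3_REG_HALF set) is (5 << 2) | 3 == 23. The register
    * and component can be recovered with the reg_num()/reg_comp() helpers
    * further down in this header.
    */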
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};

/*
 * Stupid/simple growable array implementation:
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
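
/* A minimal usage sketch (hypothetical container struct): DECLARE_ARRAY
 * expands to a count/size/pointer triple, and array_insert appends with
 * amortized doubling (growing to at least 16 slots), with 'ctx' being the
 * ralloc parent of the backing storage:
 *
 *    struct foo {
 *       DECLARE_ARRAY(struct ir3_instruction *, instrs);
 *    };
 *
 *    // appends 'instr', reallocating foo->instrs as needed:
 *    array_insert(foo, foo->instrs, instr);
 */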

typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;

typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;

typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy. So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(15),

   /* Used by shared register allocation when creating spill/reload
    * instructions to inform validation that this is created by RA. This also
    * may be set on an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),
} ir3_instruction_flags;

struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat;
   uint8_t nop;
#ifdef DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as an ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;    /* for ldgb/stgb, # of components */
         unsigned d : 3; /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type. Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * its src/dst registers. Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions. Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *
    *                              shared  image  atomic  SSBO  everything
    *   barrier()/             -   R/W     R/W    R/W     R/W   X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic()  -                  R/W
    *   memoryBarrierBuffer()  -                          R/W
    *   memoryBarrierImage()   -          R/W
    *   memoryBarrierShared()  -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};

struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions. The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write). To avoid a
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the address register are scheduled before the instruction
    * that writes the address register. Having a convenient list
    * of instructions that reference some address register
    * simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};

struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write. Any array access depends on the most
    * recent write. This way, nothing depends on writes after the
    * last read. But all the writes that happen before that have
    * something depending on them.
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

enum ir3_branch_type {
   IR3_BRANCH_COND,    /* condition */
   IR3_BRANCH_ANY,     /* subgroupAny(condition) */
   IR3_BRANCH_ALL,     /* subgroupAll(condition) */
   IR3_BRANCH_GETONE,  /* subgroupElect() */
   IR3_BRANCH_GETLAST, /* getlast.w8 */
   IR3_BRANCH_SHPS,    /* preamble start */
};

struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow. A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data. Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno;
#endif
};

static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   return (uint32_t)(unsigned long)block;
#endif
}

static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}

static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}

static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to skip
    * that to get to the block after the preamble.
    */
   if (block->brtype == IR3_BRANCH_SHPS)
      return block->successors[1]->successors[0];
   else
      return block;
}

void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

void ir3_calc_dominance(struct ir3 *ir);
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);

struct ir3_shader_variant;

struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

void ir3_collect_info(struct ir3_shader_variant *v);
void *ir3_alloc(struct ir3 *shader, int sz);

unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);

struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);

static inline void
ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
{
   assert(!dst->tied && !src->tied);
   dst->tied = src;
   src->tied = dst;
}

void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);

static inline bool
ir3_instr_check_mark(struct ir3_instruction *instr)
{
   if (instr->flags & IR3_INSTR_MARK)
      return true; /* already visited */
   instr->flags |= IR3_INSTR_MARK;
   return false;
}

void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);
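
/* A sketch of the typical mark-based traversal pattern (hypothetical pass
 * code): clear all marks up front, then use ir3_instr_check_mark() to visit
 * each instruction at most once:
 *
 *    ir3_clear_mark(ir);
 *    foreach_block (block, &ir->block_list) {
 *       foreach_instr (instr, &block->instr_list) {
 *          if (ir3_instr_check_mark(instr))
 *             continue;   // already visited
 *          // ... process instr ...
 *       }
 *    }
 */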

unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);

/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}

/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}

/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}

void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);

void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

int ir3_flut(struct ir3_register *src_reg);

bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);

#include "util/set.h"
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
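
/* A minimal usage sketch (hypothetical pass code): instr->uses is only
 * populated after ir3_find_ssa_uses() has run with the same mem_ctx:
 *
 *    ir3_find_ssa_uses(ir, mem_ctx, false);
 *    foreach_ssa_use (use, instr) {
 *       // 'use' is an ir3_instruction consuming one of instr's defs
 *    }
 */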

static inline uint32_t
reg_num(const struct ir3_register *reg)
{
   return reg->num >> 2;
}

static inline uint32_t
reg_comp(const struct ir3_register *reg)
{
   return reg->num & 0x3;
}

static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}

static inline bool
is_kill_or_demote(struct ir3_instruction *instr)
{
   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
}

static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}

static inline bool
is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
{
   unsigned dst_type = (dst->flags & IR3_REG_HALF);
   unsigned src_type = (src->flags & IR3_REG_HALF);

   /* Treat shared->normal copies as same-type, because they can generally be
    * folded, but not normal->shared copies.
    */
   if (dst_type != src_type ||
       ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
      return false;
   else
      return true;
}

/* Is it a non-transformative (ie. not type changing) mov? This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the types of the dest reg and src reg are different,
       * it shouldn't be considered a same-type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the types of the dest reg and src reg are different,
       * it shouldn't be considered a same-type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_META_PHI:
      return instr->srcs_count == 1;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->num == regid(REG_P0, 0))
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}

/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}

static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      return true;
   default:
      return false;
   }
}

static inline bool
is_alu(struct ir3_instruction *instr)
{
   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}

static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}

static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
}

static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}

static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}

static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}

static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7);
}

static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}

static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}

static inline bool
is_store(struct ir3_instruction *instr)
{
   /* For these instructions, the "destination" register is
    * actually a source: the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}

static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}

static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}

/* Whether non-helper invocations can read the value of helper invocations. We
 * cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAM:
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. */
   case OPC_MOV:
      return (instr->dsts[0]->flags & IR3_REG_SHARED) &&
             !(instr->srcs[0]->flags & IR3_REG_SHARED);

   default:
      return false;
   }
}

static inline bool
is_bool(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPS_S:
   case OPC_CMPS_U:
      return true;
   default:
      return false;
   }
}

static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}

static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}

static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}

static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}

static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == OPC_META);
}

static inline unsigned
reg_elems(const struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return reg->size;
   else
      return util_last_bit(reg->wrmask);
}

static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}

static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
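
/* Worked example: reg_size() is in units of half-regs (16-bit words), so a
 * full-precision vec4 (wrmask 0xf, no IR3_REG_HALF) has reg_elems() == 4 and
 * reg_size() == 8, while a half-precision vec4 has reg_size() == 4.
 */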

static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}

/* is dst a normal temp register: */
static inline bool
is_dest_gpr(struct ir3_register *dst)
{
   if (dst->wrmask == 0)
      return false;
   if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
      return false;
   return true;
}

static inline bool
writes_gpr(struct ir3_instruction *instr)
{
   if (dest_regs(instr) == 0)
      return false;
   return is_dest_gpr(instr->dsts[0]);
}

static inline bool
writes_addr0(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 0);
   }
   return false;
}

static inline bool
writes_addr1(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a1.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 1);
   }
   return false;
}

static inline bool
writes_pred(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to p0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return reg_num(dst) == REG_P0;
   }
   return false;
}

/* Is it something other than a normal register. Shared regs, p0, and a0/a1
 * are considered special here. Special registers are always accessed with one
 * size and never alias normal registers, even though a naive calculation
 * would sometimes make it seem like e.g. r30.z aliases a0.x.
 */
static inline bool
is_reg_special(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
          (reg_num(reg) == REG_P0);
}

/* Same as above but in cases where we don't have a register. r48.x and above
 * are shared/special.
 */
static inline bool
is_reg_num_special(unsigned num)
{
   return num >= 48 * 4;
}

/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *
ssa(struct ir3_register *reg)
{
   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
      return reg->def->instr;
   return NULL;
}

static inline bool
conflicts(struct ir3_register *a, struct ir3_register *b)
{
   return (a && b) && (a->def != b->def);
}

static inline bool
reg_gpr(struct ir3_register *r)
{
   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
      return false;
   return true;
}

static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
   case TYPE_S8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S8:
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}

/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}

/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   default:
      return 0;
   }
}

/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
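
/* For example (a sketch, not tied to any particular pass): an add.f whose
 * sources are full f32 but whose dst is half implicitly folds in an f32->f16
 * conversion, so ir3_output_conv_type() returns TYPE_F32 with can_fold set,
 * and the src/dst types of that folded conversion can then be recovered with
 * ir3_output_conv_src_type()/ir3_output_conv_dst_type() below.
 */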

/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}

static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
{
   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                 : full_type(base_type);
}

/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}

#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instruction's sources (reg), also returns src #: */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0;          \
           __srcreg; __srcreg = NULL)                                          \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instruction's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0;          \
           __dstreg; __dstreg = NULL)                                          \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instruction's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
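
/* A minimal usage sketch (hypothetical pass code); the _n variants also bind
 * the source/destination index:
 *
 *    foreach_src_n (src, n, instr) {
 *       // src == instr->srcs[n]
 *    }
 *    foreach_dst (dst, instr) {
 *       // visits each entry of instr->dsts
 *    }
 */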

static inline unsigned
__ssa_src_cnt(struct ir3_instruction *instr)
{
   return instr->srcs_count + instr->deps_count;
}

static inline bool
__is_false_dep(struct ir3_instruction *instr, unsigned n)
{
   if (n >= instr->srcs_count)
      return true;
   return false;
}

static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}

#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)
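
/* Note that, unlike foreach_src, the SSA iterators also walk the false
 * dependencies in instr->deps (indices >= srcs_count; see __is_false_dep()).
 * A sketch of distinguishing the two (hypothetical pass code):
 *
 *    foreach_ssa_src_n (srcinst, n, instr) {
 *       if (__is_false_dep(instr, n))
 *          continue;   // scheduling-only dependency, not a real src
 *       // ... srcinst defines a real source of instr ...
 *    }
 */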

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)

#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
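
/* A minimal usage sketch: wrap each pass invocation so the IR is dumped and
 * validated whenever the pass reports progress, e.g.:
 *
 *    progress |= IR3_PASS(ir, ir3_cf);
 *    progress |= IR3_PASS(ir, ir3_cp, so);
 */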

/* validate: */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream,
                            struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);
unsigned ir3_delay_calc(struct ir3_block *block,
                        struct ir3_instruction *instr, bool mergedregs);

/* estimated (ss)/(sy) delay calculation */

static inline bool
is_local_mem_load(struct ir3_instruction *instr)
{
   return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
          instr->opc == OPC_LDLW;
}

/* Does this instruction need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }
   return is_sfu(instr) || is_local_mem_load(instr);
}

/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, the number of delay slots it takes to get an SFU result back
    * (ie. using nop's instead of (ss)) is:
    *
    *   8 - single warp
    *   9 - two warps
    *   10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share
    * an SFU unit). But 10 seems like a reasonable # to choose:
    */
   if (is_sfu(instr) || is_local_mem_load(instr))
      return 10;

   /* The blob adds 6 nops between shared producers and consumers, and before
    * we used (ss) this was sufficient in most cases.
    */
   return 6;
}

static inline bool
is_sy_producer(struct ir3_instruction *instr)
{
   return is_tex_or_prefetch(instr) ||
          (is_load(instr) && !is_local_mem_load(instr)) ||
          is_atomic(instr->opc);
}

static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess; we can do better post-RA. */
   bool double_wavesize = shader->type == MESA_SHADER_FRAGMENT ||
                          shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots needed to
    * get cat5/cat6 results back using nops instead of (sy). Note that they
    * were measured with the result already warm in the cache, by loading it
    * earlier in the same shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles; the only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
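/* Worked example (numbers taken from the formulas above, not re-measured):
 * a 4-component ldc at double wavesize is estimated at
 * (21 + 8 * 4) / 2 = 26 cycles, while the same load at single wavesize is
 * 18 + 4 * 4 = 34 cycles.
 */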

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);

static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
   /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
    * know the nature of the fragment shader. Just assume it will have
    * latency to hide:
    */
   if (ir->type != MESA_SHADER_FRAGMENT)
      return true;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_tex_or_prefetch(instr))
            return true;

         if (is_load(instr)) {
            switch (instr->opc) {
            case OPC_LDLV:
            case OPC_LDL:
            case OPC_LDLW:
               break;
            default:
               return true;
            }
         }
      }
   }

   return false;
}

/* ************************************************************************* */
/* instruction helpers */

/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   if (src->dsts[0]->flags & IR3_REG_HALF)
      flags |= IR3_REG_HALF;
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}

static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}

static ir3_register_flags
type_flags(type_t type)
{
   if (type_size(type) < 32)
      return IR3_REG_HALF;
   return (ir3_register_flags)0;
}

static inline struct ir3_instruction *
create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}

static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
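/* Illustrative sketch: an immediate is just a mov from an immed src, so e.g.
 * materializing the bit pattern of 1.0f (0x3f800000) or a small integer
 * looks like:
 *
 *    struct ir3_instruction *one_f = create_immed(block, 0x3f800000);
 *    struct ir3_instruction *four  = create_immed_typed(block, 4, TYPE_U32);
 */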

static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}

static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block *block, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}

static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   ir3_register_flags flags = type_flags(type);

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
   }
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}

static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   ir3_register_flags dst_flags = type_flags(dst_type);
   ASSERTED ir3_register_flags src_flags = type_flags(src_type);

   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
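/* Illustrative sketch: ir3_MOV copies a value at a single type, while
 * ir3_COV is the type-converting variant (what the cov stat counts), e.g.
 * narrowing a full-precision float to fp16:
 *
 *    struct ir3_instruction *h = ir3_COV(block, val, TYPE_F32, TYPE_F16);
 */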

static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block *block, unsigned components)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   instr->repeat = components - 1;
   return instr;
}

static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}

static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}

/* clang-format off */
#define __INSTR0(flag, name, opc) \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \
{ \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name) __INSTR0((ir3_instruction_flags)0, name, OPC_##name)

/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags) \
{ \
   struct ir3_instruction *instr = \
      ir3_instr_create(block, opc, dst_count, 1); \
   for (unsigned i = 0; i < dst_count; i++) \
      __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
   struct ir3_instruction *b, unsigned bflags) \
{ \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++) \
      __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   __ssa_src(instr, b, bflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR2NODST(name) __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name)
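/* Illustrative sketch: INSTR2(ADD_F) (further below) expands to a builder
 * with the signature
 *
 *    struct ir3_instruction *
 *    ir3_ADD_F(struct ir3_block *block, struct ir3_instruction *a,
 *              unsigned aflags, struct ir3_instruction *b, unsigned bflags);
 *
 * so a float add of two SSA values is simply:
 *
 *    struct ir3_instruction *sum = ir3_ADD_F(block, a, 0, b, 0);
 */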

/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
   unsigned cflags) \
{ \
   struct ir3_instruction *instr = \
      ir3_instr_create(block, opc, dst_count, 3); \
   for (unsigned i = 0; i < dst_count; i++) \
      __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   __ssa_src(instr, b, bflags); \
   __ssa_src(instr, c, cflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags) \
{ \
   struct ir3_instruction *instr = \
      ir3_instr_create(block, opc, dst_count, 4); \
   for (unsigned i = 0; i < dst_count; i++) \
      __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   __ssa_src(instr, b, bflags); \
   __ssa_src(instr, c, cflags); \
   __ssa_src(instr, d, dflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name) __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR5(flag, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
   struct ir3_instruction *e, unsigned eflags) \
{ \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5); \
   __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   __ssa_src(instr, b, bflags); \
   __ssa_src(instr, c, cflags); \
   __ssa_src(instr, d, dflags); \
   __ssa_src(instr, e, eflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name) __INSTR5((ir3_instruction_flags)0, name, OPC_##name)

/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc) \
static inline struct ir3_instruction *ir3_##name( \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f, \
   unsigned fflags) \
{ \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 6); \
   for (unsigned i = 0; i < dst_count; i++) \
      __ssa_dst(instr); \
   __ssa_src(instr, a, aflags); \
   __ssa_src(instr, b, bflags); \
   __ssa_src(instr, c, cflags); \
   __ssa_src(instr, d, dflags); \
   __ssa_src(instr, e, eflags); \
   __ssa_src(instr, f, fflags); \
   instr->flags |= flag; \
   return instr; \
}
/* clang-format on */
#define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name) __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)

/* cat0 instructions: */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)

static inline struct ir3_instruction *
ir3_ELECT_MACRO(struct ir3_block *block)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
   __ssa_dst(instr);
   return instr;
}

static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block *block)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
   __ssa_dst(instr);
   return instr;
}

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)

static inline struct ir3_instruction *
ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
        ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
        struct ir3_instruction *src0, struct ir3_instruction *src1)
{
   struct ir3_instruction *sam;
   unsigned nreg = 0;

   if (flags & IR3_INSTR_S2EN) {
      nreg++;
   }
   if (src0) {
      nreg++;
   }
   if (src1) {
      nreg++;
   }

   sam = ir3_instr_create(block, opc, 1, nreg);
   sam->flags |= flags;
   __ssa_dst(sam)->wrmask = wrmask;
   if (flags & IR3_INSTR_S2EN) {
      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
   }
   if (src0) {
      __ssa_src(sam, src0, 0);
   }
   if (src1) {
      __ssa_src(sam, src1, 0);
   }
   sam->cat5.type = type;

   return sam;
}
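/* Illustrative sketch ("coords" is a hypothetical SSA value built
 * elsewhere): a non-S2EN, 4-component float texture sample, with the
 * sampler/texture index coming from the instruction encoding rather than a
 * register, would look like:
 *
 *    struct ir3_instruction *tex =
 *       ir3_SAM(block, OPC_SAM, TYPE_F32, 0xf, (ir3_instruction_flags)0,
 *               NULL, coords, NULL);
 */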

/* brcst.active rx, ry behaves like a conditional move: rx either keeps its
 * value or is set to ry. In order to model this in SSA form, we add an extra
 * argument (the initial value of rx) and tie it to the destination.
 */
static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
                 struct ir3_instruction *src,
                 struct ir3_instruction *dst_default)
{
   struct ir3_instruction *brcst =
      ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
   brcst->cat5.cluster_size = cluster_size;
   brcst->cat5.type = TYPE_U32;
   struct ir3_register *brcst_dst = __ssa_dst(brcst);
   __ssa_src(brcst, src, 0);
   struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
   ir3_reg_tie(brcst_dst, default_src);
   return brcst;
}

/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
#ifndef GPU
#elif GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
INSTR4NODST(LDG_K)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)

/* ************************************************************************* */
#include "util/bitset.h"

#define MAX_REG 256

typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);

typedef struct {
   bool mergedregs;
   regmaskstate_t mask;
} regmask_t;

static inline bool
__regmask_get(regmask_t *regmask, bool half, unsigned n)
{
   if (regmask->mergedregs) {
      /* a6xx+ case, with merged register file, we track things in terms
       * of half-precision registers, with a full-precision register
       * using two half-precision slots.
       *
       * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
       * avoid having them alias normal full regs.
       */
      if (half && !is_reg_num_special(n)) {
         return BITSET_TEST(regmask->mask, n);
      } else {
         n *= 2;
         return BITSET_TEST(regmask->mask, n) ||
                BITSET_TEST(regmask->mask, n + 1);
      }
   } else {
      /* pre a6xx case, with separate register file for half and full
       * precision:
       */
      if (half)
         n += MAX_REG;
      return BITSET_TEST(regmask->mask, n);
   }
}
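/* Worked example for the merged-regfile layout above: full register n
 * occupies half-slots 2n and 2n+1, so full r0.x (n=0) aliases hr0.x and
 * hr0.y (half slots 0 and 1), and an outstanding write to hr0.y is reported
 * as a conflict with a read of r0.x.
 */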

static inline void
__regmask_set(regmask_t *regmask, bool half, unsigned n)
{
   if (regmask->mergedregs) {
      /* a6xx+ case, with merged register file, we track things in terms
       * of half-precision registers, with a full-precision register
       * using two half-precision slots:
       */
      if (half && !is_reg_num_special(n)) {
         BITSET_SET(regmask->mask, n);
      } else {
         n *= 2;
         BITSET_SET(regmask->mask, n);
         BITSET_SET(regmask->mask, n + 1);
      }
   } else {
      /* pre a6xx case, with separate register file for half and full
       * precision:
       */
      if (half)
         n += MAX_REG;
      BITSET_SET(regmask->mask, n);
   }
}

static inline void
__regmask_clear(regmask_t *regmask, bool half, unsigned n)
{
   if (regmask->mergedregs) {
      /* a6xx+ case, with merged register file, we track things in terms
       * of half-precision registers, with a full-precision register
       * using two half-precision slots:
       */
      if (half && !is_reg_num_special(n)) {
         BITSET_CLEAR(regmask->mask, n);
      } else {
         n *= 2;
         BITSET_CLEAR(regmask->mask, n);
         BITSET_CLEAR(regmask->mask, n + 1);
      }
   } else {
      /* pre a6xx case, with separate register file for half and full
       * precision:
       */
      if (half)
         n += MAX_REG;
      BITSET_CLEAR(regmask->mask, n);
   }
}

static inline void
regmask_init(regmask_t *regmask, bool mergedregs)
{
   memset(&regmask->mask, 0, sizeof(regmask->mask));
   regmask->mergedregs = mergedregs;
}

static inline void
regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   assert(dst->mergedregs == a->mergedregs);
   assert(dst->mergedregs == b->mergedregs);

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | b->mask[i];
}

static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   regmaskstate_t shared_mask;
   BITSET_ZERO(shared_mask);

   if (b->mergedregs) {
      BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
   } else {
      BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
}

static inline void
regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
   bool half = reg->flags & IR3_REG_HALF;
   if (reg->flags & IR3_REG_RELATIV) {
      for (unsigned i = 0; i < reg->size; i++)
         __regmask_set(regmask, half, reg->array.base + i);
   } else {
      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
         if (mask & 1)
            __regmask_set(regmask, half, n);
   }
}

static inline void
regmask_clear(regmask_t *regmask, struct ir3_register *reg)
{
   bool half = reg->flags & IR3_REG_HALF;
   if (reg->flags & IR3_REG_RELATIV) {
      for (unsigned i = 0; i < reg->size; i++)
         __regmask_clear(regmask, half, reg->array.base + i);
   } else {
      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
         if (mask & 1)
            __regmask_clear(regmask, half, n);
   }
}

static inline bool
regmask_get(regmask_t *regmask, struct ir3_register *reg)
{
   bool half = reg->flags & IR3_REG_HALF;
   if (reg->flags & IR3_REG_RELATIV) {
      for (unsigned i = 0; i < reg->size; i++)
         if (__regmask_get(regmask, half, reg->array.base + i))
            return true;
   } else {
      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
         if (mask & 1)
            if (__regmask_get(regmask, half, n))
               return true;
   }
   return false;
}
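/* Illustrative sketch of the intended usage pattern (a simplified version of
 * a legalize-style scan; "needs_ss" and the reset-on-sync behavior are
 * assumptions of this example, not part of the API): track outstanding
 * producer writes and test each read against them:
 *
 *    regmask_t needs_ss;
 *    regmask_init(&needs_ss, mergedregs);
 *
 *    foreach_instr (instr, &block->instr_list) {
 *       foreach_src (reg, instr) {
 *          if (regmask_get(&needs_ss, reg)) {
 *             instr->flags |= IR3_INSTR_SS;
 *             regmask_init(&needs_ss, mergedregs);
 *          }
 *       }
 *       if (is_ss_producer(instr)) {
 *          foreach_dst (reg, instr)
 *             regmask_set(&needs_ss, reg);
 *       }
 *    }
 */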
/* ************************************************************************* */

#endif /* IR3_H_ */