1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef IR3_H_
25 #define IR3_H_
26
27 #include <stdbool.h>
28 #include <stdint.h>
29
30 #include "compiler/shader_enums.h"
31
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36
37 #include "instr-a3xx.h"
38
39 /* low level intermediate representation of an adreno shader program */
40
41 struct ir3_compiler;
42 struct ir3;
43 struct ir3_instruction;
44 struct ir3_block;
45
/* Statistics and metadata about a compiled shader binary (see
 * ir3_collect_info()).
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   /* Whether the shader runs at double the normal thread size: */
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
89
/* A group of registers that register allocation tries to place together
 * as one contiguous, aligned region (NOTE(review): inferred from field
 * names and ir3_register::merge_set usage below — confirm against the
 * RA pass).
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* The member registers of the set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
101
/* A single source or destination operand of an ir3_instruction. */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.  In
       * addition, .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       *
       * Note: Has different meaning on other instructions like add.s/u
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction.  This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,

      /* "Early-clobber" on a destination means that the destination is
       * (potentially) written before any sources are read and therefore
       * interferes with the sources of the instruction.
       */
      IR3_REG_EARLY_CLOBBER = 0x80000,
   } flags;

   /* Virtual register name used before/during register allocation
    * (NOTE(review): inferred — confirm against RA pass).
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register.  Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* Spilling/liveness bookkeeping (NOTE(review): presumably maintained by
    * the spiller — confirm):
    */
   unsigned spill_slot, next_use;

   /* Membership in a merge set (see struct ir3_merge_set): */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
225
/*
 * Stupid/simple growable array implementation:
 */

/* Declares storage for a growable array of 'type' named 'name':
 * 'name' (the element pointer), 'name_count' (elements in use) and
 * 'name_sz' (allocated capacity).  Append with array_insert().
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;
232
/* Append __VA_ARGS__ to a DECLARE_ARRAY()-declared array 'arr', growing
 * the backing storage (doubling, min 16 entries) when full.  'ctx' is the
 * ralloc context owning the allocation.
 *
 * NOTE: 'arr' is token-pasted and expanded several times, so it must be a
 * plain identifier/lvalue with no side effects.
 * NOTE(review): the reralloc_size() result is assigned unchecked —
 * presumably allocation failure is fatal elsewhere; confirm.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
241
/* Reduction operation carried by cat1 instructions (see
 * ir3_instruction::cat1.reduce_op).  Suffix convention:
 * _U = unsigned int, _S = signed int, _F = float, _B = bitwise.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
257
/* A single ir3 instruction.  Category-specific fields live in the big
 * anonymous union below; which member is valid depends on opc's category.
 */
struct ir3_instruction {
   struct ir3_block *block; /* containing basic block */
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
       */
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy.  So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_SAT = 0x400,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x800,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x1000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x02000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x04000,
      IR3_INSTR_UNUSED = 0x08000,
   } flags;
   /* (rptN) repeat count — see the (rptN) syntax in the disassembly
    * examples above.
    */
   uint8_t repeat;
   /* (nopN) — trailing nops folded into the instruction encoding
    * (NOTE(review): inferred from ir3_info::nops_count "including nopN";
    * confirm).
    */
   uint8_t nop;
#ifdef DEBUG
   /* Allocated capacity of srcs/dsts, for bounds assertions: */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Category-specific payload — valid member determined by opc: */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;    /* for ldgb/stgb, # of components */
         unsigned d : 3; /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                          shared  image  atomic  SSBO  everything
    *   barrier()/               R/W    R/W    R/W     R/W     X
    *     groupMemoryBarrier()
    *   memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic()                  R/W
    *   memoryBarrierBuffer()                          R/W
    *   memoryBarrierImage()            R/W
    *   memoryBarrierShared()    R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
     barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
499
/* A complete shader program in ir3 form: the block list (CFG) plus
 * side-tables of instructions various passes need fast access to.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
553
/* A register array (indirectly-addressed register range), corresponding
 * to a nir_register.
 */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;
   unsigned id;

   struct nir_register *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
579
580 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
581
/* How a block with two successors decides which successor to take
 * (see ir3_block::brtype/condition).
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
   IR3_BRANCH_SHPS,   /* preamble start */
};
589
/* A basic block in the shader's control-flow graph. */
struct ir3_block {
   struct list_head node; /* entry in ir3::block_list */
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG.  In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes.  While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers.  "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];
   struct ir3_block *physical_successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);

   uint16_t start_ip, end_ip;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Dominator-tree info (see ir3_calc_dominance()): */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno;
#endif
};
649
/* Identifier for a block, for debug output: the stable serial number in
 * DEBUG builds, otherwise (the low 32 bits of) the block's address.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   /* Convert via uintptr_t rather than unsigned long: on LLP64 targets
    * (64-bit Windows) unsigned long is only 32 bits wide, so the
    * pointer->unsigned-long conversion itself is lossy and
    * implementation-defined.  uintptr_t holds the full pointer value
    * and the truncation to uint32_t is then well-defined.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
659
/* Returns the first block in the shader, ie. the CFG entry block. */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
665
666 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)667 ir3_after_preamble(struct ir3 *ir)
668 {
669 struct ir3_block *block = ir3_start_block(ir);
670 /* The preamble will have a usually-empty else branch, and we want to skip
671 * that to get to the block after the preamble.
672 */
673 if (block->brtype == IR3_BRANCH_SHPS)
674 return block->successors[1]->successors[0];
675 else
676 return block;
677 }
678
679 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
680 void ir3_block_add_physical_predecessor(struct ir3_block *block,
681 struct ir3_block *pred);
682 void ir3_block_remove_predecessor(struct ir3_block *block,
683 struct ir3_block *pred);
684 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
685 struct ir3_block *pred);
686 unsigned ir3_block_get_pred_index(struct ir3_block *block,
687 struct ir3_block *pred);
688
689 void ir3_calc_dominance(struct ir3 *ir);
690 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
691
692 struct ir3_shader_variant;
693
694 struct ir3 *ir3_create(struct ir3_compiler *compiler,
695 struct ir3_shader_variant *v);
696 void ir3_destroy(struct ir3 *shader);
697
698 void ir3_collect_info(struct ir3_shader_variant *v);
699 void *ir3_alloc(struct ir3 *shader, int sz);
700
701 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
702 unsigned reg_count,
703 bool double_threadsize);
704
705 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
706 bool double_threadsize);
707
708 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
709 unsigned regs_count);
710
711 struct ir3_block *ir3_block_create(struct ir3 *shader);
712
713 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
714 int ndst, int nsrc);
715 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
716 void ir3_instr_add_dep(struct ir3_instruction *instr,
717 struct ir3_instruction *dep);
718 const char *ir3_instr_name(struct ir3_instruction *instr);
719
720 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
721 int flags);
722 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
723 int flags);
724 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
725 struct ir3_register *reg);
726
727 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)728 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
729 {
730 assert(!dst->tied && !src->tied);
731 dst->tied = src;
732 src->tied = dst;
733 }
734
735 void ir3_reg_set_last_array(struct ir3_instruction *instr,
736 struct ir3_register *reg,
737 struct ir3_register *last_write);
738
739 void ir3_instr_set_address(struct ir3_instruction *instr,
740 struct ir3_instruction *addr);
741
742 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)743 ir3_instr_check_mark(struct ir3_instruction *instr)
744 {
745 if (instr->flags & IR3_INSTR_MARK)
746 return true; /* already visited */
747 instr->flags |= IR3_INSTR_MARK;
748 return false;
749 }
750
751 void ir3_block_clear_mark(struct ir3_block *block);
752 void ir3_clear_mark(struct ir3 *shader);
753
754 unsigned ir3_count_instructions(struct ir3 *ir);
755 unsigned ir3_count_instructions_ra(struct ir3 *ir);
756
/**
 * Move 'instr' to just before 'after' (unlinking it from its current
 * position first).
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   /* adding to the tail of 'after's node places us immediately before it */
   list_addtail(&instr->node, &after->node);
}
767
/**
 * Move 'instr' to just after 'before' (unlinking it from its current
 * position first):
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   /* adding to the head of 'before's node places us immediately after it */
   list_add(&instr->node, &before->node);
}
778
/**
 * Move 'instr' to the beginning of the block (unlinking it from its
 * current position first):
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
789
790 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
791
792 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
793 void ir3_fixup_src_type(struct ir3_instruction *instr);
794
795 int ir3_flut(struct ir3_register *src_reg);
796
797 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
798
799 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
800
801 #include "util/set.h"
/* Iterate the instructions using __instr's SSA value.  Only valid after a
 * pass has populated instr->uses via ir3_find_ssa_uses().  The outer
 * single-iteration for-loop scopes __use and skips the body entirely when
 * 'uses' is NULL; the sentinel (void *)~0 just makes the loop condition
 * initially true.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
807
808 static inline uint32_t
reg_num(const struct ir3_register * reg)809 reg_num(const struct ir3_register *reg)
810 {
811 return reg->num >> 2;
812 }
813
814 static inline uint32_t
reg_comp(const struct ir3_register * reg)815 reg_comp(const struct ir3_register *reg)
816 {
817 return reg->num & 0x3;
818 }
819
820 static inline bool
is_flow(struct ir3_instruction * instr)821 is_flow(struct ir3_instruction *instr)
822 {
823 return (opc_cat(instr->opc) == 0);
824 }
825
826 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)827 is_kill_or_demote(struct ir3_instruction *instr)
828 {
829 return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
830 }
831
832 static inline bool
is_nop(struct ir3_instruction * instr)833 is_nop(struct ir3_instruction *instr)
834 {
835 return instr->opc == OPC_NOP;
836 }
837
838 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)839 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
840 {
841 unsigned dst_type = (dst->flags & IR3_REG_HALF);
842 unsigned src_type = (src->flags & IR3_REG_HALF);
843
844 /* Treat shared->normal copies as same-type, because they can generally be
845 * folded, but not normal->shared copies.
846 */
847 if (dst_type != src_type ||
848 ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
849 return false;
850 else
851 return true;
852 }
853
854 /* Is it a non-transformative (ie. not type changing) mov? This can
855 * also include absneg.s/absneg.f, which for the most part can be
856 * treated as a mov (single src argument).
857 */
858 static inline bool
is_same_type_mov(struct ir3_instruction * instr)859 is_same_type_mov(struct ir3_instruction *instr)
860 {
861 struct ir3_register *dst;
862
863 switch (instr->opc) {
864 case OPC_MOV:
865 if (instr->cat1.src_type != instr->cat1.dst_type)
866 return false;
867 /* If the type of dest reg and src reg are different,
868 * it shouldn't be considered as same type mov
869 */
870 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
871 return false;
872 break;
873 case OPC_ABSNEG_F:
874 case OPC_ABSNEG_S:
875 if (instr->flags & IR3_INSTR_SAT)
876 return false;
877 /* If the type of dest reg and src reg are different,
878 * it shouldn't be considered as same type mov
879 */
880 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
881 return false;
882 break;
883 case OPC_META_PHI:
884 return instr->srcs_count == 1;
885 default:
886 return false;
887 }
888
889 dst = instr->dsts[0];
890
891 /* mov's that write to a0 or p0.x are special: */
892 if (dst->num == regid(REG_P0, 0))
893 return false;
894 if (reg_num(dst) == REG_A0)
895 return false;
896
897 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
898 return false;
899
900 return true;
901 }
902
903 /* A move from const, which changes size but not type, can also be
904 * folded into dest instruction in some cases.
905 */
906 static inline bool
is_const_mov(struct ir3_instruction * instr)907 is_const_mov(struct ir3_instruction *instr)
908 {
909 if (instr->opc != OPC_MOV)
910 return false;
911
912 if (!(instr->srcs[0]->flags & IR3_REG_CONST))
913 return false;
914
915 type_t src_type = instr->cat1.src_type;
916 type_t dst_type = instr->cat1.dst_type;
917
918 return (type_float(src_type) && type_float(dst_type)) ||
919 (type_uint(src_type) && type_uint(dst_type)) ||
920 (type_sint(src_type) && type_sint(dst_type));
921 }
922
/* True for the macro pseudo-opcodes that get lowered to subgroup
 * conditional-move sequences later (NOTE(review): lowered by a later
 * pass not visible in this header — confirm which one).
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      return true;
   default:
      return false;
   }
}
940
941 static inline bool
is_alu(struct ir3_instruction * instr)942 is_alu(struct ir3_instruction *instr)
943 {
944 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
945 }
946
947 static inline bool
is_sfu(struct ir3_instruction * instr)948 is_sfu(struct ir3_instruction *instr)
949 {
950 return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
951 }
952
953 static inline bool
is_tex(struct ir3_instruction * instr)954 is_tex(struct ir3_instruction *instr)
955 {
956 return (opc_cat(instr->opc) == 5);
957 }
958
959 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)960 is_tex_or_prefetch(struct ir3_instruction *instr)
961 {
962 return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
963 }
964
965 static inline bool
is_mem(struct ir3_instruction * instr)966 is_mem(struct ir3_instruction *instr)
967 {
968 return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
969 }
970
971 static inline bool
is_barrier(struct ir3_instruction * instr)972 is_barrier(struct ir3_instruction *instr)
973 {
974 return (opc_cat(instr->opc) == 7);
975 }
976
977 static inline bool
is_half(struct ir3_instruction * instr)978 is_half(struct ir3_instruction *instr)
979 {
980 return !!(instr->dsts[0]->flags & IR3_REG_HALF);
981 }
982
983 static inline bool
is_shared(struct ir3_instruction * instr)984 is_shared(struct ir3_instruction *instr)
985 {
986 return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
987 }
988
/* True for store-style memory instructions. */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1010
/* True for load-style memory instructions.  Note l2g appears in both
 * is_load() and is_store() — it reads local and writes global memory.
 */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}
1031
/* True for instructions that fetch varying input. */
static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}
1048
1049 static inline bool
is_bool(struct ir3_instruction * instr)1050 is_bool(struct ir3_instruction *instr)
1051 {
1052 switch (instr->opc) {
1053 case OPC_CMPS_F:
1054 case OPC_CMPS_S:
1055 case OPC_CMPS_U:
1056 return true;
1057 default:
1058 return false;
1059 }
1060 }
1061
/* Map a full-precision cat3 opcode to its half-precision equivalent
 * (identity for opcodes with no half variant):
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}

/* Inverse of cat3_half_opc(): */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1099
1100 static inline opc_t
cat4_half_opc(opc_t opc)1101 cat4_half_opc(opc_t opc)
1102 {
1103 switch (opc) {
1104 case OPC_RSQ:
1105 return OPC_HRSQ;
1106 case OPC_LOG2:
1107 return OPC_HLOG2;
1108 case OPC_EXP2:
1109 return OPC_HEXP2;
1110 default:
1111 return opc;
1112 }
1113 }
1114
1115 static inline opc_t
cat4_full_opc(opc_t opc)1116 cat4_full_opc(opc_t opc)
1117 {
1118 switch (opc) {
1119 case OPC_HRSQ:
1120 return OPC_RSQ;
1121 case OPC_HLOG2:
1122 return OPC_LOG2;
1123 case OPC_HEXP2:
1124 return OPC_EXP2;
1125 default:
1126 return opc;
1127 }
1128 }
1129
/* Meta instructions are IR-internal pseudo instructions, encoded with the
 * pseudo category -1:
 */
static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == -1);
}

/* Number of scalar elements covered by a register: the array length for
 * array regs, otherwise the highest set bit of the writemask:
 */
static inline unsigned
reg_elems(const struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return reg->size;
   else
      return util_last_bit(reg->wrmask);
}

/* Size of a single element, in units of half-registers (a full register
 * counts as two halves):
 */
static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}

/* Total register footprint, in units of half-registers: */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
1156
/* Number of components written to the destination, or 0 if the
 * instruction has no dst:
 */
static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   /* only single-dst instructions expected here: */
   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}

/* is dst a normal temp register: */
static inline bool
is_dest_gpr(struct ir3_register *dst)
{
   if (dst->wrmask == 0)
      return false;
   /* a0.* and p0.x are special, not GPRs: */
   if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
      return false;
   return true;
}
1177
/* Does the instruction write a normal (general purpose) register? */
static inline bool
writes_gpr(struct ir3_instruction *instr)
{
   if (dest_regs(instr) == 0)
      return false;
   return is_dest_gpr(instr->dsts[0]);
}

/* Does the instruction write the a0.x address register? */
static inline bool
writes_addr0(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 0);
   }
   return false;
}

/* Does the instruction write the a1.x address register (component 1 of
 * the a0 register file)?
 */
static inline bool
writes_addr1(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a1.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 1);
   }
   return false;
}

/* Does the instruction write the p0 predicate register? */
static inline bool
writes_pred(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to p0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return reg_num(dst) == REG_P0;
   }
   return false;
}
1218
/* Is it something other than a normal register. Shared regs, p0, and a0/a1
 * are considered special here. Special registers are always accessed with one
 * size and never alias normal registers, even though a naive calculation
 * would sometimes make it seem like e.g. r30.z aliases a0.x.
 */
static inline bool
is_reg_special(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
          (reg_num(reg) == REG_P0);
}

/* Same as above but in cases where we don't have a register. r48.x and above
 * are shared/special.
 */
static inline bool
is_reg_num_special(unsigned num)
{
   /* num is in scalar (component) units, so r48.x == 48 * 4: */
   return num >= 48 * 4;
}
1239
/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *
ssa(struct ir3_register *reg)
{
   /* Only SSA/array srcs carry a def link; immed/const/etc yield NULL: */
   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
      return reg->def->instr;
   return NULL;
}

/* Two register refs conflict when both are present but name different
 * definitions:
 */
static inline bool
conflicts(struct ir3_register *a, struct ir3_register *b)
{
   return (a && b) && (a->def != b->def);
}

/* Is r a general-purpose register (not const/immed, not a0/p0)? */
static inline bool
reg_gpr(struct ir3_register *r)
{
   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
      return false;
   return true;
}
1265
/* Map a type to its 16b counterpart; 16b and 8b types map to themselves: */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
   case TYPE_S8:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1288
/* Map a type to its 32b counterpart; 8b/16b types widen, 32b types map to
 * themselves:
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S8:
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1310
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1355
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept float abs/neg src modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic ops accept no src modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept the bitwise-not src modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1416
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   /* the remaining cases intentionally fall through to default (no
    * modifiers supported):
    */
   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   default:
      return 0;
   }
}
1455
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1523
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* Otherwise the src size selects between half/full base_type: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1563
1564 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1565 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1566 {
1567 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1568 : full_type(base_type);
1569 }
1570
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * NOTE: *can_swap is only written on failure (set to false); callers are
 * expected to initialize it to true before calling.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
/* expands to a pair of cases mapping each opcode of the pair to the other: */
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1597
/* bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #:
 *
 * The outer single-iteration for-loop only serves to declare __srcreg in a
 * scope private to the loop; the non-NULL dummy initializer (~0) makes it
 * run exactly once.
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instructions's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1623
/* Total count of SSA-visitable sources: real srcs plus false (scheduling)
 * dependencies:
 */
static inline unsigned
__ssa_src_cnt(struct ir3_instruction *instr)
{
   return instr->srcs_count + instr->deps_count;
}

/* Indices at or beyond srcs_count index into the deps[] array: */
static inline bool
__is_false_dep(struct ir3_instruction *instr, unsigned n)
{
   if (n >= instr->srcs_count)
      return true;
   return false;
}

/* Pointer-to-pointer for the n'th SSA source instruction (spanning srcs
 * then deps), or NULL if src n is not an SSA reference:
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
1647
/* iterator over pointers to an instruction's SSA sources (incl. false
 * deps), also returns src #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1675
/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
1700
/* Run an ir3 pass; when it reports progress, debug-print the IR and
 * re-validate. Evaluates (via GCC statement expression) to the pass's
 * progress bool.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
1710
/* validate: */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);
unsigned ir3_delay_calc(struct ir3_block *block,
                        struct ir3_instruction *instr, bool mergedregs);
1729
1730 /* estimated (ss)/(sy) delay calculation */
1731
1732 static inline bool
is_local_mem_load(struct ir3_instruction * instr)1733 is_local_mem_load(struct ir3_instruction *instr)
1734 {
1735 return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
1736 instr->opc == OPC_LDLW;
1737 }
1738
/* Does this instruction need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   /* any write to a shared reg syncs via (ss): */
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }
   /* ...as do SFU results and local memory loads: */
   return is_sfu(instr) || is_local_mem_load(instr);
}
1749
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
    * using nop's instead of (ss) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share an
    * SFU unit). But 10 seems like a reasonable # to choose:
    */
   if (is_sfu(instr) || is_local_mem_load(instr))
      return 10;

   /* The blob adds 6 nops between shared producers and consumers, and before we
    * used (ss) this was sufficient in most cases.
    */
   return 6;
}
1772
/* Does this instruction need (sy) to wait for its result? Texture
 * fetches/prefetches, non-local loads, and atomics:
 */
static inline bool
is_sy_producer(struct ir3_instruction *instr)
{
   return is_tex_or_prefetch(instr) ||
          (is_load(instr) && !is_local_mem_load(instr)) ||
          is_atomic(instr->opc);
}
1780
/* The soft delay for approximating the cost of (sy), based on the producer
 * opcode and number of result components:
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
1834
1835
/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);
1871
/* Does the shader contain any long-latency ((sy)-synced) work whose cost
 * could be hidden by scheduling other instructions around it?
 */
static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
   /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
    * know the nature of the fragment shader. Just assume it will have
    * latency to hide:
    */
   if (ir->type != MESA_SHADER_FRAGMENT)
      return true;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_tex_or_prefetch(instr))
            return true;

         if (is_load(instr)) {
            switch (instr->opc) {
            /* local memory loads sync via (ss), not (sy) - don't count: */
            case OPC_LDLV:
            case OPC_LDL:
            case OPC_LDLW:
               break;
            default:
               return true;
            }
         }
      }
   }

   return false;
}
1902
1903 /* ************************************************************************* */
1904 /* instruction helpers */
1905
/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   /* inherit half-precision from the producer's dst: */
   if (src->dsts[0]->flags & IR3_REG_HALF)
      flags |= IR3_REG_HALF;
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}

/* creates a (pre-RA) SSA dst for instr: */
static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}
1927
/* Create a mov loading the immediate value 'val', with the given type: */
static inline struct ir3_instruction *
create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
{
   struct ir3_instruction *mov;
   /* sub-32b types get half-precision regs: */
   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}

/* Create a mov loading a u32 immediate: */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
1948
/* Create a mov reading const (uniform) register n, with the given type: */
static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   /* sub-32b types get half-precision regs: */
   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

/* Create a mov reading const register n as f32: */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
1969
/* Create a mov reading const register [a0.x + n] (relative/indirect
 * addressing via the given address instruction):
 */
static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block *block, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   /* link the a0.x producer as the address source: */
   ir3_instr_set_address(mov, address);

   return mov;
}
1986
/* Create a same-type mov of src's result: */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      /* array srcs need the array info copied over: */
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      /* propagate shared-reg-ness from the producer: */
      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
   }
   /* relative srcs not handled here: */
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
2005
/* Create a type-converting mov (cov) from src_type to dst_type: */
static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
   unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;

   /* src precision must match the declared src_type: */
   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   /* array srcs not handled here: */
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
2023
/* Create a movmsk writing 'components' shared-reg components: */
static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block *block, unsigned components)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   /* (rptN) covers the additional components: */
   instr->repeat = components - 1;
   return instr;
}
2035
/* Create a ballot macro instruction whose shared-reg dst covers
 * 'components' components:
 */
static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}
2051
/* Create a nop (no dsts, no srcs): */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2057
2058 /* clang-format off */
2059 #define __INSTR0(flag, name, opc) \
2060 static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \
2061 { \
2062 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \
2063 instr->flags |= flag; \
2064 return instr; \
2065 }
2066 /* clang-format on */
2067 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
2068 #define INSTR0(name) __INSTR0(0, name, OPC_##name)
2069
/* __INSTR1: one-source instruction builder.  `dst_count` is 0
 * (INSTR1NODST) or 1 (INSTR1/INSTR1F); `flag` is OR'd into instr->flags.
 */
/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name) __INSTR1(0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
2087
/* __INSTR2: two-source instruction builder; see __INSTR1 for the
 * dst_count/flag conventions.
 */
/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name) __INSTR2(0, 1, name, OPC_##name)
#define INSTR2NODST(name) __INSTR2(0, 0, name, OPC_##name)
2106
/* __INSTR3: three-source instruction builder; see __INSTR1 for the
 * dst_count/flag conventions.
 */
/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name) __INSTR3(0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
2128
/* __INSTR4: four-source instruction builder; see __INSTR1 for the
 * dst_count/flag conventions.
 */
/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name) __INSTR4(0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
2151
/* __INSTR5: five-source instruction builder.  Unlike __INSTR1..__INSTR4
 * there is no dst_count parameter: these always have exactly one dst.
 */
/* clang-format off */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name) __INSTR5(0, name, OPC_##name)
2173
/* __INSTR6: six-source instruction builder.  `dst_count` is 0
 * (INSTR6NODST) or 1; `flag` is OR'd into instr->flags.
 *
 * Fix: ir3_instr_create() was previously called with a hard-coded dst
 * count of 1, disagreeing with the `dst_count` loop below and reserving
 * an unused destination slot for the NODST variants.  Pass `dst_count`
 * through instead, matching __INSTR1..__INSTR4.
 */
/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name) __INSTR6(0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
2199
/* Per-opcode builder functions, generated from the INSTR* macros above. */

/* cat0 instructions: */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2220
2221 static inline struct ir3_instruction *
2222 ir3_ELECT_MACRO(struct ir3_block *block)
2223 {
2224 struct ir3_instruction *instr =
2225 ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2226 __ssa_dst(instr);
2227 return instr;
2228 }
2229
2230 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2231 ir3_SHPS_MACRO(struct ir3_block *block)
2232 {
2233 struct ir3_instruction *instr =
2234 ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2235 __ssa_dst(instr);
2236 return instr;
2237 }
2238
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions (three-source multiply-add / select family): */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions (single-source transcendentals): */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions (derivatives etc; sample instructions are built by
 * ir3_SAM() below):
 */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2329
2330 static inline struct ir3_instruction *
2331 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2332 unsigned flags, struct ir3_instruction *samp_tex,
2333 struct ir3_instruction *src0, struct ir3_instruction *src1)
2334 {
2335 struct ir3_instruction *sam;
2336 unsigned nreg = 0;
2337
2338 if (flags & IR3_INSTR_S2EN) {
2339 nreg++;
2340 }
2341 if (src0) {
2342 nreg++;
2343 }
2344 if (src1) {
2345 nreg++;
2346 }
2347
2348 sam = ir3_instr_create(block, opc, 1, nreg);
2349 sam->flags |= flags;
2350 __ssa_dst(sam)->wrmask = wrmask;
2351 if (flags & IR3_INSTR_S2EN) {
2352 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2353 }
2354 if (src0) {
2355 __ssa_src(sam, src0, 0);
2356 }
2357 if (src1) {
2358 __ssa_src(sam, src1, 0);
2359 }
2360 sam->cat5.type = type;
2361
2362 return sam;
2363 }
2364
/* cat6 instructions (load/store/atomic): */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
/* Image/SSBO access and global atomics differ per GPU generation: */
#if GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif

/* cat7 instructions (barriers/fences): */
INSTR0(BAR)
INSTR0(FENCE)
2446
/* ************************************************************************* */
#include "bitset.h"

/* Number of registers in one register file. */
#define MAX_REG 256

/* 2 * MAX_REG bits: holds either two separate half/full register files
 * (pre-a6xx) or one merged file tracked at half-register granularity
 * (a6xx+) -- see __regmask_get() below for the layout.
 */
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);

typedef struct {
   bool mergedregs; /* true for the a6xx+ merged register file */
   regmaskstate_t mask;
} regmask_t;
2458
2459 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2460 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2461 {
2462 if (regmask->mergedregs) {
2463 /* a6xx+ case, with merged register file, we track things in terms
2464 * of half-precision registers, with a full precisions register
2465 * using two half-precision slots.
2466 *
2467 * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2468 * avoid having them alias normal full regs.
2469 */
2470 if (half && !is_reg_num_special(n)) {
2471 return BITSET_TEST(regmask->mask, n);
2472 } else {
2473 n *= 2;
2474 return BITSET_TEST(regmask->mask, n) ||
2475 BITSET_TEST(regmask->mask, n + 1);
2476 }
2477 } else {
2478 /* pre a6xx case, with separate register file for half and full
2479 * precision:
2480 */
2481 if (half)
2482 n += MAX_REG;
2483 return BITSET_TEST(regmask->mask, n);
2484 }
2485 }
2486
2487 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2488 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2489 {
2490 if (regmask->mergedregs) {
2491 /* a6xx+ case, with merged register file, we track things in terms
2492 * of half-precision registers, with a full precisions register
2493 * using two half-precision slots:
2494 */
2495 if (half && !is_reg_num_special(n)) {
2496 BITSET_SET(regmask->mask, n);
2497 } else {
2498 n *= 2;
2499 BITSET_SET(regmask->mask, n);
2500 BITSET_SET(regmask->mask, n + 1);
2501 }
2502 } else {
2503 /* pre a6xx case, with separate register file for half and full
2504 * precision:
2505 */
2506 if (half)
2507 n += MAX_REG;
2508 BITSET_SET(regmask->mask, n);
2509 }
2510 }
2511
2512 static inline void
__regmask_clear(regmask_t * regmask,bool half,unsigned n)2513 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
2514 {
2515 if (regmask->mergedregs) {
2516 /* a6xx+ case, with merged register file, we track things in terms
2517 * of half-precision registers, with a full precisions register
2518 * using two half-precision slots:
2519 */
2520 if (half && !is_reg_num_special(n)) {
2521 BITSET_CLEAR(regmask->mask, n);
2522 } else {
2523 n *= 2;
2524 BITSET_CLEAR(regmask->mask, n);
2525 BITSET_CLEAR(regmask->mask, n + 1);
2526 }
2527 } else {
2528 /* pre a6xx case, with separate register file for half and full
2529 * precision:
2530 */
2531 if (half)
2532 n += MAX_REG;
2533 BITSET_CLEAR(regmask->mask, n);
2534 }
2535 }
2536
2537 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2538 regmask_init(regmask_t *regmask, bool mergedregs)
2539 {
2540 memset(®mask->mask, 0, sizeof(regmask->mask));
2541 regmask->mergedregs = mergedregs;
2542 }
2543
2544 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2545 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2546 {
2547 assert(dst->mergedregs == a->mergedregs);
2548 assert(dst->mergedregs == b->mergedregs);
2549
2550 for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2551 dst->mask[i] = a->mask[i] | b->mask[i];
2552 }
2553
/* dst = a | (b restricted to the shared registers).
 *
 * The range 4*48 .. 4*56-1 covers register numbers for r48.x .. r55.w
 * (4 components per register); it is doubled when `b` tracks the merged
 * register file at half-register granularity.
 */
static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   regmaskstate_t shared_mask;
   BITSET_ZERO(shared_mask);

   if (b->mergedregs) {
      BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
   } else {
      BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
}
2569
2570 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2571 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2572 {
2573 bool half = reg->flags & IR3_REG_HALF;
2574 if (reg->flags & IR3_REG_RELATIV) {
2575 for (unsigned i = 0; i < reg->size; i++)
2576 __regmask_set(regmask, half, reg->array.base + i);
2577 } else {
2578 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2579 if (mask & 1)
2580 __regmask_set(regmask, half, n);
2581 }
2582 }
2583
2584 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)2585 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
2586 {
2587 bool half = reg->flags & IR3_REG_HALF;
2588 if (reg->flags & IR3_REG_RELATIV) {
2589 for (unsigned i = 0; i < reg->size; i++)
2590 __regmask_clear(regmask, half, reg->array.base + i);
2591 } else {
2592 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2593 if (mask & 1)
2594 __regmask_clear(regmask, half, n);
2595 }
2596 }
2597
2598 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2599 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2600 {
2601 bool half = reg->flags & IR3_REG_HALF;
2602 if (reg->flags & IR3_REG_RELATIV) {
2603 for (unsigned i = 0; i < reg->size; i++)
2604 if (__regmask_get(regmask, half, reg->array.base + i))
2605 return true;
2606 } else {
2607 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2608 if (mask & 1)
2609 if (__regmask_get(regmask, half, n))
2610 return true;
2611 }
2612 return false;
2613 }
2614 /* ************************************************************************* */
2615
2616 #endif /* IR3_H_ */
2617