/*
 * Copyright © 2012 Rob Clark <robdclark@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static bool
is_shared_consts(struct ir3_compiler *compiler,
                 const struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
       reg->flags & IR3_REG_CONST) {
      uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
      uint32_t max_const_reg =
         regid(compiler->shared_consts_base_offset +
               compiler->shared_consts_size, 0);
      return reg->num >= min_const_reg && reg->num < max_const_reg;
   }

   return false;
}

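/* Illustrative note (not from the original source): assuming regid(num, comp)
 * packs the component index into the low two bits, i.e. (num << 2) | comp,
 * the range check in is_shared_consts() above treats base_offset = N,
 * size = 8 as covering c[N].x .. c[N+7].w, so any const source falling in
 * that window is a shared const.
 */
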
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   /* Shared consts don't need to be included in constlen. */
   if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
      return;

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + components - 1);
   }

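   /* Illustrative note (not from the original source): the scalar reg id
    * packs the component index in its low two bits, so "max >> 2" converts
    * it to a vec4 register index. With merged registers, two half regs
    * alias one full reg, hence the extra shift ("max >> 3") below.
    */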
   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize, respect that. */
   if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}
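
/* Illustrative example (not from the original source): on a6xx, where
 * threadsize_base is 64, a fixed-size 8x8x1 = 64-thread workgroup fits in a
 * single wave, so the single threadsize is kept; a 16x16x1 = 256-thread
 * workgroup takes the doubled threadsize, provided the doubled register
 * demand still fits in reg_size_vec4.
 */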

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* If we have a compute shader with a big workgroup, a barrier, and a
       * branchstack that limits max_waves, we may be unable to run all waves
       * of the workgroup concurrently, which would lead to a hang.
       *
       * TODO: Could we spill branchstack or is there another way around this?
       * The blob just explodes in such a case.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s) which has workgroup barrier cannot be used "
            "because it's impossible to have enough concurrent waves.",
            v->name);
         exit(1);
      }
   }

   return max_waves;
}
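
/* Illustrative example (not from the original source, with made-up sizes):
 * with shared_size = 5000 bytes, ALIGN_POT rounds shared_per_wg up to 5120;
 * a local_mem_size of 32768 then allows 6 workgroups per core, so max_waves
 * is capped at waves_per_wg * 6 * wave_granularity.
 */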

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}
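
/* Illustrative example (not from the original source, with made-up sizes):
 * if reg_size_vec4 = 96 and a wave needs reg_count = 8 vec4 regs, a single
 * threadsize allows 96/8 = 12 waves per granule; doubling the threadsize
 * doubles the per-wave register footprint, halving that to 6.
 */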

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;
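   /* Note (added for clarity): each instruction slot is 64 bits wide, hence
    * the multiply by 8 above to get a size in bytes and the divide by 4 to
    * express it in 32-bit dwords.
    */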

   info->early_preamble = v->early_preamble;

   bool in_preamble = false;
   bool has_eq = false;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;

            /* This covers any multi-component access that could straddle
             * across multiple double-words.
             */
            if (components > 1)
               info->multi_dword_ldp_stp = true;

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
            info->last_helper = info->instrs_count;
            has_eq = true;
         }

         if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
             instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
            info->last_helper = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else if (!is_meta(instr)) {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         } else {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            info->preamble_instrs_count += instrs_count;
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* For the vertex shader, inputs are loaded into registers before the
    * shader is executed, so max_regs from the shader instructions might not
    * properly reflect the number of registers actually used, especially in
    * the case of passthrough varyings.
    *
    * Likewise, for the fragment shader, we can have some regs which are
    * passed input values but never touched by the resulting shader (ie. as
    * a result of dead code elimination, or simply because we don't know how
    * to turn the reg off).
    */
   for (unsigned i = 0; i < v->inputs_count; i++) {
      /* skip frag inputs fetched via bary.f, since their regs are not
       * written by the gpu before the shader starts (and in fact the
       * regids might not even be valid)
       */
      if (v->inputs[i].bary)
         continue;

      /* ignore high regs that are global to all threads in a warp
       * (they exist by default) (a5xx+)
       */
      if (v->inputs[i].regid >= regid(48, 0))
         continue;

      if (v->inputs[i].compmask) {
         unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
         int32_t regid = v->inputs[i].regid + n;
         if (v->inputs[i].half) {
            if (!v->mergedregs) {
               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
            } else {
               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
            }
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
         }
      }
   }

   for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
      unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
      int32_t regid = v->sampler_prefetch[i].dst + n;
      if (v->sampler_prefetch[i].half_precision) {
         if (!v->mergedregs) {
            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
         }
      } else {
         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
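   /* Note (added for clarity): max_half_reg + 1 half regs round up to
    * (max_half_reg + 2) / 2 full-reg slots, since on gen6+ the merged
    * register file packs two half regs into each full reg.
    */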

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);

   /* TODO this is different for earlier gens, but earlier gens don't use this */
   info->subgroup_size = v->info.double_threadsize ? 128 : 64;

   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

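/* Note (added for clarity): util/list's list_add() inserts right after the
 * given node and list_addtail() right before it, so the four cursor options
 * below map to: prepend to block, append to block, insert before an
 * instruction, and insert after an instruction, respectively.
 */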
static void
insert_instr(struct ir3_cursor cursor, struct ir3_instruction *instr)
{
   struct ir3 *shader = instr->block->shader;

   instr->serialno = ++shader->instr_count;

   switch (cursor.option) {
   case IR3_CURSOR_BEFORE_BLOCK:
      list_add(&instr->node, &cursor.block->instr_list);
      break;
   case IR3_CURSOR_AFTER_BLOCK:
      list_addtail(&instr->node, &cursor.block->instr_list);
      break;
   case IR3_CURSOR_BEFORE_INSTR:
      list_addtail(&instr->node, &cursor.instr->node);
      break;
   case IR3_CURSOR_AFTER_INSTR:
      list_add(&instr->node, &cursor.instr->node);
      break;
   }

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#if MESA_DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

static struct ir3_instruction *
block_get_last_instruction(struct ir3_block *block)
{
   if (list_is_empty(&block->instr_list))
      return NULL;
   return list_last_entry(&block->instr_list, struct ir3_instruction, node);
}

struct ir3_instruction *
ir3_block_get_terminator(struct ir3_block *block)
{
   struct ir3_instruction *last = block_get_last_instruction(block);

   if (last && is_terminator(last))
      return last;

   return NULL;
}

struct ir3_instruction *
ir3_block_take_terminator(struct ir3_block *block)
{
   struct ir3_instruction *terminator = ir3_block_get_terminator(block);

   if (terminator)
      list_delinit(&terminator->node);

   return terminator;
}

struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block)
{
   struct ir3_instruction *last = block_get_last_instruction(block);

   if (!last)
      return NULL;

   if (!is_terminator(last))
      return last;

   if (last->node.prev != &block->instr_list)
      return list_entry(last->node.prev, struct ir3_instruction, node);

   return NULL;
}

struct ir3_instruction *
ir3_block_get_last_phi(struct ir3_block *block)
{
   struct ir3_instruction *last_phi = NULL;

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI)
         break;

      last_phi = instr;
   }

   return last_phi;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_link_physical(struct ir3_block *pred,
                        struct ir3_block *succ)
{
   array_insert(pred, pred->physical_successors, succ);
   array_insert(succ, succ->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

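/* Note (added for clarity): an instruction and its dst/src register-pointer
 * arrays are carved out of a single allocation below, with the dsts array
 * placed immediately after the instruction and the srcs array after that.
 */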
static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#if MESA_DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   list_inithead(&instr->rpt_node);
   return instr;
}

static void
add_to_address_users(struct ir3_instruction *instr)
{
   assert(instr->address != NULL);

   struct ir3 *ir = instr->block->shader;
   struct ir3_register *addr_reg = instr->address->def;
   assert(reg_num(addr_reg) == REG_A0);
   unsigned comp = reg_comp(addr_reg);
   if (comp == 0) {
      array_insert(ir, ir->a0_users, instr);
   } else {
      assert(comp == 1);
      array_insert(ir, ir->a1_users, instr);
   }
}

static struct ir3_block *
get_block(struct ir3_cursor cursor)
{
   switch (cursor.option) {
   case IR3_CURSOR_BEFORE_BLOCK:
   case IR3_CURSOR_AFTER_BLOCK:
      return cursor.block;
   case IR3_CURSOR_BEFORE_INSTR:
   case IR3_CURSOR_AFTER_INSTR:
      return cursor.instr->block;
   }

   unreachable("illegal cursor option");
}

struct ir3_instruction *
ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc, int ndst, int nsrc)
{
   struct ir3_block *block = get_block(cursor);
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(cursor, instr);
   return instr;
}

struct ir3_instruction *
ir3_build_instr(struct ir3_builder *builder, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr =
      ir3_instr_create_at(builder->cursor, opc, ndst, nsrc);

   /* During instruction selection, instructions are sometimes emitted to blocks
    * other than the current one. For example, to predecessor blocks for phi
    * sources or to the first block for inputs. For those cases, a new builder
    * is created to emit at the end of the target block. However, if the target
    * block happens to be the same as the current block, the main builder would
    * not be updated to point past the new instructions. Therefore, don't update
    * the cursor when it points to the end of a block to ensure that new
    * instructions will always be added at the end.
    */
   if (builder->cursor.option != IR3_CURSOR_AFTER_BLOCK) {
      builder->cursor = ir3_after_instr(instr);
   }

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   return ir3_instr_create_at(ir3_before_terminator(block), opc, ndst, nsrc);
}

struct ir3_instruction *
ir3_instr_create_at_end(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   return ir3_instr_create_at(ir3_after_block(block), opc, ndst, nsrc);
}

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;
   list_inithead(&new_instr->rpt_node);

   insert_instr(ir3_before_terminator(instr->block), new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
      add_to_address_users(new_instr);
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

void
ir3_instr_remove(struct ir3_instruction *instr)
{
   list_delinit(&instr->node);
   list_delinit(&instr->rpt_node);
}

void
ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n)
{
   assert(n > 0 && !ir3_instr_is_rpt(instrs[0]));

   for (unsigned i = 1; i < n; ++i) {
      assert(!ir3_instr_is_rpt(instrs[i]));
      assert(instrs[i]->serialno > instrs[i - 1]->serialno);

      list_addtail(&instrs[i]->rpt_node, &instrs[0]->rpt_node);
   }
}

bool
ir3_instr_is_rpt(const struct ir3_instruction *instr)
{
   return !list_is_empty(&instr->rpt_node);
}

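/* Note (added for clarity): a repeat group is a circular list linked through
 * rpt_node, built in increasing serialno order. For the first instruction of
 * the group, the previous node in the cycle is the group's tail, so its
 * serialno is larger, which is exactly what the check below tests.
 */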
bool
ir3_instr_is_first_rpt(const struct ir3_instruction *instr)
{
   if (!ir3_instr_is_rpt(instr))
      return false;

   struct ir3_instruction *prev_rpt =
      list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
   return prev_rpt->serialno > instr->serialno;
}

struct ir3_instruction *
ir3_instr_prev_rpt(const struct ir3_instruction *instr)
{
   assert(ir3_instr_is_rpt(instr));

   if (ir3_instr_is_first_rpt(instr))
      return NULL;
   return list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
}

struct ir3_instruction *
ir3_instr_first_rpt(struct ir3_instruction *instr)
{
   assert(ir3_instr_is_rpt(instr));

   while (!ir3_instr_is_first_rpt(instr)) {
      instr = ir3_instr_prev_rpt(instr);
      assert(instr);
   }

   return instr;
}

unsigned
ir3_instr_rpt_length(const struct ir3_instruction *instr)
{
   assert(ir3_instr_is_first_rpt(instr));

   return list_length(&instr->rpt_node) + 1;
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#if MESA_DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#if MESA_DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      add_to_address_users(instr);
   } else {
      assert(instr->address->def->instr == addr);
   }
}

/* Does this instruction use the scalar ALU?
 */
bool
is_scalar_alu(struct ir3_instruction *instr,
              const struct ir3_compiler *compiler)
{
   /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
    */
   return instr->opc != OPC_MOVMSK &&
      instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
      instr->opc != OPC_SCAN_MACRO &&
      is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
      /* scalar->scalar mov instructions (but NOT cov) were supported before the
       * scalar ALU was supported, but they still required (ss) whereas on GPUs
       * that have a scalar ALU they are executed on it and do not require (ss).
       * We have to be careful to return false for these if scalar ALU isn't
       * supported, so that we treat them like vector->scalar mov instructions
       * (such as requiring (ss)).
       */
      compiler->has_scalar_alu &&
      /* moves from normal to shared seem to use a separate ALU as before and
       * require a (ss) on dependent instructions.
       */
      ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
       (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

unsigned
ir3_count_instructions_sched(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         if (!is_terminator(instr))
            instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ips are assumed to be actual ips of the final
 * program, so it would be incorrect to use this everywhere.
 */
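
/* Illustrative example (not from the original source): for a block holding
 * two instructions, the loop below assigns start_ip = 1, instruction ips 2
 * and 3, and end_ip = 4, so live-in values can interfere at ip 1 without
 * touching the ip of any real instruction.
 */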

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter)
{
   /* We could do this in a single pass if we could assume instructions are
    * always sorted, which currently might not always be true (in particular
    * after the ir3_group pass, but maybe other places).
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (!filter(instr, n))
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

static bool
no_false_deps(struct ir3_instruction *instr, unsigned src_n)
{
   return !__is_false_dep(instr, src_n);
}

static bool
any_src(struct ir3_instruction *instr, unsigned src_n)
{
   return true;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   if (falsedeps)
      return ir3_find_ssa_uses_for(ir, mem_ctx, any_src);
   return ir3_find_ssa_uses_for(ir, mem_ctx, no_false_deps);
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types.  Other than covs that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value;
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}
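
/* Illustrative example (not from the original source): a full-precision
 * immediate of 0x3f800000 (1.0f) maps to FLUT index 2 above, so the value
 * can be encoded via the lookup table instead of a full 32-bit immediate;
 * 0x3f8ccccd (1.1f) is not in the table, and ir3_flut() returns -1.
 */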

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      /* Except for immed/const sources, source and dest shared-ness must match.
       */
      if (!(flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
          (flags & IR3_REG_SHARED) != (instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         if (n == 0)
            return flags == 0;
         else
            return flags == IR3_REG_SHARED;
         break;
      case OPC_SCAN_CLUSTERS_MACRO:
         if (n == 0)
            return flags == IR3_REG_SHARED;
         else
            return flags == 0;
         break;
      default: {
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;

         /* floating-point conversions when moving from non-shared to shared
          * seem not to work. We only use floating-point types in ir3 for
          * conversions, so don't bother specially handling the case where the
          * types are equal. Same goes for 8-bit sign extension.
          */
         if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
             !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
             ((full_type(instr->cat1.src_type) == TYPE_F32 ||
               full_type(instr->cat1.dst_type) == TYPE_F32) ||
              (instr->cat1.src_type == TYPE_U8 &&
               full_type(instr->cat1.dst_type) == TYPE_S32)))
            return false;

         /* Conversions seem not to work in shared->shared copies before scalar
          * ALU is supported.
          */
         if (!compiler->has_scalar_alu &&
             (flags & IR3_REG_SHARED) &&
             (instr->dsts[0]->flags & IR3_REG_SHARED) &&
             instr->cat1.src_type != instr->cat1.dst_type)
            return false;
      }
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      /* cat2/cat3 scalar ALU instructions must not have regular sources. */
      if (instr->dsts[0]->flags & IR3_REG_SHARED) {
         if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
            return false;
      }

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if (instr->dsts[0]->flags & IR3_REG_SHARED) {
               if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
                  return false;
            } else {
               if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                   (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
                  return false;
            }
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc, n) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         if (n != 1) {
            valid_flags |= IR3_REG_IMMED;
         }

         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;

         if (!(instr->dsts[0]->flags & IR3_REG_SHARED) && n < 2) {
            /* Of the first two sources, only one can be shared. */
            unsigned m = n ^ 1;

            if ((flags & IR3_REG_SHARED) &&
                (instr->srcs[m]->flags & IR3_REG_SHARED)) {
               return false;
            }
         }
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_RELATIV) ||
          (!(instr->dsts[0]->flags & IR3_REG_SHARED) &&
           (flags & IR3_REG_SHARED))) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      if (instr->dsts[0]->flags & IR3_REG_SHARED) {
         if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
            return false;
      }

      break;
   case 4:
      if ((instr->dsts[0]->flags & IR3_REG_SHARED) != (flags & IR3_REG_SHARED))
         return false;
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
         if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
             (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
            return flags == IR3_REG_IMMED;
         }
      }
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;

      if (instr->opc == OPC_STC && n == 1)
         valid_flags |= IR3_REG_SHARED;
      if (instr->opc == OPC_SHFL) {
         if (n == 0)
            valid_flags &= ~IR3_REG_IMMED;
         else if (n == 1)
            valid_flags |= IR3_REG_SHARED;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         if (instr->opc == OPC_STC && n != 0)
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
            if (n != 0 && n != 2)
               return false;
            break;
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* The alternative cat3 encoding used for sh[lr][gm]/andg uses 12 bit
    * immediates that won't be sign-extended.
    */
   if (is_cat3_alt(instr->opc)) {
      return !(immed & ~0xfff);
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}
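
/* Illustrative example (not from the original source): for a typical ALU
 * instruction the final check above accepts immed values in roughly the
 * range -511..511 (e.g. 511 and -300 pass, 512 fails), while an 8-bit-only
 * cat6 source accepts 0..255.
 */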

struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr)
{
   /* If instr is a negation (likely as a result of an nir_b2n), we can ignore
    * that and use its source, since the nonzero-ness stays the same.
    */
   if (instr->opc == OPC_ABSNEG_S && instr->flags == 0 &&
       (instr->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
          IR3_REG_SNEG) {
      return instr->srcs[0]->def->instr;
   }

   return instr;
}

bool
ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc)
{
   switch (opc_cat(opc)) {
   case 0:
      return opc == OPC_NOP;
   case 1:
      return opc == OPC_MOV || opc == OPC_SWZ || opc == OPC_MOVMSK;
   case 2:
      if (opc == OPC_BARY_F && !compiler->has_rpt_bary_f)
         return false;
      return true;
   case 3:
      return opc != OPC_DP2ACC && opc != OPC_DP4ACC;
   case 4:
      return opc != OPC_RCP;
   default:
      return false;
   }
}