1 /*
2  * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "ir3.h"
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "util/bitscan.h"
34 #include "util/half_float.h"
35 #include "util/ralloc.h"
36 #include "util/u_math.h"
37 
38 #include "instr-a3xx.h"
39 #include "ir3_shader.h"
40 
41 /* simple allocator to carve allocations out of an up-front allocated heap,
42  * so that we can free everything easily in one shot.
43  */
44 void *
45 ir3_alloc(struct ir3 *shader, int sz)
46 {
47    return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
48 }
49 
50 struct ir3 *
51 ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
52 {
53    struct ir3 *shader = rzalloc(v, struct ir3);
54 
55    shader->compiler = compiler;
56    shader->type = v->type;
57 
58    list_inithead(&shader->block_list);
59    list_inithead(&shader->array_list);
60 
61    return shader;
62 }
63 
64 void
65 ir3_destroy(struct ir3 *shader)
66 {
67    ralloc_free(shader);
68 }
69 
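/* Returns true when a const-file access falls inside the register range
 * reserved for shared (IR3_PUSH_CONSTS_SHARED) push constants; such accesses
 * are skipped when computing the shader's const footprint below.
 */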
70 static bool
71 is_shared_consts(struct ir3_compiler *compiler,
72                  struct ir3_const_state *const_state,
73                  struct ir3_register *reg)
74 {
75    if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
76        reg->flags & IR3_REG_CONST) {
77       uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
78       uint32_t max_const_reg =
79          regid(compiler->shared_consts_base_offset +
80                compiler->shared_consts_size, 0);
81       return reg->num >= min_const_reg && reg->num < max_const_reg;
82    }
83 
84    return false;
85 }
86 
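/* Track the highest GPR, half-GPR, and const register touched by a single
 * register access (accounting for (rpt), writemask, and relative addressing),
 * accumulating the result into ir3_info.
 */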
87 static void
88 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
89                  struct ir3_info *info)
90 {
91    struct ir3_shader_variant *v = info->data;
92    unsigned repeat = instr->repeat;
93 
94    if (reg->flags & IR3_REG_IMMED) {
95       /* nothing to do */
96       return;
97    }
98 
99    /* Shared consts don't need to be included in constlen. */
100    if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
101       return;
102 
103    if (!(reg->flags & IR3_REG_R)) {
104       repeat = 0;
105    }
106 
107    unsigned components;
108    int16_t max;
109 
110    if (reg->flags & IR3_REG_RELATIV) {
111       components = reg->size;
112       max = (reg->array.base + components - 1);
113    } else {
114       components = util_last_bit(reg->wrmask);
115       max = (reg->num + repeat + components - 1);
116    }
117 
118    if (reg->flags & IR3_REG_CONST) {
119       info->max_const = MAX2(info->max_const, max >> 2);
120    } else if (max < regid(48, 0)) {
121       if (reg->flags & IR3_REG_HALF) {
122          if (v->mergedregs) {
123             /* starting w/ a6xx, half regs conflict with full regs: */
124             info->max_reg = MAX2(info->max_reg, max >> 3);
125          } else {
126             info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
127          }
128       } else {
129          info->max_reg = MAX2(info->max_reg, max >> 2);
130       }
131    }
132 }
133 
134 bool
135 ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
136 {
137    const struct ir3_compiler *compiler = v->compiler;
138 
139    /* If the user forced a particular wavesize respect that. */
140    if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
141       return false;
142    if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
143       return true;
144 
145    /* We can't support more than compiler->branchstack_size diverging threads
146     * in a wave. Thus, doubling the threadsize is only possible if we don't
147     * exceed the branchstack size limit.
148     */
149    if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
150        compiler->branchstack_size) {
151       return false;
152    }
153 
154    switch (v->type) {
155    case MESA_SHADER_KERNEL:
156    case MESA_SHADER_COMPUTE: {
157       unsigned threads_per_wg =
158          v->local_size[0] * v->local_size[1] * v->local_size[2];
159 
160       /* For a5xx, if the workgroup size is greater than the maximum number
161        * of threads per core with 32 threads per wave (512) then we have to
162        * use the doubled threadsize because otherwise the workgroup wouldn't
163        * fit. For smaller workgroup sizes, we follow the blob and use the
164        * smaller threadsize.
165        */
166       if (compiler->gen < 6) {
167          return v->local_size_variable ||
168                 threads_per_wg >
169                    compiler->threadsize_base * compiler->max_waves;
170       }
171 
172       /* On a6xx, we prefer the larger threadsize unless the workgroup is
173        * small enough that it would be useless. Note that because
174        * threadsize_base is bumped to 64, we don't have to worry about the
175        * workgroup fitting, unlike the a5xx case.
176        */
177       if (!v->local_size_variable) {
178          if (threads_per_wg <= compiler->threadsize_base)
179             return false;
180       }
181    }
182       FALLTHROUGH;
183    case MESA_SHADER_FRAGMENT: {
184       /* Check that doubling the threadsize wouldn't exceed the regfile size */
185       return regs_count * 2 <= compiler->reg_size_vec4;
186    }
187 
188    default:
189       /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
190        * stages - the bit doesn't exist. The blob never used it for the VS
191        * on earlier gens anyway.
192        */
193       return false;
194    }
195 }
196 
197 /* Get the maximum number of waves that could be used even if this shader
198  * didn't use any registers.
199  */
200 unsigned
201 ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
202                                   bool double_threadsize)
203 {
204    const struct ir3_compiler *compiler = v->compiler;
205    unsigned max_waves = compiler->max_waves;
206 
207    /* Compute the limit based on branchstack */
208    if (v->branchstack > 0) {
209       unsigned branchstack_max_waves = compiler->branchstack_size /
210                                        v->branchstack *
211                                        compiler->wave_granularity;
212       max_waves = MIN2(max_waves, branchstack_max_waves);
213    }
214 
215    /* If this is a compute shader, compute the limit based on shared size */
216    if ((v->type == MESA_SHADER_COMPUTE) ||
217        (v->type == MESA_SHADER_KERNEL)) {
218       unsigned threads_per_wg =
219          v->local_size[0] * v->local_size[1] * v->local_size[2];
220       unsigned waves_per_wg =
221          DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
222                                          (double_threadsize ? 2 : 1) *
223                                          compiler->wave_granularity);
224 
225       /* Shared is allocated in chunks of 1k */
226       unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
227       if (shared_per_wg > 0 && !v->local_size_variable) {
228          unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
229 
230          max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
231                                         compiler->wave_granularity);
232       }
233 
234       /* If we have a compute shader with a big workgroup, a barrier, and a
235        * branchstack which limits max_waves, we may be unable to run all
236        * waves of the workgroup concurrently, which would lead to a hang.
237        *
238        * TODO: Could we spill the branchstack, or is there another way
239        * around this?
240        * The blob just explodes in such a case.
241        */
242       if (v->has_barrier && (max_waves < waves_per_wg)) {
243          mesa_loge(
244             "Compute shader (%s) which has workgroup barrier cannot be used "
245             "because it's impossible to have enough concurrent waves.",
246             v->name);
247          exit(1);
248       }
249    }
250 
251    return max_waves;
252 }
253 
254 /* Get the maximum number of waves that could be launched limited by reg size.
255  */
256 unsigned
257 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
258                                 unsigned reg_count, bool double_threadsize)
259 {
260    return reg_count ? (compiler->reg_size_vec4 /
261                        (reg_count * (double_threadsize ? 2 : 1)) *
262                        compiler->wave_granularity)
263                     : compiler->max_waves;
264 }
265 
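/* Walk the final instruction list and gather per-shader statistics:
 * instruction/nop/mov/cov counts, per-category counts, estimated (ss)/(sy)
 * stalls, register and const footprint, threadsize, and max wave count.
 */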
266 void
267 ir3_collect_info(struct ir3_shader_variant *v)
268 {
269    struct ir3_info *info = &v->info;
270    struct ir3 *shader = v->ir;
271    const struct ir3_compiler *compiler = v->compiler;
272 
273    memset(info, 0, sizeof(*info));
274    info->data = v;
275    info->max_reg = -1;
276    info->max_half_reg = -1;
277    info->max_const = -1;
278    info->multi_dword_ldp_stp = false;
279 
280    uint32_t instr_count = 0;
281    foreach_block (block, &shader->block_list) {
282       foreach_instr (instr, &block->instr_list) {
283          instr_count++;
284       }
285    }
286 
287    v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
288 
289    /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
290     * doesn't try to decode the following data as instructions (such as the
291     * next stage's shader in turnip)
292     */
293    info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
294    info->sizedwords = info->size / 4;
295 
296    bool in_preamble = false;
297    bool has_eq = false;
298 
299    foreach_block (block, &shader->block_list) {
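      /* sfu_delay/mem_delay approximate how many cycles remain before the
       * most recent (ss)- or (sy)-producing instruction completes; whatever
       * is left when a sync flag is encountered is counted as sstall/systall.
       */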
300       int sfu_delay = 0, mem_delay = 0;
301 
302       foreach_instr (instr, &block->instr_list) {
303 
304          foreach_src (reg, instr) {
305             collect_reg_info(instr, reg, info);
306          }
307 
308          foreach_dst (reg, instr) {
309             if (is_dest_gpr(reg)) {
310                collect_reg_info(instr, reg, info);
311             }
312          }
313 
314          if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
315             unsigned components = instr->srcs[2]->uim_val;
316             if (components * type_size(instr->cat6.type) > 32) {
317                info->multi_dword_ldp_stp = true;
318             }
319 
320             if (instr->opc == OPC_STP)
321                info->stp_count += components;
322             else
323                info->ldp_count += components;
324          }
325 
326          if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
327              (instr->dsts[0]->flags & IR3_REG_EI))
328             info->last_baryf = info->instrs_count;
329 
330          if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
331             info->last_helper = info->instrs_count;
332             has_eq = true;
333          }
334 
335          if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
336              instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
337             info->last_helper = info->instrs_count;
338 
339          if (instr->opc == OPC_SHPS)
340             in_preamble = true;
341 
342          /* Don't count instructions in the preamble for instruction-count type
343           * stats, because their effect should be much smaller.
344           * TODO: we should probably have separate stats for preamble
345           * instructions, but that would blow up the amount of stats...
346           */
347          if (!in_preamble) {
348             unsigned instrs_count = 1 + instr->repeat + instr->nop;
349             unsigned nops_count = instr->nop;
350 
351             if (instr->opc == OPC_NOP) {
352                nops_count = 1 + instr->repeat;
353                info->instrs_per_cat[0] += nops_count;
354             } else if (!is_meta(instr)) {
355                info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
356                info->instrs_per_cat[0] += nops_count;
357             }
358 
359             if (instr->opc == OPC_MOV) {
360                if (instr->cat1.src_type == instr->cat1.dst_type) {
361                   info->mov_count += 1 + instr->repeat;
362                } else {
363                   info->cov_count += 1 + instr->repeat;
364                }
365             }
366 
367             info->instrs_count += instrs_count;
368             info->nops_count += nops_count;
369 
370             if (instr->flags & IR3_INSTR_SS) {
371                info->ss++;
372                info->sstall += sfu_delay;
373                sfu_delay = 0;
374             }
375 
376             if (instr->flags & IR3_INSTR_SY) {
377                info->sy++;
378                info->systall += mem_delay;
379                mem_delay = 0;
380             }
381 
382             if (is_ss_producer(instr)) {
383                sfu_delay = soft_ss_delay(instr);
384             } else {
385                int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
386                sfu_delay -= n;
387             }
388 
389             if (is_sy_producer(instr)) {
390                mem_delay = soft_sy_delay(instr, shader);
391             } else {
392                int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
393                mem_delay -= n;
394             }
395          }
396 
397          if (instr->opc == OPC_SHPE)
398             in_preamble = false;
399       }
400    }
401 
402    /* For vertex shaders, the inputs are loaded into registers before the
403     * shader is executed, so max_regs from the shader instructions might not
404     * properly reflect the # of registers actually used, especially in the
405     * case of passthrough varyings.
406     *
407     * Likewise, for fragment shaders, we can have some regs which are passed
408     * input values but never touched by the resulting shader (ie. as a result
409     * of dead code elimination, or simply because we don't know how to turn
410     * the reg off).
411     */
412    for (unsigned i = 0; i < v->inputs_count; i++) {
413       /* skip frag inputs fetched via bary.f since their regs are
414        * not written by the gpu before the shader starts (and in fact
415        * the regids might not even be valid)
416        */
417       if (v->inputs[i].bary)
418          continue;
419 
420       /* ignore high regs that are global to all threads in a warp
421        * (they exist by default) (a5xx+)
422        */
423       if (v->inputs[i].regid >= regid(48, 0))
424          continue;
425 
426       if (v->inputs[i].compmask) {
427          unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
428          int32_t regid = v->inputs[i].regid + n;
429          if (v->inputs[i].half) {
430             if (!v->mergedregs) {
431                v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
432             } else {
433                v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
434             }
435          } else {
436             v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
437          }
438       }
439    }
440 
441    for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
442       unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
443       int32_t regid = v->sampler_prefetch[i].dst + n;
444       if (v->sampler_prefetch[i].half_precision) {
445          if (!v->mergedregs) {
446             v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
447          } else {
448             v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
449          }
450       } else {
451          v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
452       }
453    }
454 
455    /* TODO: for a5xx and below, is there a separate regfile for
456     * half-registers?
457     */
458    unsigned regs_count =
459       info->max_reg + 1 +
460       (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
461 
462    info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
463 
464    /* TODO this is different for earlier gens, but earlier gens don't use this */
465    info->subgroup_size = v->info.double_threadsize ? 128 : 64;
466 
467    unsigned reg_independent_max_waves =
468       ir3_get_reg_independent_max_waves(v, info->double_threadsize);
469    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
470       compiler, regs_count, info->double_threadsize);
471    info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
472    assert(info->max_waves <= v->compiler->max_waves);
473 }
474 
475 static struct ir3_register *
476 reg_create(struct ir3 *shader, int num, int flags)
477 {
478    struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
479    reg->wrmask = 1;
480    reg->flags = flags;
481    reg->num = num;
482    return reg;
483 }
484 
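/* Append an instruction to a block's instruction list, assigning it a
 * monotonically increasing serial number; input instructions (e.g. bary.f)
 * are additionally recorded in the shader's baryfs array.
 */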
485 static void
486 insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
487 {
488    struct ir3 *shader = block->shader;
489 
490    instr->serialno = ++shader->instr_count;
491 
492    list_addtail(&instr->node, &block->instr_list);
493 
494    if (is_input(instr))
495       array_insert(shader, shader->baryfs, instr);
496 }
497 
498 struct ir3_block *
499 ir3_block_create(struct ir3 *shader)
500 {
501    struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
502 #ifdef DEBUG
503    block->serialno = ++shader->block_count;
504 #endif
505    block->shader = shader;
506    list_inithead(&block->node);
507    list_inithead(&block->instr_list);
508    return block;
509 }
510 
511 void
512 ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
513 {
514    array_insert(block, block->predecessors, pred);
515 }
516 
517 void
518 ir3_block_link_physical(struct ir3_block *pred,
519                         struct ir3_block *succ)
520 {
521    array_insert(pred, pred->physical_successors, succ);
522    array_insert(succ, succ->physical_predecessors, pred);
523 }
524 
525 void
526 ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
527 {
528    for (unsigned i = 0; i < block->predecessors_count; i++) {
529       if (block->predecessors[i] == pred) {
530          if (i < block->predecessors_count - 1) {
531             block->predecessors[i] =
532                block->predecessors[block->predecessors_count - 1];
533          }
534 
535          block->predecessors_count--;
536          return;
537       }
538    }
539 }
540 
541 unsigned
542 ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
543 {
544    for (unsigned i = 0; i < block->predecessors_count; i++) {
545       if (block->predecessors[i] == pred) {
546          return i;
547       }
548    }
549 
550    unreachable("ir3_block_get_pred_index() invalid predecessor");
551 }
552 
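/* The instruction and its dst/src register-pointer arrays are carved out of
 * a single ir3_alloc() allocation, laid out as: instruction, ndst dst
 * pointers, then nsrc src pointers.
 */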
553 static struct ir3_instruction *
554 instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
555 {
556    /* Add extra sources for array destinations and the address reg */
557    if (1 <= opc_cat(opc))
558       nsrc += 2;
559    struct ir3_instruction *instr;
560    unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
561                  (nsrc * sizeof(instr->srcs[0]));
562    char *ptr = ir3_alloc(block->shader, sz);
563 
564    instr = (struct ir3_instruction *)ptr;
565    ptr += sizeof(*instr);
566    instr->dsts = (struct ir3_register **)ptr;
567    instr->srcs = instr->dsts + ndst;
568 
569 #ifdef DEBUG
570    instr->dsts_max = ndst;
571    instr->srcs_max = nsrc;
572 #endif
573 
574    return instr;
575 }
576 
577 struct ir3_instruction *
578 ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
579 {
580    struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
581    instr->block = block;
582    instr->opc = opc;
583    insert_instr(block, instr);
584    return instr;
585 }
586 
587 struct ir3_instruction *
588 ir3_instr_clone(struct ir3_instruction *instr)
589 {
590    struct ir3_instruction *new_instr = instr_create(
591       instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
592    struct ir3_register **dsts, **srcs;
593 
594    dsts = new_instr->dsts;
595    srcs = new_instr->srcs;
596    *new_instr = *instr;
597    new_instr->dsts = dsts;
598    new_instr->srcs = srcs;
599 
600    insert_instr(instr->block, new_instr);
601 
602    /* clone registers: */
603    new_instr->dsts_count = 0;
604    new_instr->srcs_count = 0;
605    foreach_dst (reg, instr) {
606       struct ir3_register *new_reg =
607          ir3_dst_create(new_instr, reg->num, reg->flags);
608       *new_reg = *reg;
609       if (new_reg->instr)
610          new_reg->instr = new_instr;
611    }
612    foreach_src (reg, instr) {
613       struct ir3_register *new_reg =
614          ir3_src_create(new_instr, reg->num, reg->flags);
615       *new_reg = *reg;
616    }
617 
618    if (instr->address) {
619       assert(instr->srcs_count > 0);
620       new_instr->address = new_instr->srcs[instr->srcs_count - 1];
621    }
622 
623    return new_instr;
624 }
625 
626 /* Add a false dependency to instruction, to ensure it is scheduled first: */
627 void
628 ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
629 {
630    for (unsigned i = 0; i < instr->deps_count; i++) {
631       if (instr->deps[i] == dep)
632          return;
633    }
634 
635    array_insert(instr, instr->deps, dep);
636 }
637 
638 struct ir3_register *
639 ir3_src_create(struct ir3_instruction *instr, int num, int flags)
640 {
641    struct ir3 *shader = instr->block->shader;
642 #ifdef DEBUG
643    assert(instr->srcs_count < instr->srcs_max);
644 #endif
645    struct ir3_register *reg = reg_create(shader, num, flags);
646    instr->srcs[instr->srcs_count++] = reg;
647    return reg;
648 }
649 
650 struct ir3_register *
651 ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
652 {
653    struct ir3 *shader = instr->block->shader;
654 #ifdef DEBUG
655    assert(instr->dsts_count < instr->dsts_max);
656 #endif
657    struct ir3_register *reg = reg_create(shader, num, flags);
658    instr->dsts[instr->dsts_count++] = reg;
659    return reg;
660 }
661 
662 struct ir3_register *
663 ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
664 {
665    struct ir3_register *new_reg = reg_create(shader, 0, 0);
666    *new_reg = *reg;
667    return new_reg;
668 }
669 
670 void
671 ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
672                        struct ir3_register *last_write)
673 {
674    assert(reg->flags & IR3_REG_ARRAY);
675    struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
676    *new_reg = *reg;
677    new_reg->def = last_write;
678    ir3_reg_tie(reg, new_reg);
679 }
680 
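/* Record the producer of the address register (a0.x or a1.x) used by an
 * instruction: an extra source referencing the address def is added, and the
 * instruction is appended to the shader's a0_users or a1_users list.
 */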
681 void
682 ir3_instr_set_address(struct ir3_instruction *instr,
683                       struct ir3_instruction *addr)
684 {
685    if (!instr->address) {
686       struct ir3 *ir = instr->block->shader;
687 
688       assert(instr->block == addr->block);
689 
690       instr->address =
691          ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
692       instr->address->def = addr->dsts[0];
693       assert(reg_num(addr->dsts[0]) == REG_A0);
694       unsigned comp = reg_comp(addr->dsts[0]);
695       if (comp == 0) {
696          array_insert(ir, ir->a0_users, instr);
697       } else {
698          assert(comp == 1);
699          array_insert(ir, ir->a1_users, instr);
700       }
701    } else {
702       assert(instr->address->def->instr == addr);
703    }
704 }
705 
706 void
707 ir3_block_clear_mark(struct ir3_block *block)
708 {
709    foreach_instr (instr, &block->instr_list)
710       instr->flags &= ~IR3_INSTR_MARK;
711 }
712 
713 void
714 ir3_clear_mark(struct ir3 *ir)
715 {
716    foreach_block (block, &ir->block_list) {
717       ir3_block_clear_mark(block);
718    }
719 }
720 
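/* Assign sequential ip's to every instruction and record each block's
 * [start_ip, end_ip) range.
 */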
721 unsigned
722 ir3_count_instructions(struct ir3 *ir)
723 {
724    unsigned cnt = 1;
725    foreach_block (block, &ir->block_list) {
726       block->start_ip = cnt;
727       foreach_instr (instr, &block->instr_list) {
728          instr->ip = cnt++;
729       }
730       block->end_ip = cnt;
731    }
732    return cnt;
733 }
734 
735 /* When counting instructions for RA, we insert extra fake instructions at the
736  * beginning of each block, where values become live, and at the end where
737  * values die. This prevents problems where values live-in at the beginning or
738  * live-out at the end of a block from being treated as if they were
739  * live-in/live-out at the first/last instruction, which would be incorrect.
740  * In ir3_legalize these ip's are assumed to be actual ip's of the final
741  * program, so it would be incorrect to use this everywhere.
742  */
743 
744 unsigned
745 ir3_count_instructions_ra(struct ir3 *ir)
746 {
747    unsigned cnt = 1;
748    foreach_block (block, &ir->block_list) {
749       block->start_ip = cnt++;
750       foreach_instr (instr, &block->instr_list) {
751          instr->ip = cnt++;
752       }
753       block->end_ip = cnt++;
754    }
755    return cnt;
756 }
757 
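/* Find the ir3_array with the given id in the shader's array list, or NULL
 * if no such array exists.
 */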
758 struct ir3_array *
759 ir3_lookup_array(struct ir3 *ir, unsigned id)
760 {
761    foreach_array (arr, &ir->array_list)
762       if (arr->id == id)
763          return arr;
764    return NULL;
765 }
766 
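/* Build each instruction's 'uses' set: the set of instructions that consume
 * its SSA value(s). False dependencies are only included when falsedeps is
 * set. The sets are allocated from mem_ctx.
 */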
767 void
768 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
769 {
770    /* We could do this in a single pass if we can assume instructions
771     * are always sorted.  Which currently might not always be true.
772     * (In particular after ir3_group pass, but maybe other places.)
773     */
774    foreach_block (block, &ir->block_list)
775       foreach_instr (instr, &block->instr_list)
776          instr->uses = NULL;
777 
778    foreach_block (block, &ir->block_list) {
779       foreach_instr (instr, &block->instr_list) {
780          foreach_ssa_src_n (src, n, instr) {
781             if (__is_false_dep(instr, n) && !falsedeps)
782                continue;
783             if (!src->uses)
784                src->uses = _mesa_pointer_set_create(mem_ctx);
785             _mesa_set_add(src->uses, instr);
786          }
787       }
788    }
789 }
790 
791 /**
792  * Set the destination type of an instruction, for example if a
793  * conversion is folded in, handling the special cases where the
794  * instruction's dest type or opcode needs to be fixed up.
795  */
796 void
797 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
798 {
799    if (half) {
800       instr->dsts[0]->flags |= IR3_REG_HALF;
801    } else {
802       instr->dsts[0]->flags &= ~IR3_REG_HALF;
803    }
804 
805    switch (opc_cat(instr->opc)) {
806    case 1: /* move instructions */
807       if (half) {
808          instr->cat1.dst_type = half_type(instr->cat1.dst_type);
809       } else {
810          instr->cat1.dst_type = full_type(instr->cat1.dst_type);
811       }
812       break;
813    case 4:
814       if (half) {
815          instr->opc = cat4_half_opc(instr->opc);
816       } else {
817          instr->opc = cat4_full_opc(instr->opc);
818       }
819       break;
820    case 5:
821       if (half) {
822          instr->cat5.type = half_type(instr->cat5.type);
823       } else {
824          instr->cat5.type = full_type(instr->cat5.type);
825       }
826       break;
827    }
828 }
829 
830 /**
831  * One-time fixup for instruction src-types.  Other than cov's that
832  * are folded, an instruction's src type does not change.
833  */
834 void
835 ir3_fixup_src_type(struct ir3_instruction *instr)
836 {
837    if (instr->srcs_count == 0)
838       return;
839 
840    switch (opc_cat(instr->opc)) {
841    case 1: /* move instructions */
842       if (instr->srcs[0]->flags & IR3_REG_HALF) {
843          instr->cat1.src_type = half_type(instr->cat1.src_type);
844       } else {
845          instr->cat1.src_type = full_type(instr->cat1.src_type);
846       }
847       break;
848    case 3:
849       if (instr->srcs[0]->flags & IR3_REG_HALF) {
850          instr->opc = cat3_half_opc(instr->opc);
851       } else {
852          instr->opc = cat3_full_opc(instr->opc);
853       }
854       break;
855    }
856 }
857 
858 /**
859  * Map a floating point immed to FLUT (float lookup table) value,
860  * returns negative for immediates that cannot be mapped.
861  */
862 int
863 ir3_flut(struct ir3_register *src_reg)
864 {
865    static const struct {
866       uint32_t f32;
867       uint16_t f16;
868    } flut[] = {
869          { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
870          { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
871          { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
872          { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
873          { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
874          { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
875          { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
876          { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
877          { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
878          { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
879          { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
880          { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
881    };
882 
883    if (src_reg->flags & IR3_REG_HALF) {
884       /* Note that half-float immeds are already lowered to 16b in nir: */
885       uint32_t imm = src_reg->uim_val;
886       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
887          if (flut[i].f16 == imm) {
888             return i;
889          }
890       }
891    } else {
892       uint32_t imm = src_reg->uim_val;
893       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
894          if (flut[i].f32 == imm) {
895             return i;
896          }
897       }
898    }
899 
900    return -1;
901 }
902 
903 static unsigned
904 cp_flags(unsigned flags)
905 {
906    /* only considering these flags (at least for now): */
907    flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
908              IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
909              IR3_REG_SHARED);
910    return flags;
911 }
912 
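/* Check whether source slot n of 'instr' can be encoded with the given
 * register flags (const/immed/abs/neg/relative/shared), e.g. when deciding
 * whether copy-propagation may fold such a source into the instruction.
 */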
913 bool
914 ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
915 {
916    struct ir3_compiler *compiler = instr->block->shader->compiler;
917    unsigned valid_flags;
918 
919    if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
920       return false;
921 
922    flags = cp_flags(flags);
923 
924    /* If destination is indirect, then source cannot be.. at least
925     * I don't think so..
926     */
927    if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
928        (flags & IR3_REG_RELATIV))
929       return false;
930 
931    if (flags & IR3_REG_RELATIV) {
932       /* TODO need to test on earlier gens.. pretty sure the earlier
933        * problem was just that we didn't check that the src was from
934        * same block (since we can't propagate address register values
935        * across blocks currently)
936        */
937       if (compiler->gen < 6)
938          return false;
939 
940       /* NOTE in the special try_swap_mad_two_srcs() case we can be
941        * called on a src that has already had an indirect load folded
942        * in, in which case ssa() returns NULL
943        */
944       if (instr->srcs[n]->flags & IR3_REG_SSA) {
945          struct ir3_instruction *src = ssa(instr->srcs[n]);
946          if (src->address->def->instr->block != instr->block)
947             return false;
948       }
949    }
950 
951    if (is_meta(instr)) {
952       /* collect and phi nodes support const/immed sources, which will be
953        * turned into move instructions, but not anything else.
954        */
955       if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
956          return false;
957 
958       if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
959          return false;
960 
961       return true;
962    }
963 
964    switch (opc_cat(instr->opc)) {
965    case 0: /* end, chmask */
966       return flags == 0;
967    case 1:
968       switch (instr->opc) {
969       case OPC_MOVMSK:
970       case OPC_SWZ:
971       case OPC_SCT:
972       case OPC_GAT:
973          valid_flags = IR3_REG_SHARED;
974          break;
975       case OPC_SCAN_MACRO:
976          return flags == 0;
977          break;
978       default:
979          valid_flags =
980             IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
981       }
982       if (flags & ~valid_flags)
983          return false;
984       break;
985    case 2:
986       valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
987                     IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;
988 
989       if (flags & ~valid_flags)
990          return false;
991 
992       /* Allow an immediate src1 for flat.b, since it's ignored */
993       if (instr->opc == OPC_FLAT_B &&
994           n == 1 && flags == IR3_REG_IMMED)
995          return true;
996 
997       if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
998          unsigned m = n ^ 1;
999          /* cannot deal w/ const or shared in both srcs:
1000           * (note that some cat2 actually only have a single src)
1001           */
1002          if (m < instr->srcs_count) {
1003             struct ir3_register *reg = instr->srcs[m];
1004             if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
1005                 (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
1006                return false;
1007             if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
1008                return false;
1009          }
1010       }
1011       break;
1012    case 3:
1013       valid_flags =
1014          ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;
1015 
1016       switch (instr->opc) {
1017       case OPC_SHRM:
1018       case OPC_SHLM:
1019       case OPC_SHRG:
1020       case OPC_SHLG:
1021       case OPC_ANDG: {
1022          valid_flags |= IR3_REG_IMMED;
1023          /* Can be RELATIV+CONST but not CONST: */
1024          if (flags & IR3_REG_RELATIV)
1025             valid_flags |= IR3_REG_CONST;
1026          break;
1027       }
1028       case OPC_WMM:
1029       case OPC_WMM_ACCU: {
1030          valid_flags = IR3_REG_SHARED;
1031          if (n == 2)
1032             valid_flags = IR3_REG_CONST;
1033          break;
1034       }
1035       case OPC_DP2ACC:
1036       case OPC_DP4ACC:
1037          break;
1038       default:
1039          valid_flags |= IR3_REG_CONST;
1040       }
1041 
1042       if (flags & ~valid_flags)
1043          return false;
1044 
1045       if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
1046          /* cannot deal w/ const/shared/relativ in 2nd src: */
1047          if (n == 1)
1048             return false;
1049       }
1050 
1051       break;
1052    case 4:
1053       /* seems like blob compiler avoids const as src.. */
1054       /* TODO double check if this is still the case on a4xx */
1055       if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
1056          return false;
1057       if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
1058          return false;
1059       break;
1060    case 5:
1061       /* no flags allowed */
1062       if (flags)
1063          return false;
1064       break;
1065    case 6:
1066       valid_flags = IR3_REG_IMMED;
1067       if (flags & ~valid_flags)
1068          return false;
1069 
1070       if (flags & IR3_REG_IMMED) {
1071          /* doesn't seem like we can have immediate src for store
1072           * instructions:
1073           *
1074           * TODO this restriction could also apply to load instructions,
1075           * but for load instructions this arg is the address (and not
1076           * really sure any good way to test a hard-coded immed addr src)
1077           */
1078          if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
1079             return false;
1080 
1081          if ((instr->opc == OPC_LDL) && (n == 0))
1082             return false;
1083 
1084          if ((instr->opc == OPC_STL) && (n != 2))
1085             return false;
1086 
1087          if ((instr->opc == OPC_LDP) && (n == 0))
1088             return false;
1089 
1090          if ((instr->opc == OPC_STP) && (n != 2))
1091             return false;
1092 
1093          if (instr->opc == OPC_STLW && n == 0)
1094             return false;
1095 
1096          if (instr->opc == OPC_LDLW && n == 0)
1097             return false;
1098 
1099          /* disallow immediates in anything but the SSBO slot argument for
1100           * cat6 instructions:
1101           */
1102          if (is_global_a3xx_atomic(instr->opc) && (n != 0))
1103             return false;
1104 
1105          if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
1106              is_bindless_atomic(instr->opc))
1107             return false;
1108 
1109          if (instr->opc == OPC_STG && (n == 2))
1110             return false;
1111 
1112          if (instr->opc == OPC_STG_A && (n == 4))
1113             return false;
1114 
1115          if (instr->opc == OPC_LDG && (n == 0))
1116             return false;
1117 
1118          if (instr->opc == OPC_LDG_A && (n < 2))
1119             return false;
1120 
1121          /* as with atomics, these cat6 instrs can only have an immediate
1122           * for SSBO/IBO slot argument
1123           */
1124          switch (instr->opc) {
1125          case OPC_LDIB:
1126          case OPC_STIB:
1127          case OPC_RESINFO:
1128             if (n != 0)
1129                return false;
1130             break;
1131          default:
1132             break;
1133          }
1134       }
1135 
1136       break;
1137    }
1138 
1139    return true;
1140 }
1141 
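/* Check whether 'immed' fits in the immediate encoding available for sources
 * of 'instr' (any value for mov/meta, offset/size immediates for some
 * load/store instructions, otherwise 8 or 10 bits).
 */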
1142 bool
1143 ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
1144 {
1145    if (instr->opc == OPC_MOV || is_meta(instr))
1146       return true;
1147 
1148    if (is_mem(instr)) {
1149       switch (instr->opc) {
1150       /* Some load/store instructions have a 13-bit offset and size which must
1151        * always be an immediate and the rest of the sources cannot be
1152        * immediates, so the frontend is responsible for checking the size:
1153        */
1154       case OPC_LDL:
1155       case OPC_STL:
1156       case OPC_LDP:
1157       case OPC_STP:
1158       case OPC_LDG:
1159       case OPC_STG:
1160       case OPC_SPILL_MACRO:
1161       case OPC_RELOAD_MACRO:
1162       case OPC_LDG_A:
1163       case OPC_STG_A:
1164       case OPC_LDLW:
1165       case OPC_STLW:
1166       case OPC_LDLV:
1167          return true;
1168       default:
1169          /* most cat6 src immediates can only encode 8 bits: */
1170          return !(immed & ~0xff);
1171       }
1172    }
1173 
1174    /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
1175    return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
1176 }
1177