/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

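/* Shared consts occupy a fixed range of const registers; a const src falls
 * in that range iff its regid lies within the vec4 slots
 * [shared_consts_base_offset, shared_consts_base_offset + shared_consts_size):
 */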
static bool
is_shared_consts(struct ir3_compiler *compiler,
                 struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->shared_consts_enable && (reg->flags & IR3_REG_CONST)) {
      uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
      uint32_t max_const_reg =
         regid(compiler->shared_consts_base_offset +
               compiler->shared_consts_size, 0);
      return reg->num >= min_const_reg && reg->num < max_const_reg;
   }

   return false;
}

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   /* Shared consts don't need to be included in constlen. */
   if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
      return;

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

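   /* Note: regids here are component-granular, i.e. regid(num, comp) ==
    * (num << 2) | comp, and an (rptN) instruction whose operand has the (r)
    * flag advances the register each repeat iteration, which is why repeat
    * factors into the highest component touched:
    */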
   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

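   /* The ">> 2" below converts a component-granular regid back to a vec4
    * register index; with merged registers two half vec4s alias one full
    * vec4, hence the ">> 3" for half regs:
    */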
   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize, respect that. */
   if (v->real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
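      /* As a worked example with made-up numbers: branchstack_size = 64,
       * v->branchstack = 8 and wave_granularity = 2 would cap this at
       * 64 / 8 * 2 = 16 waves:
       */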
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);
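      /* e.g. (hypothetical numbers) a 1024-thread workgroup with
       * threadsize_base = 64, double_threadsize and wave_granularity = 2
       * gives waves_per_wg = DIV_ROUND_UP(1024, 64 * 2 * 2) = 4.
       */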

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* If we have a compute shader with a big workgroup, a barrier, and a
       * branchstack which limits max_waves, we may end up unable to run all
       * waves of the workgroup concurrently, which would lead to a hang.
       *
       * TODO: Could we spill branchstack, or is there another way around
       * this? The blob just crashes in this case.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s) which has workgroup barrier cannot be used "
            "because it's impossible to have enough concurrent waves.",
            v->name);
         exit(1);
      }
   }

   return max_waves;
}

/* Get the maximum number of waves that could be launched, limited by register
 * file size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
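   /* With hypothetical numbers: reg_size_vec4 = 96, reg_count = 8 and
    * double_threadsize means each wave uses 16 vec4 slots, allowing
    * 96 / 16 = 6 waves, times wave_granularity:
    */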
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
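   /* Each ISA instruction is 64 bits (8 bytes) wide, hence the "* 8": */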
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   bool in_preamble = false;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;

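      /* Rough stall model used below: an SFU or memory producer charges its
       * soft delay, each subsequent instruction retires some of it, and
       * whatever remains when the matching (ss)/(sy) bit is hit is counted
       * as stall cycles (sstall/systall).
       */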
      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if (instr->opc == OPC_STP || instr->opc == OPC_LDP) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block,
                                      struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (opc_cat(opc) >= 1)
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
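   /* The dsts/srcs pointer arrays are carved out of the same allocation,
    * laid out directly after the instruction itself:
    */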
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

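   /* The struct copy below would clobber the freshly allocated dsts/srcs
    * pointer arrays, so stash and restore them:
    */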
   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to an instruction, to ensure the dependency is
 * scheduled before it:
 */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we could assume instructions
    * are always sorted, which currently might not always be true.
    * (In particular after the ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types.  Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to its FLUT (float lookup table) value;
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

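   /* e.g. a full-precision immed of 0x3f800000 (1.0f) maps to FLUT index 2
    * in the table above:
    */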
   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

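/* Check whether the given flags (const/immed/relative/shared/abs-neg)
 * would be legal for src n of this instruction; used e.g. by copy
 * propagation to decide whether a src can be folded:
 */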
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If the destination is indirect, then the source cannot be... at least
    * I don't think so...
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * the same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         return flags == 0;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like the blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have an immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and there's
          * no good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for the SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate, and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
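   /* (i.e. the immediate must lie within [-511, 511]) */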
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}