1 /*
2  * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include <stdarg.h>
28 
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 #include "util/u_string.h"
32 
33 #include "ir3_compiler.h"
34 #include "ir3_image.h"
35 #include "ir3_nir.h"
36 #include "ir3_shader.h"
37 
38 #include "instr-a3xx.h"
39 #include "ir3.h"
40 #include "ir3_context.h"
41 
42 void
43 ir3_handle_nonuniform(struct ir3_instruction *instr,
44                       nir_intrinsic_instr *intrin)
45 {
46    if (nir_intrinsic_has_access(intrin) &&
47        (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
48       instr->flags |= IR3_INSTR_NONUNIF;
49    }
50 }
51 
52 void
53 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
54 {
55    nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
56    if (!intrin)
57       return;
58 
59    instr->flags |= IR3_INSTR_B;
60    instr->cat6.base = nir_intrinsic_desc_set(intrin);
61 }
62 
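/* Create a meta:input instruction whose destination covers the components
 * in compmask, and add it to the shader's list of inputs.
 */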
63 static struct ir3_instruction *
64 create_input(struct ir3_context *ctx, unsigned compmask)
65 {
66    struct ir3_instruction *in;
67 
68    in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
69    in->input.sysval = ~0;
70    __ssa_dst(in)->wrmask = compmask;
71 
72    array_insert(ctx->ir, ctx->ir->inputs, in);
73 
74    return in;
75 }
76 
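/* Create the instruction that fetches/interpolates a fragment shader input
 * at packed inloc n: bary.f when interpolation coords are provided, flat.b
 * or ldlv for flat-shaded inputs when the compiler uses flat_bypass, and
 * otherwise bary.f with the per-pixel perspective ij.
 */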
77 static struct ir3_instruction *
78 create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
79                   unsigned n)
80 {
81    struct ir3_block *block = ctx->block;
82    struct ir3_instruction *instr;
83    /* packed inloc is fixed up later: */
84    struct ir3_instruction *inloc = create_immed(block, n);
85 
86    if (coord) {
87       instr = ir3_BARY_F(block, inloc, 0, coord, 0);
88    } else if (ctx->compiler->flat_bypass) {
89       if (ctx->compiler->gen >= 6) {
90          instr = ir3_FLAT_B(block, inloc, 0, inloc, 0);
91       } else {
92          instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
93          instr->cat6.type = TYPE_U32;
94          instr->cat6.iim_val = 1;
95       }
96    } else {
97       instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
98       instr->srcs[1]->wrmask = 0x3;
99    }
100 
101    return instr;
102 }
103 
104 static struct ir3_instruction *
105 create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
106 {
107    /* first four vec4 sysval's reserved for UBOs: */
108    /* NOTE: dp is in scalar, but there can be >4 dp components: */
109    struct ir3_const_state *const_state = ir3_const_state(ctx->so);
110    unsigned n = const_state->offsets.driver_param;
111    unsigned r = regid(n + dp / 4, dp % 4);
112    return create_uniform(ctx->block, r);
113 }
114 
115 static struct ir3_instruction *
116 create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
117                              struct ir3_instruction *address)
118 {
119    /* first four vec4 sysval's reserved for UBOs: */
120    /* NOTE: dp is in scalar, but there can be >4 dp components: */
121    struct ir3_const_state *const_state = ir3_const_state(ctx->so);
122    unsigned n = const_state->offsets.driver_param;
123    return create_uniform_indirect(ctx->block, n * 4 + dp, TYPE_U32, address);
124 }
125 
126 /*
127  * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
128  * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
129  * trust that we will only see and/or/xor on those 1-bit values, so we can
130  * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
131  * 0.
132  */
133 
134 /*
135  * alu/sfu instructions:
136  */
137 
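/* Emit a cov (conversion/mov) instruction for a NIR conversion op, mapping
 * the op and source bit size onto ir3 src/dst types.  If no actual type
 * change is needed, the source is returned unmodified.
 */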
138 static struct ir3_instruction *
139 create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
140            unsigned src_bitsize, nir_op op)
141 {
142    type_t src_type, dst_type;
143 
144    switch (op) {
145    case nir_op_f2f32:
146    case nir_op_f2f16_rtne:
147    case nir_op_f2f16_rtz:
148    case nir_op_f2f16:
149    case nir_op_f2i32:
150    case nir_op_f2i16:
151    case nir_op_f2i8:
152    case nir_op_f2u32:
153    case nir_op_f2u16:
154    case nir_op_f2u8:
155       switch (src_bitsize) {
156       case 32:
157          src_type = TYPE_F32;
158          break;
159       case 16:
160          src_type = TYPE_F16;
161          break;
162       default:
163          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
164       }
165       break;
166 
167    case nir_op_i2f32:
168    case nir_op_i2f16:
169    case nir_op_i2i32:
170    case nir_op_i2i16:
171    case nir_op_i2i8:
172       switch (src_bitsize) {
173       case 32:
174          src_type = TYPE_S32;
175          break;
176       case 16:
177          src_type = TYPE_S16;
178          break;
179       case 8:
180          src_type = TYPE_S8;
181          break;
182       default:
183          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
184       }
185       break;
186 
187    case nir_op_u2f32:
188    case nir_op_u2f16:
189    case nir_op_u2u32:
190    case nir_op_u2u16:
191    case nir_op_u2u8:
192       switch (src_bitsize) {
193       case 32:
194          src_type = TYPE_U32;
195          break;
196       case 16:
197          src_type = TYPE_U16;
198          break;
199       case 8:
200          src_type = TYPE_U8;
201          break;
202       default:
203          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
204       }
205       break;
206 
207    case nir_op_b2f16:
208    case nir_op_b2f32:
209    case nir_op_b2i8:
210    case nir_op_b2i16:
211    case nir_op_b2i32:
212       src_type = ctx->compiler->bool_type;
213       break;
214 
215    default:
216       ir3_context_error(ctx, "invalid conversion op: %u", op);
217    }
218 
219    switch (op) {
220    case nir_op_f2f32:
221    case nir_op_i2f32:
222    case nir_op_u2f32:
223    case nir_op_b2f32:
224       dst_type = TYPE_F32;
225       break;
226 
227    case nir_op_f2f16_rtne:
228    case nir_op_f2f16_rtz:
229    case nir_op_f2f16:
230    case nir_op_i2f16:
231    case nir_op_u2f16:
232    case nir_op_b2f16:
233       dst_type = TYPE_F16;
234       break;
235 
236    case nir_op_f2i32:
237    case nir_op_i2i32:
238    case nir_op_b2i32:
239       dst_type = TYPE_S32;
240       break;
241 
242    case nir_op_f2i16:
243    case nir_op_i2i16:
244    case nir_op_b2i16:
245       dst_type = TYPE_S16;
246       break;
247 
248    case nir_op_f2i8:
249    case nir_op_i2i8:
250    case nir_op_b2i8:
251       dst_type = TYPE_S8;
252       break;
253 
254    case nir_op_f2u32:
255    case nir_op_u2u32:
256       dst_type = TYPE_U32;
257       break;
258 
259    case nir_op_f2u16:
260    case nir_op_u2u16:
261       dst_type = TYPE_U16;
262       break;
263 
264    case nir_op_f2u8:
265    case nir_op_u2u8:
266       dst_type = TYPE_U8;
267       break;
268 
269    default:
270       ir3_context_error(ctx, "invalid conversion op: %u", op);
271    }
272 
273    if (src_type == dst_type)
274       return src;
275 
276    struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
277 
278    if (op == nir_op_f2f16_rtne) {
279       cov->cat1.round = ROUND_EVEN;
280    } else if (op == nir_op_f2f16) {
281       unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
282       nir_rounding_mode rounding_mode =
283          nir_get_rounding_mode_from_float_controls(execution_mode,
284                                                    nir_type_float16);
285       if (rounding_mode == nir_rounding_mode_rtne)
286          cov->cat1.round = ROUND_EVEN;
287    }
288 
289    return cov;
290 }
291 
292 /* For shift instructions NIR always has shift amount as 32 bit integer */
293 static struct ir3_instruction *
294 resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
295                     unsigned bs)
296 {
297    if (bs == 16)
298       return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
299    else if (bs == 8)
300       return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U8);
301    else
302       return src;
303 }
304 
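/* Lower the NIR [su]dot_4x8 ops onto the hardware dp4acc instruction, which
 * does a packed 4x8 dot product plus accumulate in a single cat3 op.
 */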
305 static void
306 emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
307                            struct ir3_instruction **dst,
308                            struct ir3_instruction **src)
309 {
310    struct ir3_instruction *accumulator = NULL;
311    if (alu->op == nir_op_udot_4x8_uadd_sat) {
312       accumulator = create_immed(ctx->block, 0);
313    } else {
314       accumulator = src[2];
315    }
316 
317    dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
318 
319    if (alu->op == nir_op_udot_4x8_uadd ||
320        alu->op == nir_op_udot_4x8_uadd_sat) {
321       dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
322    } else {
323       dst[0]->cat3.signedness = IR3_SRC_MIXED;
324    }
325 
326    /* For some reason (sat) doesn't work in the unsigned case, so
327     * we have to emulate it.
328     */
329    if (alu->op == nir_op_udot_4x8_uadd_sat) {
330       dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
331       dst[0]->flags |= IR3_INSTR_SAT;
332    } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
333       dst[0]->flags |= IR3_INSTR_SAT;
334    }
335 }
336 
337 static void
338 emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
339                            struct ir3_instruction **dst,
340                            struct ir3_instruction **src)
341 {
342    int signedness;
343    if (alu->op == nir_op_udot_4x8_uadd ||
344        alu->op == nir_op_udot_4x8_uadd_sat) {
345       signedness = IR3_SRC_UNSIGNED;
346    } else {
347       signedness = IR3_SRC_MIXED;
348    }
349 
350    struct ir3_instruction *accumulator = NULL;
351    if (alu->op == nir_op_udot_4x8_uadd_sat ||
352        alu->op == nir_op_sudot_4x8_iadd_sat) {
353       accumulator = create_immed(ctx->block, 0);
354    } else {
355       accumulator = src[2];
356    }
357 
358    dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
359    dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
360    dst[0]->cat3.signedness = signedness;
361 
362    dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
363    dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
364    dst[0]->cat3.signedness = signedness;
365 
366    if (alu->op == nir_op_udot_4x8_uadd_sat) {
367       dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
368       dst[0]->flags |= IR3_INSTR_SAT;
369    } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
370       dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
371       dst[0]->flags |= IR3_INSTR_SAT;
372    }
373 }
374 
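/* Emit ir3 instruction(s) for a NIR ALU instruction.  vecN and mov ops are
 * handled per-component; everything else is expected to already be
 * scalarized (a single destination component).
 */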
375 static void
376 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
377 {
378    const nir_op_info *info = &nir_op_infos[alu->op];
379    struct ir3_instruction **dst, *src[info->num_inputs];
380    unsigned bs[info->num_inputs]; /* bit size */
381    struct ir3_block *b = ctx->block;
382    unsigned dst_sz, wrmask;
383    type_t dst_type = type_uint_size(alu->def.bit_size);
384 
385    dst_sz = alu->def.num_components;
386    wrmask = (1 << dst_sz) - 1;
387 
388    dst = ir3_get_def(ctx, &alu->def, dst_sz);
389 
390    /* Vectors are special in that they have non-scalarized writemasks,
391     * and just take the first swizzle channel for each argument in
392     * order into each writemask channel.
393     */
394    if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
395        (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
396        (alu->op == nir_op_vec16)) {
397 
398       for (int i = 0; i < info->num_inputs; i++) {
399          nir_alu_src *asrc = &alu->src[i];
400 
401          src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
402          if (!src[i])
403             src[i] = create_immed_typed(ctx->block, 0, dst_type);
404          dst[i] = ir3_MOV(b, src[i], dst_type);
405       }
406 
407       ir3_put_def(ctx, &alu->def);
408       return;
409    }
410 
411    /* We also get mov's with more than one component, so handle
412     * those specially:
413     */
414    if (alu->op == nir_op_mov) {
415       nir_alu_src *asrc = &alu->src[0];
416       struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
417 
418       for (unsigned i = 0; i < dst_sz; i++) {
419          if (wrmask & (1 << i)) {
420             dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
421          } else {
422             dst[i] = NULL;
423          }
424       }
425 
426       ir3_put_def(ctx, &alu->def);
427       return;
428    }
429 
430    /* General case: We can just grab the one used channel per src. */
431    assert(alu->def.num_components == 1);
432 
433    for (int i = 0; i < info->num_inputs; i++) {
434       nir_alu_src *asrc = &alu->src[i];
435 
436       src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
437       bs[i] = nir_src_bit_size(asrc->src);
438 
439       compile_assert(ctx, src[i]);
440    }
441 
442    switch (alu->op) {
443    case nir_op_f2f32:
444    case nir_op_f2f16_rtne:
445    case nir_op_f2f16_rtz:
446    case nir_op_f2f16:
447    case nir_op_f2i32:
448    case nir_op_f2i16:
449    case nir_op_f2i8:
450    case nir_op_f2u32:
451    case nir_op_f2u16:
452    case nir_op_f2u8:
453    case nir_op_i2f32:
454    case nir_op_i2f16:
455    case nir_op_i2i32:
456    case nir_op_i2i16:
457    case nir_op_i2i8:
458    case nir_op_u2f32:
459    case nir_op_u2f16:
460    case nir_op_u2u32:
461    case nir_op_u2u16:
462    case nir_op_u2u8:
463    case nir_op_b2f16:
464    case nir_op_b2f32:
465    case nir_op_b2i8:
466    case nir_op_b2i16:
467    case nir_op_b2i32:
468       dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
469       break;
470 
471    case nir_op_fquantize2f16:
472       dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
473                           16, nir_op_f2f32);
474       break;
475 
476    case nir_op_b2b1:
477       /* b2b1 will appear when translating from
478        *
479        * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
480        * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
481        *
482        * A negate can turn those into a 1 or 0 for us.
483        */
484       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
485       break;
486 
487    case nir_op_b2b32:
488       /* b2b32 will appear when converting our 1-bit bools to a store_shared
489        * argument.
490        *
491        * A negate can turn those into a ~0 for us.
492        */
493       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
494       break;
495 
496    case nir_op_fneg:
497       dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
498       break;
499    case nir_op_fabs:
500       dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
501       break;
502    case nir_op_fmax:
503       dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
504       break;
505    case nir_op_fmin:
506       dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
507       break;
508    case nir_op_fsat:
509       /* if there is just a single use of the src, and it supports
510        * (sat) bit, we can just fold the (sat) flag back to the
511        * src instruction and create a mov.  This is easier for cp
512        * to eliminate.
513        */
514       if (is_sat_compatible(src[0]->opc) &&
515           (list_length(&alu->src[0].src.ssa->uses) == 1)) {
516          src[0]->flags |= IR3_INSTR_SAT;
517          dst[0] = ir3_MOV(b, src[0], dst_type);
518       } else {
519          /* otherwise generate a max.f that saturates.. blob does
520           * similar (generating a cat2 mov using max.f)
521           */
522          dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
523          dst[0]->flags |= IR3_INSTR_SAT;
524       }
525       break;
526    case nir_op_fmul:
527       dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
528       break;
529    case nir_op_fadd:
530       dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
531       break;
532    case nir_op_fsub:
533       dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
534       break;
535    case nir_op_ffma:
536       dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
537       break;
538    case nir_op_fddx:
539    case nir_op_fddx_coarse:
540       dst[0] = ir3_DSX(b, src[0], 0);
541       dst[0]->cat5.type = TYPE_F32;
542       break;
543    case nir_op_fddx_fine:
544       dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
545       dst[0]->cat5.type = TYPE_F32;
546       break;
547    case nir_op_fddy:
548    case nir_op_fddy_coarse:
549       dst[0] = ir3_DSY(b, src[0], 0);
550       dst[0]->cat5.type = TYPE_F32;
551       break;
553    case nir_op_fddy_fine:
554       dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
555       dst[0]->cat5.type = TYPE_F32;
556       break;
557    case nir_op_flt:
558       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
559       dst[0]->cat2.condition = IR3_COND_LT;
560       break;
561    case nir_op_fge:
562       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
563       dst[0]->cat2.condition = IR3_COND_GE;
564       break;
565    case nir_op_feq:
566       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
567       dst[0]->cat2.condition = IR3_COND_EQ;
568       break;
569    case nir_op_fneu:
570       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
571       dst[0]->cat2.condition = IR3_COND_NE;
572       break;
573    case nir_op_fceil:
574       dst[0] = ir3_CEIL_F(b, src[0], 0);
575       break;
576    case nir_op_ffloor:
577       dst[0] = ir3_FLOOR_F(b, src[0], 0);
578       break;
579    case nir_op_ftrunc:
580       dst[0] = ir3_TRUNC_F(b, src[0], 0);
581       break;
582    case nir_op_fround_even:
583       dst[0] = ir3_RNDNE_F(b, src[0], 0);
584       break;
585    case nir_op_fsign:
586       dst[0] = ir3_SIGN_F(b, src[0], 0);
587       break;
588 
589    case nir_op_fsin:
590       dst[0] = ir3_SIN(b, src[0], 0);
591       break;
592    case nir_op_fcos:
593       dst[0] = ir3_COS(b, src[0], 0);
594       break;
595    case nir_op_frsq:
596       dst[0] = ir3_RSQ(b, src[0], 0);
597       break;
598    case nir_op_frcp:
599       dst[0] = ir3_RCP(b, src[0], 0);
600       break;
601    case nir_op_flog2:
602       dst[0] = ir3_LOG2(b, src[0], 0);
603       break;
604    case nir_op_fexp2:
605       dst[0] = ir3_EXP2(b, src[0], 0);
606       break;
607    case nir_op_fsqrt:
608       dst[0] = ir3_SQRT(b, src[0], 0);
609       break;
610 
611    case nir_op_iabs:
612       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
613       break;
614    case nir_op_iadd:
615       dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
616       break;
617    case nir_op_ihadd:
618       dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
619       dst[0]->dsts[0]->flags |= IR3_REG_EI;
620       break;
621    case nir_op_uhadd:
622       dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
623       dst[0]->dsts[0]->flags |= IR3_REG_EI;
624       break;
625    case nir_op_iand:
626       dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
627       break;
628    case nir_op_imax:
629       dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
630       break;
631    case nir_op_umax:
632       dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
633       break;
634    case nir_op_imin:
635       dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
636       break;
637    case nir_op_umin:
638       dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
639       break;
640    case nir_op_umul_low:
641       dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
642       break;
643    case nir_op_imadsh_mix16:
644       dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
645       break;
646    case nir_op_imad24_ir3:
647       dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
648       break;
649    case nir_op_imul:
650       compile_assert(ctx, alu->def.bit_size == 16);
651       dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
652       break;
653    case nir_op_imul24:
654       dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
655       break;
656    case nir_op_ineg:
657       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
658       break;
659    case nir_op_inot:
660       if (bs[0] == 1) {
661          struct ir3_instruction *one =
662                create_immed_typed(ctx->block, 1, ctx->compiler->bool_type);
663          dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
664       } else {
665          dst[0] = ir3_NOT_B(b, src[0], 0);
666       }
667       break;
668    case nir_op_ior:
669       dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
670       break;
671    case nir_op_ishl:
672       dst[0] =
673          ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
674       break;
675    case nir_op_ishr:
676       dst[0] =
677          ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
678       break;
679    case nir_op_isub:
680       dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
681       break;
682    case nir_op_ixor:
683       dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
684       break;
685    case nir_op_ushr:
686       dst[0] =
687          ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
688       break;
689    case nir_op_ilt:
690       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
691       dst[0]->cat2.condition = IR3_COND_LT;
692       break;
693    case nir_op_ige:
694       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
695       dst[0]->cat2.condition = IR3_COND_GE;
696       break;
697    case nir_op_ieq:
698       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
699       dst[0]->cat2.condition = IR3_COND_EQ;
700       break;
701    case nir_op_ine:
702       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
703       dst[0]->cat2.condition = IR3_COND_NE;
704       break;
705    case nir_op_ult:
706       dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
707       dst[0]->cat2.condition = IR3_COND_LT;
708       break;
709    case nir_op_uge:
710       dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
711       dst[0]->cat2.condition = IR3_COND_GE;
712       break;
713 
714    case nir_op_bcsel: {
715       struct ir3_instruction *cond = src[0];
716 
717       /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
718        * we can ignore that and use original cond, since the nonzero-ness of
719        * cond stays the same.
720        */
721       if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
722           (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
723              IR3_REG_SNEG) {
724          cond = cond->srcs[0]->def->instr;
725       }
726 
727       compile_assert(ctx, bs[1] == bs[2]);
728 
729       /* The condition's size has to match the other two arguments' size, so
730        * convert down if necessary.
731        *
732        * Single hashtable is fine, because the conversion will either be
733        * 16->32 or 32->16, but never both
734        */
735       if (is_half(src[1]) != is_half(cond)) {
736          struct hash_entry *prev_entry =
737             _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
738          if (prev_entry) {
739             cond = prev_entry->data;
740          } else {
741             if (is_half(cond)) {
742                cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
743             } else {
744                cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
745             }
746             _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
747          }
748       }
749 
750       if (is_half(src[1])) {
751          dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
752       } else {
753          dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
754       }
755 
756       break;
757    }
758    case nir_op_bit_count: {
759       if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
760          dst[0] = ir3_CBITS_B(b, src[0], 0);
761          break;
762       }
763 
764       // We need to do this 16b at a time on a5xx+a6xx.  Once half-precision
765       // support is in place, this should probably move to a NIR lowering pass:
766       struct ir3_instruction *hi, *lo;
767 
768       hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
769                    TYPE_U16);
770       lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
771 
772       hi = ir3_CBITS_B(b, hi, 0);
773       lo = ir3_CBITS_B(b, lo, 0);
774 
775       // TODO maybe the builders should default to making dst half-precision
776       // if the src's were half precision, to make this less awkward.. otoh
777       // we should probably just do this lowering in NIR.
778       hi->dsts[0]->flags |= IR3_REG_HALF;
779       lo->dsts[0]->flags |= IR3_REG_HALF;
780 
781       dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
782       dst[0]->dsts[0]->flags |= IR3_REG_HALF;
783       dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
784       break;
785    }
786    case nir_op_ifind_msb: {
787       struct ir3_instruction *cmp;
788       dst[0] = ir3_CLZ_S(b, src[0], 0);
789       cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
790       cmp->cat2.condition = IR3_COND_GE;
791       dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
792                            0, cmp, 0, dst[0], 0);
793       break;
794    }
795    case nir_op_ufind_msb:
796       dst[0] = ir3_CLZ_B(b, src[0], 0);
797       dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
798                            0, src[0], 0, dst[0], 0);
799       break;
800    case nir_op_find_lsb:
801       dst[0] = ir3_BFREV_B(b, src[0], 0);
802       dst[0] = ir3_CLZ_B(b, dst[0], 0);
803       break;
804    case nir_op_bitfield_reverse:
805       dst[0] = ir3_BFREV_B(b, src[0], 0);
806       break;
807 
808    case nir_op_uadd_sat:
809       dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
810       dst[0]->flags |= IR3_INSTR_SAT;
811       break;
812    case nir_op_iadd_sat:
813       dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
814       dst[0]->flags |= IR3_INSTR_SAT;
815       break;
816    case nir_op_usub_sat:
817       dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
818       dst[0]->flags |= IR3_INSTR_SAT;
819       break;
820    case nir_op_isub_sat:
821       dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
822       dst[0]->flags |= IR3_INSTR_SAT;
823       break;
824 
825    case nir_op_udot_4x8_uadd:
826    case nir_op_udot_4x8_uadd_sat:
827    case nir_op_sudot_4x8_iadd:
828    case nir_op_sudot_4x8_iadd_sat: {
829       if (ctx->compiler->has_dp4acc) {
830          emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
831       } else if (ctx->compiler->has_dp2acc) {
832          emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
833       } else {
834          ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
835                            nir_op_infos[alu->op].name);
836       }
837 
838       break;
839    }
840 
841    default:
842       ir3_context_error(ctx, "Unhandled ALU op: %s\n",
843                         nir_op_infos[alu->op].name);
844       break;
845    }
846 
847    if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
848       assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
849       assert(dst_sz == 1);
850    } else {
851       /* 1-bit values stored in 32-bit registers are only valid for certain
852        * ALU ops.
853        */
854       switch (alu->op) {
855       case nir_op_iand:
856       case nir_op_ior:
857       case nir_op_ixor:
858       case nir_op_inot:
859       case nir_op_bcsel:
860          break;
861       default:
862          compile_assert(ctx, alu->def.bit_size != 1);
863       }
864    }
865 
866    ir3_put_def(ctx, &alu->def);
867 }
868 
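/* UBO load via the ldc instruction, which takes the (possibly bindless)
 * UBO index and offset directly instead of going through a global address
 * like the ldg-based path further below.
 */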
869 static void
870 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
871                             struct ir3_instruction **dst)
872 {
873    struct ir3_block *b = ctx->block;
874 
875    /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
876     * 0.
877     */
878    assert(nir_intrinsic_base(intr) == 0);
879 
880    unsigned ncomp = intr->num_components;
881    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
882    struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
883    struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
884    ldc->dsts[0]->wrmask = MASK(ncomp);
885    ldc->cat6.iim_val = ncomp;
886    ldc->cat6.d = nir_intrinsic_component(intr);
887    ldc->cat6.type = utype_def(&intr->def);
888 
889    ir3_handle_bindless_cat6(ldc, intr->src[0]);
890    if (ldc->flags & IR3_INSTR_B)
891       ctx->so->bindless_ubo = true;
892    ir3_handle_nonuniform(ldc, intr);
893 
894    ir3_split_dest(b, dst, ldc, 0, ncomp);
895 }
896 
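/* Copy a range of a UBO into the const file with ldc.k, using a1 to hold
 * the destination offset within the const file.
 */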
897 static void
898 emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
899                                    nir_intrinsic_instr *intr)
900 {
901    struct ir3_block *b = ctx->block;
902 
903    unsigned base = nir_intrinsic_base(intr);
904    unsigned size = nir_intrinsic_range(intr);
905 
906    struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);
907 
908    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
909    struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
910    struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
911    ldc->cat6.iim_val = size;
912    ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;
913 
914    ir3_handle_bindless_cat6(ldc, intr->src[0]);
915    if (ldc->flags & IR3_INSTR_B)
916       ctx->so->bindless_ubo = true;
917 
918    ir3_instr_set_address(ldc, addr1);
919 
920    array_insert(b, b->keeps, ldc);
921 }
922 
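/* Copy a range of global memory into the const file with ldg.k.  The
 * destination const offset is split into a low part encoded in the
 * instruction and a high part loaded into a1.
 */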
923 static void
924 emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
925                                       nir_intrinsic_instr *intr)
926 {
927    struct ir3_block *b = ctx->block;
928 
929    unsigned size = nir_intrinsic_range(intr);
930    unsigned dst = nir_intrinsic_range_base(intr);
931    unsigned addr_offset = nir_intrinsic_base(intr);
932    unsigned dst_lo = dst & 0xff;
933    unsigned dst_hi = dst >> 8;
934 
935    struct ir3_instruction *a1 = NULL;
936    if (dst_hi)
937       a1 = ir3_get_addr1(ctx, dst_hi << 8);
938 
939    struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
940    struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
941    struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
942    struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
943                                            create_immed(b, addr_offset), 0,
944                                            create_immed(b, size), 0);
945    ldg->barrier_class = ldg->barrier_conflict = IR3_BARRIER_CONST_W;
946    ldg->cat6.type = TYPE_U32;
947 
948    if (a1) {
949       ir3_instr_set_address(ldg, a1);
950       ldg->flags |= IR3_INSTR_A1EN;
951    }
952 
953    array_insert(b, b->keeps, ldg);
954 }
955 
956 
957 /* handles direct/indirect UBO reads: */
958 static void
959 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
960                         struct ir3_instruction **dst)
961 {
962    struct ir3_block *b = ctx->block;
963    struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
964    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
965    unsigned ubo = regid(const_state->offsets.ubo, 0);
966    const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
967 
968    int off = 0;
969 
970    /* First src is ubo index, which could either be an immed or not: */
971    src0 = ir3_get_src(ctx, &intr->src[0])[0];
972    if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
973       base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
974       base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
975    } else {
976       base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
977                                         ir3_get_addr0(ctx, src0, ptrsz));
978       base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
979                                         ir3_get_addr0(ctx, src0, ptrsz));
980 
981       /* NOTE: since relative addressing is used, make sure constlen is
982        * at least big enough to cover all the UBO addresses, since the
983        * assembler won't know what the max address reg is.
984        */
985       ctx->so->constlen =
986          MAX2(ctx->so->constlen,
987               const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
988    }
989 
990    /* note: on 32bit gpu's base_hi is ignored and DCE'd */
991    addr = base_lo;
992 
993    if (nir_src_is_const(intr->src[1])) {
994       off += nir_src_as_uint(intr->src[1]);
995    } else {
996       /* For load_ubo_indirect, second src is indirect offset: */
997       src1 = ir3_get_src(ctx, &intr->src[1])[0];
998 
999       /* and add offset to addr: */
1000       addr = ir3_ADD_S(b, addr, 0, src1, 0);
1001    }
1002 
1003    /* if the offset is too large to encode in the ldg, split it out: */
1004    if ((off + (intr->num_components * 4)) > 1024) {
1005       /* split out the minimal amount to improve the odds that
1006        * cp can fit the immediate in the add.s instruction:
1007        */
1008       unsigned off2 = off + (intr->num_components * 4) - 1024;
1009       addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1010       off -= off2;
1011    }
1012 
1013    if (ptrsz == 2) {
1014       struct ir3_instruction *carry;
1015 
1016       /* handle 32b rollover, ie:
1017        *   if (addr < base_lo)
1018        *      base_hi++
1019        */
1020       carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
1021       carry->cat2.condition = IR3_COND_LT;
1022       base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
1023 
1024       addr = ir3_collect(b, addr, base_hi);
1025    }
1026 
1027    for (int i = 0; i < intr->num_components; i++) {
1028       struct ir3_instruction *load =
1029          ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
1030                  create_immed(b, 1), 0); /* num components */
1031       load->cat6.type = TYPE_U32;
1032       dst[i] = load;
1033    }
1034 }
1035 
1036 /* Load a kernel param: src[] = { address }. */
1037 static void
1038 emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
1039                                  nir_intrinsic_instr *intr,
1040                                  struct ir3_instruction **dst)
1041 {
1042    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1043    struct ir3_block *b = ctx->block;
1044    unsigned offset = nir_intrinsic_base(intr);
1045    unsigned p = regid(const_state->offsets.kernel_params, 0);
1046 
1047    struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
1048 
1049    if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1050       offset += src0->srcs[0]->iim_val;
1051 
1052       /* kernel param position is in bytes, but constant space is 32b registers: */
1053       compile_assert(ctx, !(offset & 0x3));
1054 
1055       dst[0] = create_uniform(b, p + (offset / 4));
1056    } else {
1057       /* kernel param position is in bytes, but constant space is 32b registers: */
1058       compile_assert(ctx, !(offset & 0x3));
1059 
1060       /* TODO we should probably be lowering this in nir, and also handling
1061        * non-32b inputs.. Also we probably don't want to be using
1062        * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
1063        */
1064       src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);
1065 
1066       dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
1067                                        ir3_get_addr0(ctx, src0, 1));
1068    }
1069 }
1070 
1071 /* src[] = { block_index } */
1072 static void
1073 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1074                          struct ir3_instruction **dst)
1075 {
1076    struct ir3_block *b = ctx->block;
1077    struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
1078    struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
1079    resinfo->cat6.iim_val = 1;
1080    resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
1081    resinfo->cat6.type = TYPE_U32;
1082    resinfo->cat6.typed = false;
1083    /* resinfo has no writemask and always writes out 3 components */
1084    resinfo->dsts[0]->wrmask = MASK(3);
1085    ir3_handle_bindless_cat6(resinfo, intr->src[0]);
1086    ir3_handle_nonuniform(resinfo, intr);
1087 
1088    if (ctx->compiler->gen >= 6) {
1089       ir3_split_dest(b, dst, resinfo, 0, 1);
1090    } else {
1091       /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
1092       struct ir3_instruction *resinfo_dst[2];
1093       ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
1094       *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
1095    }
1096 }
1097 
1098 /* src[] = { offset }. const_index[] = { base } */
1099 static void
1100 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1101                            struct ir3_instruction **dst)
1102 {
1103    struct ir3_block *b = ctx->block;
1104    struct ir3_instruction *ldl, *offset;
1105    unsigned base;
1106 
1107    offset = ir3_get_src(ctx, &intr->src[0])[0];
1108    base = nir_intrinsic_base(intr);
1109 
1110    ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
1111                  create_immed(b, intr->num_components), 0);
1112 
1113    ldl->cat6.type = utype_def(&intr->def);
1114    ldl->dsts[0]->wrmask = MASK(intr->num_components);
1115 
1116    ldl->barrier_class = IR3_BARRIER_SHARED_R;
1117    ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
1118 
1119    ir3_split_dest(b, dst, ldl, 0, intr->num_components);
1120 }
1121 
1122 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
1123 static void
1124 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1125 {
1126    struct ir3_block *b = ctx->block;
1127    struct ir3_instruction *stl, *offset;
1128    struct ir3_instruction *const *value;
1129    unsigned base, wrmask, ncomp;
1130 
1131    value = ir3_get_src(ctx, &intr->src[0]);
1132    offset = ir3_get_src(ctx, &intr->src[1])[0];
1133 
1134    base = nir_intrinsic_base(intr);
1135    wrmask = nir_intrinsic_write_mask(intr);
1136    ncomp = ffs(~wrmask) - 1;
1137 
1138    assert(wrmask == BITFIELD_MASK(intr->num_components));
1139 
1140    stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1141                  create_immed(b, ncomp), 0);
1142    stl->cat6.dst_offset = base;
1143    stl->cat6.type = utype_src(intr->src[0]);
1144    stl->barrier_class = IR3_BARRIER_SHARED_W;
1145    stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1146 
1147    array_insert(b, b->keeps, stl);
1148 }
1149 
1150 /* src[] = { offset }. const_index[] = { base } */
1151 static void
1152 emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
1153                                nir_intrinsic_instr *intr,
1154                                struct ir3_instruction **dst)
1155 {
1156    struct ir3_block *b = ctx->block;
1157    struct ir3_instruction *load, *offset;
1158    unsigned base;
1159 
1160    offset = ir3_get_src(ctx, &intr->src[0])[0];
1161    base = nir_intrinsic_base(intr);
1162 
1163    load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
1164                    create_immed(b, intr->num_components), 0);
1165 
1166    /* for a650, use LDL for tess ctrl inputs: */
1167    if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
1168       load->opc = OPC_LDL;
1169 
1170    load->cat6.type = utype_def(&intr->def);
1171    load->dsts[0]->wrmask = MASK(intr->num_components);
1172 
1173    load->barrier_class = IR3_BARRIER_SHARED_R;
1174    load->barrier_conflict = IR3_BARRIER_SHARED_W;
1175 
1176    ir3_split_dest(b, dst, load, 0, intr->num_components);
1177 }
1178 
1179 /* src[] = { value, offset }. const_index[] = { base } */
1180 static void
1181 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
1182                                 nir_intrinsic_instr *intr)
1183 {
1184    struct ir3_block *b = ctx->block;
1185    struct ir3_instruction *store, *offset;
1186    struct ir3_instruction *const *value;
1187 
1188    value = ir3_get_src(ctx, &intr->src[0]);
1189    offset = ir3_get_src(ctx, &intr->src[1])[0];
1190 
1191    store = ir3_STLW(b, offset, 0,
1192                     ir3_create_collect(b, value, intr->num_components), 0,
1193                     create_immed(b, intr->num_components), 0);
1194 
1195    /* for a650, use STL for vertex outputs used by tess ctrl shader: */
1196    if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
1197        ctx->compiler->tess_use_shared)
1198       store->opc = OPC_STL;
1199 
1200    store->cat6.dst_offset = nir_intrinsic_base(intr);
1201    store->cat6.type = utype_src(intr->src[0]);
1202    store->barrier_class = IR3_BARRIER_SHARED_W;
1203    store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1204 
1205    array_insert(b, b->keeps, store);
1206 }
1207 
1208 /*
1209  * CS shared variable atomic intrinsics
1210  *
1211  * All of the shared variable atomic memory operations read a value from
1212  * memory, compute a new value using one of the operations below, write the
1213  * new value to memory, and return the original value read.
1214  *
1215  * All operations take 2 sources except CompSwap that takes 3. These
1216  * sources represent:
1217  *
1218  * 0: The offset into the shared variable storage region that the atomic
1219  *    operation will operate on.
1220  * 1: The data parameter to the atomic function (i.e. the value to add
1221  *    in, etc).
1222  * 2: For CompSwap only: the second data parameter.
1223  */
1224 static struct ir3_instruction *
1225 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1226 {
1227    struct ir3_block *b = ctx->block;
1228    struct ir3_instruction *atomic, *src0, *src1;
1229    type_t type = TYPE_U32;
1230 
1231    src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1232    src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1233 
1234    switch (nir_intrinsic_atomic_op(intr)) {
1235    case nir_atomic_op_iadd:
1236       atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1237       break;
1238    case nir_atomic_op_imin:
1239       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1240       type = TYPE_S32;
1241       break;
1242    case nir_atomic_op_umin:
1243       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1244       break;
1245    case nir_atomic_op_imax:
1246       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1247       type = TYPE_S32;
1248       break;
1249    case nir_atomic_op_umax:
1250       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1251       break;
1252    case nir_atomic_op_iand:
1253       atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1254       break;
1255    case nir_atomic_op_ior:
1256       atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1257       break;
1258    case nir_atomic_op_ixor:
1259       atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1260       break;
1261    case nir_atomic_op_xchg:
1262       atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1263       break;
1264    case nir_atomic_op_cmpxchg:
1265       /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1266       src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1267       atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1268       break;
1269    default:
1270       unreachable("boo");
1271    }
1272 
1273    atomic->cat6.iim_val = 1;
1274    atomic->cat6.d = 1;
1275    atomic->cat6.type = type;
1276    atomic->barrier_class = IR3_BARRIER_SHARED_W;
1277    atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1278 
1279    /* even if nothing consumes the result, we can't DCE the instruction: */
1280    array_insert(b, b->keeps, atomic);
1281 
1282    return atomic;
1283 }
1284 
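/* Split a scratch offset into a small signed immediate base (for the
 * stp/ldp dst_offset field) plus the remaining register offset.
 */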
1285 static void
1286 stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
1287                struct ir3_instruction **offset, int32_t *base)
1288 {
1289    struct ir3_block *b = ctx->block;
1290 
1291    if (nir_src_is_const(*src)) {
1292       unsigned src_offset = nir_src_as_uint(*src);
1293       /* The base offset field is only 13 bits, and it's signed. Try to make the
1294        * offset constant whenever the original offsets are similar, to avoid
1295        * creating too many constants in the final shader.
1296        */
1297       *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
1298       uint32_t offset_val = src_offset - *base;
1299       *offset = create_immed(b, offset_val);
1300    } else {
1301       /* TODO: match on nir_iadd with a constant that fits */
1302       *base = 0;
1303       *offset = ir3_get_src(ctx, src)[0];
1304    }
1305 }
1306 
1307 /* src[] = { offset }. */
1308 static void
1309 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1310                             struct ir3_instruction **dst)
1311 {
1312    struct ir3_block *b = ctx->block;
1313    struct ir3_instruction *ldp, *offset;
1314    int32_t base;
1315 
1316    stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
1317 
1318    ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
1319                  create_immed(b, intr->num_components), 0);
1320 
1321    ldp->cat6.type = utype_def(&intr->def);
1322    ldp->dsts[0]->wrmask = MASK(intr->num_components);
1323 
1324    ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1325    ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1326 
1327    ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1328 }
1329 
1330 /* src[] = { value, offset }. const_index[] = { write_mask } */
1331 static void
1332 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1333 {
1334    struct ir3_block *b = ctx->block;
1335    struct ir3_instruction *stp, *offset;
1336    struct ir3_instruction *const *value;
1337    unsigned wrmask, ncomp;
1338    int32_t base;
1339 
1340    value = ir3_get_src(ctx, &intr->src[0]);
1341 
1342    stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
1343 
1344    wrmask = nir_intrinsic_write_mask(intr);
1345    ncomp = ffs(~wrmask) - 1;
1346 
1347    assert(wrmask == BITFIELD_MASK(intr->num_components));
1348 
1349    stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1350                  create_immed(b, ncomp), 0);
1351    stp->cat6.dst_offset = base;
1352    stp->cat6.type = utype_src(intr->src[0]);
1353    stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1354    stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1355 
1356    array_insert(b, b->keeps, stp);
1357 }
1358 
1359 struct tex_src_info {
1360    /* For prefetch */
1361    unsigned tex_base, samp_base, tex_idx, samp_idx;
1362    /* For normal tex instructions */
1363    unsigned base, a1_val, flags;
1364    struct ir3_instruction *samp_tex;
1365 };
1366 
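/* Build the texture/sampler source and encoding info (tex_src_info) for an
 * image or SSBO access that goes through the texture pipe, handling both
 * the bindless and the non-bindless (image_mapping based) cases.
 */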
1367 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1368  * to handle with the image_mapping table..
1369  */
1370 static struct tex_src_info
1371 get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src, bool image)
1372 {
1373    struct ir3_block *b = ctx->block;
1374    struct tex_src_info info = {0};
1375    nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);
1376 
1377    if (bindless_tex) {
1378       /* Bindless case */
1379       ctx->so->bindless_tex = true;
1380       info.flags |= IR3_INSTR_B;
1381 
1382       /* Gather information required to determine which encoding to
1383        * choose as well as for prefetch.
1384        */
1385       info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1386       bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1387       if (tex_const)
1388          info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1389       info.samp_idx = 0;
1390 
1391       /* Choose encoding. */
1392       if (tex_const && info.tex_idx < 256) {
1393          if (info.tex_idx < 16) {
1394             /* Everything fits within the instruction */
1395             info.base = info.tex_base;
1396          } else {
1397             info.base = info.tex_base;
1398             if (ctx->compiler->gen <= 6) {
1399                info.a1_val = info.tex_idx << 3;
1400             } else {
1401                info.a1_val = info.samp_idx << 3;
1402             }
1403             info.flags |= IR3_INSTR_A1EN;
1404          }
1405          info.samp_tex = NULL;
1406       } else {
1407          info.flags |= IR3_INSTR_S2EN;
1408          info.base = info.tex_base;
1409 
1410          /* Note: the indirect source is now a vec2 instead of hvec2 */
1411          struct ir3_instruction *texture, *sampler;
1412 
1413          texture = ir3_get_src(ctx, src)[0];
1414          sampler = create_immed(b, 0);
1415          info.samp_tex = ir3_collect(b, texture, sampler);
1416       }
1417    } else {
1418       info.flags |= IR3_INSTR_S2EN;
1419       unsigned slot = nir_src_as_uint(*src);
1420       unsigned tex_idx = image ?
1421             ir3_image_to_tex(&ctx->so->image_mapping, slot) :
1422             ir3_ssbo_to_tex(&ctx->so->image_mapping, slot);
1423       struct ir3_instruction *texture, *sampler;
1424 
1425       ctx->so->num_samp = MAX2(ctx->so->num_samp, tex_idx + 1);
1426 
1427       texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1428       sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1429 
1430       info.samp_tex = ir3_collect(b, sampler, texture);
1431    }
1432 
1433    return info;
1434 }
1435 
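/* Emit a sam instruction using the base/index/flags gathered in
 * tex_src_info, setting up a1 when the A1EN encoding is used and the
 * bindless base/samp/tex fields when the B flag is set.
 */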
1436 static struct ir3_instruction *
1437 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1438          type_t type, unsigned wrmask, struct ir3_instruction *src0,
1439          struct ir3_instruction *src1)
1440 {
1441    struct ir3_instruction *sam, *addr;
1442    if (info.flags & IR3_INSTR_A1EN) {
1443       addr = ir3_get_addr1(ctx, info.a1_val);
1444    }
1445    sam = ir3_SAM(ctx->block, opc, type, wrmask, info.flags, info.samp_tex, src0,
1446                  src1);
1447    if (info.flags & IR3_INSTR_A1EN) {
1448       ir3_instr_set_address(sam, addr);
1449    }
1450    if (info.flags & IR3_INSTR_B) {
1451       sam->cat5.tex_base = info.base;
1452       sam->cat5.samp = info.samp_idx;
1453       sam->cat5.tex  = info.tex_idx;
1454    }
1455    return sam;
1456 }
1457 
1458 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1459 static void
1460 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1461                           struct ir3_instruction **dst)
1462 {
1463    /* If the image can be written, must use LDIB to retrieve data, rather than
1464     * through ISAM (which uses the texture cache and won't get previous writes).
1465     */
1466    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
1467       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1468       return;
1469    }
1470 
1471    /* The sparse set of texture descriptors for non-coherent load_images means we can't do indirection, so
1472     * fall back to coherent load.
1473     */
1474    if (ctx->compiler->gen >= 5 &&
1475        !ir3_bindless_resource(intr->src[0]) &&
1476        !nir_src_is_const(intr->src[0])) {
1477       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1478       return;
1479    }
1480 
1481    struct ir3_block *b = ctx->block;
1482    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1483    struct ir3_instruction *sam;
1484    struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1485    struct ir3_instruction *coords[4];
1486    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1487    type_t type = ir3_get_type_for_image_intrinsic(intr);
1488 
1489    info.flags |= flags;
1490 
1491    /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
1492     * y coord. Note that the array index must come after the fake y coord.
1493     */
1494    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1495    if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
1496       coords[0] = src0[0];
1497       coords[1] = create_immed(b, 0);
1498       for (unsigned i = 1; i < ncoords; i++)
1499          coords[i + 1] = src0[i];
1500       ncoords++;
1501    } else {
1502       for (unsigned i = 0; i < ncoords; i++)
1503          coords[i] = src0[i];
1504    }
1505 
1506    sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1507                   ir3_create_collect(b, coords, ncoords), NULL);
1508 
1509    ir3_handle_nonuniform(sam, intr);
1510 
1511    sam->barrier_class = IR3_BARRIER_IMAGE_R;
1512    sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1513 
1514    ir3_split_dest(b, dst, sam, 0, 4);
1515 }
1516 
1517 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1518 void
1519 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1520                               nir_intrinsic_instr *intr,
1521                               struct ir3_instruction **dst)
1522 {
1523    struct ir3_block *b = ctx->block;
1524    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1525    struct ir3_instruction *sam, *lod;
1526    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1527    type_t dst_type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
1528 
1529    info.flags |= flags;
1530    assert(nir_src_as_uint(intr->src[1]) == 0);
1531    lod = create_immed(b, 0);
1532    sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1533 
1534    /* Array size actually ends up in .w rather than .z. This doesn't
1535     * matter for miplevel 0, but for higher mips the value in z is
1536     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1537     * returned, which means that we have to add 1 to it for arrays for
1538     * a3xx.
1539     *
1540     * Note: use a temporary dst and then copy, since the size of the dst
1541     * array that is passed in is based on nir's understanding of the
1542     * result size, not the hardware's.
1543     */
1544    struct ir3_instruction *tmp[4];
1545 
1546    ir3_split_dest(b, tmp, sam, 0, 4);
1547 
1548    for (unsigned i = 0; i < ncoords; i++)
1549       dst[i] = tmp[i];
1550 
1551    if (flags & IR3_INSTR_A) {
1552       if (ctx->compiler->levels_add_one) {
1553          dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1554       } else {
1555          dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1556       }
1557    }
1558 }
1559 
1560 /* src[] = { buffer_index, offset }. No const_index */
1561 static void
1562 emit_intrinsic_load_ssbo(struct ir3_context *ctx,
1563                          nir_intrinsic_instr *intr,
1564                          struct ir3_instruction **dst)
1565 {
1566    /* Note: isam currently can't handle vectorized loads/stores */
1567    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
1568        intr->def.num_components > 1 ||
1569        !ctx->compiler->has_isam_ssbo) {
1570       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1571       return;
1572    }
1573 
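   /* Reorderable single-component loads can go through the texture cache via
    * isam, using the offset source as the x coordinate (y is an unused 0).
    */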
1574    struct ir3_block *b = ctx->block;
1575    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
1576    struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
1577    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
1578 
1579    unsigned num_components = intr->def.num_components;
1580    struct ir3_instruction *sam =
1581       emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
1582                MASK(num_components), coords, NULL);
1583 
1584    ir3_handle_nonuniform(sam, intr);
1585 
1586    sam->barrier_class = IR3_BARRIER_BUFFER_R;
1587    sam->barrier_conflict = IR3_BARRIER_BUFFER_W;
1588 
1589    ir3_split_dest(b, dst, sam, 0, num_components);
1590 }
1591 
1592 static void
1593 emit_control_barrier(struct ir3_context *ctx)
1594 {
1595    /* Hull shaders dispatch 32 wide so an entire patch will always
1596     * fit in a single warp and execute in lock-step. Consequently,
1597     * we don't need to do anything for TCS barriers. Emitting a
1598     * barrier instruction would deadlock.
1599     */
1600    if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1601       return;
1602 
1603    struct ir3_block *b = ctx->block;
1604    struct ir3_instruction *barrier = ir3_BAR(b);
1605    barrier->cat7.g = true;
1606    if (ctx->compiler->gen < 6)
1607       barrier->cat7.l = true;
1608    barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1609    barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1610    array_insert(b, b->keeps, barrier);
1611 
1612    ctx->so->has_barrier = true;
1613 }
1614 
1615 static void
1616 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1617 {
1618    struct ir3_block *b = ctx->block;
1619    struct ir3_instruction *barrier;
1620 
1621    /* TODO: find out why there is a major difference in .l usage
1622     * between a5xx and a6xx.
1623     */
1624 
1625    mesa_scope exec_scope = nir_intrinsic_execution_scope(intr);
1626    mesa_scope mem_scope = nir_intrinsic_memory_scope(intr);
1627    nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1628    /* loads/stores are always cache-coherent so we can filter out
1629     * available/visible.
1630     */
1631    nir_memory_semantics semantics =
1632       nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
1633                                               NIR_MEMORY_RELEASE);
1634 
1635    if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1636       /* Remove mode corresponding to TCS patch barriers because hull shaders
1637        * dispatch 32 wide so an entire patch will always fit in a single warp
1638        * and execute in lock-step.
1639        *
1640        * TODO: memory barrier also tells us not to reorder stores, this
1641        * information is lost here (backend doesn't reorder stores so we
1642        * are safe for now).
1643        */
1644       modes &= ~nir_var_shader_out;
1645    }
1646 
1647    assert(!(modes & nir_var_shader_out));
1648 
1649    if ((modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
1650                  nir_var_image)) && semantics) {
1651       barrier = ir3_FENCE(b);
1652       barrier->cat7.r = true;
1653       barrier->cat7.w = true;
1654 
1655       if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
1656          barrier->cat7.g = true;
1657       }
1658 
1659       if (ctx->compiler->gen >= 6) {
1660          if (modes & (nir_var_mem_ssbo | nir_var_image)) {
1661             barrier->cat7.l = true;
1662          }
1663       } else {
1664          if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
1665             barrier->cat7.l = true;
1666          }
1667       }
1668 
1669       barrier->barrier_class = 0;
1670       barrier->barrier_conflict = 0;
1671 
1672       if (modes & nir_var_mem_shared) {
1673          barrier->barrier_class |= IR3_BARRIER_SHARED_W;
1674          barrier->barrier_conflict |=
1675             IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1676       }
1677 
1678       if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1679          barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
1680          barrier->barrier_conflict |=
1681             IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1682       }
1683 
1684       if (modes & nir_var_image) {
1685          barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
1686          barrier->barrier_conflict |=
1687             IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
1688       }
1689 
1690       /* make sure barrier doesn't get DCE'd */
1691       array_insert(b, b->keeps, barrier);
1692 
1693       if (ctx->compiler->gen >= 7 && mem_scope > SCOPE_WORKGROUP &&
1694           modes & (nir_var_mem_ssbo | nir_var_image) &&
1695           semantics & NIR_MEMORY_ACQUIRE) {
1696          /* "r + l" is not enough to synchronize reads with writes from other
1697          * workgroups, so we can disable them since they are useless here.
1698           */
1699          barrier->cat7.r = false;
1700          barrier->cat7.l = false;
1701 
1702          struct ir3_instruction *ccinv = ir3_CCINV(b);
1703          /* A7XX TODO: ccinv should just stick to the barrier,
1704           * the barrier class/conflict introduces unnecessary waits.
1705           */
1706          ccinv->barrier_class = barrier->barrier_class;
1707          ccinv->barrier_conflict = barrier->barrier_conflict;
1708          array_insert(b, b->keeps, ccinv);
1709       }
1710    }
1711 
1712    if (exec_scope >= SCOPE_WORKGROUP) {
1713       emit_control_barrier(ctx);
1714    }
1715 }
1716 
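/* Record a sysval input on the shader variant: bumps inputs_count, marks the
 * slot/compmask on the input, and accounts for it in total_in/sysval_in.
 */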
1717 static void
1718 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
1719                           unsigned compmask, struct ir3_instruction *instr)
1720 {
1721    struct ir3_shader_variant *so = ctx->so;
1722    unsigned n = so->inputs_count++;
1723 
1724    assert(instr->opc == OPC_META_INPUT);
1725    instr->input.inidx = n;
1726    instr->input.sysval = slot;
1727 
1728    so->inputs[n].sysval = true;
1729    so->inputs[n].slot = slot;
1730    so->inputs[n].compmask = compmask;
1731    so->total_in++;
1732 
1733    so->sysval_in += util_last_bit(compmask);
1734 }
1735 
1736 static struct ir3_instruction *
1737 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
1738                     unsigned compmask)
1739 {
1740    assert(compmask);
1741    struct ir3_instruction *sysval = create_input(ctx, compmask);
1742    add_sysval_input_compmask(ctx, slot, compmask, sysval);
1743    return sysval;
1744 }
1745 
1746 static struct ir3_instruction *
1747 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
1748 {
1749    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
1750                  SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
1751    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
1752                  SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
1753    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
1754                  SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
1755    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
1756                  SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
1757    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
1758                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
1759    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
1760                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
1761    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
1762                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
1763 
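   /* The asserts above guarantee that the ir3_bary enum values line up with
    * the gl_system_value barycentric range, so the sysval for a given bary is
    * just SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + bary.
    */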
1764    if (!ctx->ij[bary]) {
1765       struct ir3_instruction *xy[2];
1766       struct ir3_instruction *ij;
1767 
1768       ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
1769                                bary, 0x3);
1770       ir3_split_dest(ctx->in_block, xy, ij, 0, 2);
1771 
1772       ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
1773    }
1774 
1775    return ctx->ij[bary];
1776 }
1777 
1778 /* TODO: make this a common NIR helper?
1779  * There is a nir_system_value_from_intrinsic, but it takes nir_intrinsic_op so
1780  * it can't be extended to work with this.
1781  */
1782 static gl_system_value
1783 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
1784 {
1785    enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
1786    gl_system_value sysval;
1787 
1788    switch (intr->intrinsic) {
1789    case nir_intrinsic_load_barycentric_pixel:
1790       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1791          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1792       else
1793          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1794       break;
1795    case nir_intrinsic_load_barycentric_centroid:
1796       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1797          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
1798       else
1799          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
1800       break;
1801    case nir_intrinsic_load_barycentric_sample:
1802       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1803          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
1804       else
1805          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
1806       break;
1807    default:
1808       unreachable("invalid barycentric intrinsic");
1809    }
1810 
1811    return sysval;
1812 }
1813 
1814 static void
1815 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1816                            struct ir3_instruction **dst)
1817 {
1818    gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
1819 
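   /* With a single sample, the sample/centroid positions coincide with the
    * pixel center, so older gens simply fall back to the pixel barycentrics.
    */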
1820    if (!ctx->so->key.msaa && ctx->compiler->gen < 6) {
1821       switch (sysval) {
1822       case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
1823          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1824          break;
1825       case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
1826          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1827          break;
1828       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
1829          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1830          break;
1831       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
1832          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1833          break;
1834       default:
1835          break;
1836       }
1837    }
1838 
1839    enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1840 
1841    struct ir3_instruction *ij = get_barycentric(ctx, bary);
1842    ir3_split_dest(ctx->block, dst, ij, 0, 2);
1843 }
1844 
1845 static struct ir3_instruction *
1846 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1847 {
1848    if (!ctx->frag_coord) {
1849       struct ir3_block *b = ir3_after_preamble(ctx->ir);
1850       struct ir3_instruction *xyzw[4];
1851       struct ir3_instruction *hw_frag_coord;
1852 
1853       hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
1854       ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
1855 
1856       /* for frag_coord.xy, we get unsigned values.. we need
1857        * to subtract (integer) 8 and divide by 16 (right-
1858        * shift by 4) then convert to float:
1859        *
1860        *    sub.s tmp, src, 8
1861        *    shr.b tmp, tmp, 4
1862        *    mov.u32f32 dst, tmp
1863        *
1864        */
1865       for (int i = 0; i < 2; i++) {
1866          xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
1867          xyzw[i] =
1868             ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
1869       }
1870 
1871       ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
1872    }
1873 
1874    ctx->so->fragcoord_compmask |= nir_def_components_read(&intr->def);
1875 
1876    return ctx->frag_coord;
1877 }
1878 
1879 /* This is a bit of a hack until ir3_context is converted to store SSA values
1880  * as ir3_register's instead of ir3_instruction's. Pick out a given destination
1881  * of an instruction with multiple destinations using a mov that will get folded
1882  * away by ir3_cp.
1883  */
1884 static struct ir3_instruction *
1885 create_multidst_mov(struct ir3_block *block, struct ir3_register *dst)
1886 {
1887    struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1888    unsigned dst_flags = dst->flags & IR3_REG_HALF;
1889    unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
1890 
1891    __ssa_dst(mov)->flags |= dst_flags;
1892    struct ir3_register *src =
1893       ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
1894    src->wrmask = dst->wrmask;
1895    src->def = dst;
1896    assert(!(dst->flags & IR3_REG_RELATIV));
1897    mov->cat1.src_type = mov->cat1.dst_type =
1898       (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1899    return mov;
1900 }
1901 
1902 static reduce_op_t
1903 get_reduce_op(nir_op opc)
1904 {
1905    switch (opc) {
1906    case nir_op_iadd: return REDUCE_OP_ADD_U;
1907    case nir_op_fadd: return REDUCE_OP_ADD_F;
1908    case nir_op_imul: return REDUCE_OP_MUL_U;
1909    case nir_op_fmul: return REDUCE_OP_MUL_F;
1910    case nir_op_umin: return REDUCE_OP_MIN_U;
1911    case nir_op_imin: return REDUCE_OP_MIN_S;
1912    case nir_op_fmin: return REDUCE_OP_MIN_F;
1913    case nir_op_umax: return REDUCE_OP_MAX_U;
1914    case nir_op_imax: return REDUCE_OP_MAX_S;
1915    case nir_op_fmax: return REDUCE_OP_MAX_F;
1916    case nir_op_iand: return REDUCE_OP_AND_B;
1917    case nir_op_ior:  return REDUCE_OP_OR_B;
1918    case nir_op_ixor: return REDUCE_OP_XOR_B;
1919    default:
1920       unreachable("unknown NIR reduce op");
1921    }
1922 }
1923 
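/* Identity value for each reduction op, i.e. the value that leaves the result
 * unchanged; it is used below to initialize the shared accumulator register.
 */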
1924 static uint32_t
1925 get_reduce_identity(nir_op opc, unsigned size)
1926 {
1927    switch (opc) {
1928    case nir_op_iadd:
1929       return 0;
1930    case nir_op_fadd:
1931       return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
1932    case nir_op_imul:
1933       return 1;
1934    case nir_op_fmul:
1935       return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
1936    case nir_op_umax:
1937       return 0;
1938    case nir_op_imax:
1939       return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
1940    case nir_op_fmax:
1941       return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
1942    case nir_op_umin:
1943       return size == 32 ? UINT32_MAX : UINT16_MAX;
1944    case nir_op_imin:
1945       return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
1946    case nir_op_fmin:
1947       return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
1948    case nir_op_iand:
1949       return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
1950    case nir_op_ior:
1951       return 0;
1952    case nir_op_ixor:
1953       return 0;
1954    default:
1955       unreachable("unknown NIR reduce op");
1956    }
1957 }
1958 
1959 static struct ir3_instruction *
1960 emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1961 {
1962    struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
1963    nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
1964    reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
1965    unsigned dst_size = intr->def.bit_size;
1966    unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
1967 
1968    /* Note: the shared reg is initialized to the identity, so we need it to
1969     * always be 32-bit even when the source isn't because half shared regs are
1970     * not supported.
1971     */
1972    struct ir3_instruction *identity =
1973       create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
1974    identity = ir3_READ_FIRST_MACRO(ctx->block, identity, 0);
1975    identity->dsts[0]->flags |= IR3_REG_SHARED;
1976 
1977    /* OPC_SCAN_MACRO has the following destinations:
1978     * - Exclusive scan result (interferes with source)
1979     * - Inclusive scan result
1980     * - Shared reg reduction result, must be initialized to the identity
1981     *
1982     * The loop computes all three results at the same time, we just have to
1983     * choose which destination to return.
1984     */
1985    struct ir3_instruction *scan =
1986       ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2);
1987    scan->cat1.reduce_op = reduce_op;
1988 
1989    struct ir3_register *exclusive = __ssa_dst(scan);
1990    exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
1991    struct ir3_register *inclusive = __ssa_dst(scan);
1992    inclusive->flags |= flags;
1993    struct ir3_register *reduce = __ssa_dst(scan);
1994    reduce->flags |= IR3_REG_SHARED;
1995 
1996    /* The 32-bit multiply macro reads its sources after writing a partial result
1997     * to the destination, therefore inclusive also interferes with the source.
1998     */
1999    if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
2000       inclusive->flags |= IR3_REG_EARLY_CLOBBER;
2001 
2002    /* Normal source */
2003    __ssa_src(scan, src, 0);
2004 
2005    /* shared reg tied source */
2006    struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2007    ir3_reg_tie(reduce, reduce_init);
2008 
2009    struct ir3_register *dst;
2010    switch (intr->intrinsic) {
2011    case nir_intrinsic_reduce: dst = reduce; break;
2012    case nir_intrinsic_inclusive_scan: dst = inclusive; break;
2013    case nir_intrinsic_exclusive_scan: dst = exclusive; break;
2014    default:
2015       unreachable("unknown reduce intrinsic");
2016    }
2017 
2018    return create_multidst_mov(ctx->block, dst);
2019 }
2020 
2021 static struct ir3_instruction *
2022 emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
2023                                nir_intrinsic_instr *intr)
2024 {
2025    nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
2026    reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2027    unsigned dst_size = intr->def.bit_size;
2028 
2029    bool need_exclusive =
2030       intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
2031    bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
2032 
2033    /* Note: the shared reg is initialized to the identity, so we need it to
2034     * always be 32-bit even when the source isn't because half shared regs are
2035     * not supported.
2036     */
2037    struct ir3_instruction *identity =
2038       create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
2039    identity->dsts[0]->flags |= IR3_REG_SHARED;
2040 
2041    /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
2042     * - Shared reg reduction result, must be initialized to the identity
2043     * - Inclusive scan result
2044     * - (iff exclusive) Exclusive scan result. Conditionally added because
2045     *   calculating the exclusive value is optional (i.e., not a side-effect of
2046     *   calculating the inclusive value) and won't be DCE'd anymore at this
2047     *   point.
2048     * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
2049     *   most ops but this isn't possible for the 32b mul_u macro since its
2050     *   destination is clobbered. So conditionally allocate an extra
2051     *   register in that case.
2052     *
2053     * Note that the getlast loop this macro expands to iterates over all
2054     * clusters. However, for each iteration, not only the fibers in the current
2055     * cluster are active but all later ones as well. Since they still need their
2056     * sources when their cluster is handled, all destinations interfere with
2057     * the sources.
2058     */
2059    unsigned ndst = 2 + need_exclusive + need_scratch;
2060    unsigned nsrc = 2 + need_exclusive;
2061    struct ir3_instruction *scan =
2062       ir3_instr_create(ctx->block, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
2063    scan->cat1.reduce_op = reduce_op;
2064 
2065    unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
2066    if (ir3_bitsize(ctx, dst_size) == 16)
2067       dst_flags |= IR3_REG_HALF;
2068 
2069    struct ir3_register *reduce = __ssa_dst(scan);
2070    reduce->flags |= IR3_REG_SHARED;
2071    struct ir3_register *inclusive = __ssa_dst(scan);
2072    inclusive->flags |= dst_flags;
2073 
2074    struct ir3_register *exclusive = NULL;
2075    if (need_exclusive) {
2076       exclusive = __ssa_dst(scan);
2077       exclusive->flags |= dst_flags;
2078    }
2079 
2080    if (need_scratch) {
2081       struct ir3_register *scratch = __ssa_dst(scan);
2082       scratch->flags |= dst_flags;
2083    }
2084 
2085    struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2086    ir3_reg_tie(reduce, reduce_init);
2087 
2088    struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
2089    __ssa_src(scan, inclusive_src, 0);
2090 
2091    if (need_exclusive) {
2092       struct ir3_instruction *exclusive_src =
2093          ir3_get_src(ctx, &intr->src[1])[0];
2094       __ssa_src(scan, exclusive_src, 0);
2095    }
2096 
2097    struct ir3_register *dst;
2098    switch (intr->intrinsic) {
2099    case nir_intrinsic_reduce_clusters_ir3:
2100       dst = reduce;
2101       break;
2102    case nir_intrinsic_inclusive_scan_clusters_ir3:
2103       dst = inclusive;
2104       break;
2105    case nir_intrinsic_exclusive_scan_clusters_ir3: {
2106       assert(exclusive != NULL);
2107       dst = exclusive;
2108       break;
2109    }
2110    default:
2111       unreachable("unknown reduce intrinsic");
2112    }
2113 
2114    return create_multidst_mov(ctx->block, dst);
2115 }
2116 
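/* src[] = { default, value_to_broadcast }. cluster size in const_index */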
2117 static struct ir3_instruction *
2118 emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2119 {
2120    struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
2121    struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
2122    return ir3_BRCST_ACTIVE(ctx->block, nir_intrinsic_cluster_size(intr),
2123                            brcst_val, default_src);
2124 }
2125 
2126 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2127 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2128 
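/* Main dispatcher for NIR intrinsics: fetches the ir3 destination array (if
 * the intrinsic has a dest), emits the matching ir3 instruction(s), and then
 * commits the result via ir3_put_def().
 */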
2129 static void
2130 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2131 {
2132    const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
2133    struct ir3_instruction **dst;
2134    struct ir3_instruction *const *src;
2135    struct ir3_block *b = ctx->block;
2136    unsigned dest_components = nir_intrinsic_dest_components(intr);
2137    int idx;
2138 
2139    if (info->has_dest) {
2140       dst = ir3_get_def(ctx, &intr->def, dest_components);
2141    } else {
2142       dst = NULL;
2143    }
2144 
2145    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
2146    const unsigned primitive_param = const_state->offsets.primitive_param * 4;
2147    const unsigned primitive_map = const_state->offsets.primitive_map * 4;
2148 
2149    switch (intr->intrinsic) {
2150    case nir_intrinsic_decl_reg:
2151       /* There's logically nothing to do, but this has a destination in NIR so
2152        * plug in something... It will get DCE'd.
2153        */
2154       dst[0] = create_immed(ctx->block, 0);
2155       break;
2156 
2157    case nir_intrinsic_load_reg:
2158    case nir_intrinsic_load_reg_indirect: {
2159       struct ir3_array *arr = ir3_get_array(ctx, intr->src[0].ssa);
2160       struct ir3_instruction *addr = NULL;
2161 
2162       if (intr->intrinsic == nir_intrinsic_load_reg_indirect) {
2163          addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[1])[0],
2164                               dest_components);
2165       }
2166 
2167       ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[0].ssa);
2168       assert(dest_components == nir_intrinsic_num_components(decl));
2169 
2170       for (unsigned i = 0; i < dest_components; i++) {
2171          unsigned n = nir_intrinsic_base(intr) * dest_components + i;
2172          compile_assert(ctx, n < arr->length);
2173          dst[i] = ir3_create_array_load(ctx, arr, n, addr);
2174       }
2175 
2176       break;
2177    }
2178 
2179    case nir_intrinsic_store_reg:
2180    case nir_intrinsic_store_reg_indirect: {
2181       struct ir3_array *arr = ir3_get_array(ctx, intr->src[1].ssa);
2182       unsigned num_components = nir_src_num_components(intr->src[0]);
2183       struct ir3_instruction *addr = NULL;
2184 
2185       ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[1].ssa);
2186       assert(num_components == nir_intrinsic_num_components(decl));
2187 
2188       struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[0]);
2189 
2190       if (intr->intrinsic == nir_intrinsic_store_reg_indirect) {
2191          addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[2])[0],
2192                               num_components);
2193       }
2194 
2195       u_foreach_bit(i, nir_intrinsic_write_mask(intr)) {
2196          assert(i < num_components);
2197 
2198          unsigned n = nir_intrinsic_base(intr) * num_components + i;
2199          compile_assert(ctx, n < arr->length);
2200          if (value[i])
2201             ir3_create_array_store(ctx, arr, n, value[i], addr);
2202       }
2203 
2204       break;
2205    }
2206 
2207    case nir_intrinsic_load_uniform:
2208       idx = nir_intrinsic_base(intr);
2209       if (nir_src_is_const(intr->src[0])) {
2210          idx += nir_src_as_uint(intr->src[0]);
2211          for (int i = 0; i < dest_components; i++) {
2212             dst[i] = create_uniform_typed(
2213                b, idx + i,
2214                intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
2215          }
2216       } else {
2217          src = ir3_get_src(ctx, &intr->src[0]);
2218          for (int i = 0; i < dest_components; i++) {
2219             dst[i] = create_uniform_indirect(
2220                b, idx + i,
2221                intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
2222                ir3_get_addr0(ctx, src[0], 1));
2223          }
2224          /* NOTE: if relative addressing is used, we set
2225           * constlen in the compiler (to worst-case value)
2226           * since we don't know in the assembler what the max
2227           * addr reg value can be:
2228           */
2229          ctx->so->constlen =
2230             MAX2(ctx->so->constlen,
2231                  ctx->so->shader_options.num_reserved_user_consts +
2232                  const_state->ubo_state.size / 16);
2233       }
2234       break;
2235 
2236    case nir_intrinsic_load_vs_primitive_stride_ir3:
2237       dst[0] = create_uniform(b, primitive_param + 0);
2238       break;
2239    case nir_intrinsic_load_vs_vertex_stride_ir3:
2240       dst[0] = create_uniform(b, primitive_param + 1);
2241       break;
2242    case nir_intrinsic_load_hs_patch_stride_ir3:
2243       dst[0] = create_uniform(b, primitive_param + 2);
2244       break;
2245    case nir_intrinsic_load_patch_vertices_in:
2246       dst[0] = create_uniform(b, primitive_param + 3);
2247       break;
2248    case nir_intrinsic_load_tess_param_base_ir3:
2249       dst[0] = create_uniform(b, primitive_param + 4);
2250       dst[1] = create_uniform(b, primitive_param + 5);
2251       break;
2252    case nir_intrinsic_load_tess_factor_base_ir3:
2253       dst[0] = create_uniform(b, primitive_param + 6);
2254       dst[1] = create_uniform(b, primitive_param + 7);
2255       break;
2256 
2257    case nir_intrinsic_load_primitive_location_ir3:
2258       idx = nir_intrinsic_driver_location(intr);
2259       dst[0] = create_uniform(b, primitive_map + idx);
2260       break;
2261 
2262    case nir_intrinsic_load_gs_header_ir3:
2263       dst[0] = ctx->gs_header;
2264       break;
2265    case nir_intrinsic_load_tcs_header_ir3:
2266       dst[0] = ctx->tcs_header;
2267       break;
2268 
2269    case nir_intrinsic_load_rel_patch_id_ir3:
2270       dst[0] = ctx->rel_patch_id;
2271       break;
2272 
2273    case nir_intrinsic_load_primitive_id:
2274       if (!ctx->primitive_id) {
2275          ctx->primitive_id =
2276             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
2277       }
2278       dst[0] = ctx->primitive_id;
2279       break;
2280 
2281    case nir_intrinsic_load_tess_coord_xy:
2282       if (!ctx->tess_coord) {
2283          ctx->tess_coord =
2284             create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
2285       }
2286       ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
2287       break;
2288 
2289    case nir_intrinsic_end_patch_ir3:
2290       assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
2291       struct ir3_instruction *end = ir3_PREDE(b);
2292       array_insert(b, b->keeps, end);
2293 
2294       end->barrier_class = IR3_BARRIER_EVERYTHING;
2295       end->barrier_conflict = IR3_BARRIER_EVERYTHING;
2296       break;
2297 
2298    case nir_intrinsic_store_global_ir3:
2299       ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
2300       break;
2301    case nir_intrinsic_load_global_ir3:
2302       ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
2303       break;
2304 
2305    case nir_intrinsic_load_ubo:
2306       emit_intrinsic_load_ubo(ctx, intr, dst);
2307       break;
2308    case nir_intrinsic_load_ubo_vec4:
2309       emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
2310       break;
2311    case nir_intrinsic_copy_ubo_to_uniform_ir3:
2312       emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
2313       break;
2314    case nir_intrinsic_copy_global_to_uniform_ir3:
2315       emit_intrinsic_copy_global_to_uniform(ctx, intr);
2316       break;
2317    case nir_intrinsic_load_frag_coord:
2318    case nir_intrinsic_load_frag_coord_unscaled_ir3:
2319       ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
2320       break;
2321    case nir_intrinsic_load_sample_pos_from_id: {
2322       /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
2323        * but that doesn't seem necessary.
2324        */
2325       struct ir3_instruction *offset =
2326          ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
2327       offset->dsts[0]->wrmask = 0x3;
2328       offset->cat5.type = TYPE_F32;
2329 
2330       ir3_split_dest(b, dst, offset, 0, 2);
2331 
2332       break;
2333    }
2334    case nir_intrinsic_load_persp_center_rhw_ir3:
2335       if (!ctx->ij[IJ_PERSP_CENTER_RHW]) {
2336          ctx->ij[IJ_PERSP_CENTER_RHW] =
2337             create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1);
2338       }
2339       dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW];
2340       break;
2341    case nir_intrinsic_load_barycentric_centroid:
2342    case nir_intrinsic_load_barycentric_sample:
2343    case nir_intrinsic_load_barycentric_pixel:
2344       emit_intrinsic_barycentric(ctx, intr, dst);
2345       break;
2346    case nir_intrinsic_load_interpolated_input:
2347    case nir_intrinsic_load_input:
2348       setup_input(ctx, intr);
2349       break;
2350    case nir_intrinsic_load_kernel_input:
2351       emit_intrinsic_load_kernel_input(ctx, intr, dst);
2352       break;
2353    /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
2354     * pass and replaced by an ir3-specific version that adds the
2355     * dword-offset in the last source.
2356     */
2357    case nir_intrinsic_load_ssbo_ir3:
2358       emit_intrinsic_load_ssbo(ctx, intr, dst);
2359       break;
2360    case nir_intrinsic_store_ssbo_ir3:
2361       ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
2362       break;
2363    case nir_intrinsic_get_ssbo_size:
2364       emit_intrinsic_ssbo_size(ctx, intr, dst);
2365       break;
2366    case nir_intrinsic_ssbo_atomic_ir3:
2367    case nir_intrinsic_ssbo_atomic_swap_ir3:
2368       dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
2369       break;
2370    case nir_intrinsic_load_shared:
2371       emit_intrinsic_load_shared(ctx, intr, dst);
2372       break;
2373    case nir_intrinsic_store_shared:
2374       emit_intrinsic_store_shared(ctx, intr);
2375       break;
2376    case nir_intrinsic_shared_atomic:
2377    case nir_intrinsic_shared_atomic_swap:
2378       dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
2379       break;
2380    case nir_intrinsic_load_scratch:
2381       emit_intrinsic_load_scratch(ctx, intr, dst);
2382       break;
2383    case nir_intrinsic_store_scratch:
2384       emit_intrinsic_store_scratch(ctx, intr);
2385       break;
2386    case nir_intrinsic_image_load:
2387    case nir_intrinsic_bindless_image_load:
2388       emit_intrinsic_load_image(ctx, intr, dst);
2389       break;
2390    case nir_intrinsic_image_store:
2391    case nir_intrinsic_bindless_image_store:
2392       ctx->funcs->emit_intrinsic_store_image(ctx, intr);
2393       break;
2394    case nir_intrinsic_image_size:
2395    case nir_intrinsic_bindless_image_size:
2396       ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
2397       break;
2398    case nir_intrinsic_image_atomic:
2399    case nir_intrinsic_bindless_image_atomic:
2400    case nir_intrinsic_image_atomic_swap:
2401    case nir_intrinsic_bindless_image_atomic_swap:
2402       dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
2403       break;
2404    case nir_intrinsic_barrier:
2405       emit_intrinsic_barrier(ctx, intr);
2406       /* note that the blk ptr is no longer valid, make that obvious: */
2407       b = NULL;
2408       break;
2409    case nir_intrinsic_store_output:
2410       setup_output(ctx, intr);
2411       break;
2412    case nir_intrinsic_load_base_vertex:
2413    case nir_intrinsic_load_first_vertex:
2414       if (!ctx->basevertex) {
2415          ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
2416       }
2417       dst[0] = ctx->basevertex;
2418       break;
2419    case nir_intrinsic_load_is_indexed_draw:
2420       if (!ctx->is_indexed_draw) {
2421          ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_IS_INDEXED_DRAW);
2422       }
2423       dst[0] = ctx->is_indexed_draw;
2424       break;
2425    case nir_intrinsic_load_draw_id:
2426       if (!ctx->draw_id) {
2427          ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
2428       }
2429       dst[0] = ctx->draw_id;
2430       break;
2431    case nir_intrinsic_load_base_instance:
2432       if (!ctx->base_instance) {
2433          ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
2434       }
2435       dst[0] = ctx->base_instance;
2436       break;
2437    case nir_intrinsic_load_view_index:
2438       if (!ctx->view_index) {
2439          ctx->view_index =
2440             create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
2441       }
2442       dst[0] = ctx->view_index;
2443       break;
2444    case nir_intrinsic_load_vertex_id_zero_base:
2445    case nir_intrinsic_load_vertex_id:
2446       if (!ctx->vertex_id) {
2447          gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
2448                                  ? SYSTEM_VALUE_VERTEX_ID
2449                                  : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
2450          ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
2451       }
2452       dst[0] = ctx->vertex_id;
2453       break;
2454    case nir_intrinsic_load_instance_id:
2455       if (!ctx->instance_id) {
2456          ctx->instance_id =
2457             create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
2458       }
2459       dst[0] = ctx->instance_id;
2460       break;
2461    case nir_intrinsic_load_sample_id:
2462    case nir_intrinsic_load_sample_id_no_per_sample:
2463       if (!ctx->samp_id) {
2464          ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
2465          ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
2466       }
2467       dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
2468       break;
2469    case nir_intrinsic_load_sample_mask_in:
2470       if (!ctx->samp_mask_in) {
2471          ctx->samp_mask_in =
2472             create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
2473       }
2474       dst[0] = ctx->samp_mask_in;
2475       break;
2476    case nir_intrinsic_load_user_clip_plane:
2477       idx = nir_intrinsic_ucp_id(intr);
2478       for (int i = 0; i < dest_components; i++) {
2479          unsigned n = idx * 4 + i;
2480          dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
2481       }
2482       break;
2483    case nir_intrinsic_load_front_face:
2484       if (!ctx->frag_face) {
2485          ctx->so->frag_face = true;
2486          ctx->frag_face =
2487             create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
2488          ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
2489       }
2490       /* for fragface, we get -1 for back and 0 for front. However this is
2491        * the inverse of what nir expects (where ~0 is true).
2492        */
2493       dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
2494                           create_immed_typed(b, 0, TYPE_U16), 0);
2495       dst[0]->cat2.condition = IR3_COND_EQ;
2496       break;
2497    case nir_intrinsic_load_local_invocation_id:
2498       if (!ctx->local_invocation_id) {
2499          ctx->local_invocation_id =
2500             create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
2501       }
2502       ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
2503       break;
2504    case nir_intrinsic_load_workgroup_id:
2505    case nir_intrinsic_load_workgroup_id_zero_base:
2506       if (ctx->compiler->has_shared_regfile) {
2507          if (!ctx->work_group_id) {
2508             ctx->work_group_id =
2509                create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
2510             ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
2511          }
2512          ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
2513       } else {
2514          /* For a3xx/a4xx, this comes in via const injection by the hw */
2515          for (int i = 0; i < dest_components; i++) {
2516             dst[i] = create_driver_param(ctx, IR3_DP_WORKGROUP_ID_X + i);
2517          }
2518       }
2519       break;
2520    case nir_intrinsic_load_base_workgroup_id:
2521       for (int i = 0; i < dest_components; i++) {
2522          dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
2523       }
2524       break;
2525    case nir_intrinsic_load_num_workgroups:
2526       for (int i = 0; i < dest_components; i++) {
2527          dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
2528       }
2529       break;
2530    case nir_intrinsic_load_workgroup_size:
2531       for (int i = 0; i < dest_components; i++) {
2532          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2533       }
2534       break;
2535    case nir_intrinsic_load_subgroup_size: {
2536       assert(ctx->so->type == MESA_SHADER_COMPUTE ||
2537              ctx->so->type == MESA_SHADER_FRAGMENT);
2538       enum ir3_driver_param size = ctx->so->type == MESA_SHADER_COMPUTE ?
2539          IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
2540       dst[0] = create_driver_param(ctx, size);
2541       break;
2542    }
2543    case nir_intrinsic_load_subgroup_id_shift_ir3:
2544       dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2545       break;
2546    case nir_intrinsic_load_work_dim:
2547       dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM);
2548       break;
2549    case nir_intrinsic_load_subgroup_invocation:
2550       assert(ctx->compiler->has_getfiberid);
2551       dst[0] = ir3_GETFIBERID(b);
2552       dst[0]->cat6.type = TYPE_U32;
2553       __ssa_dst(dst[0]);
2554       break;
2555    case nir_intrinsic_load_tess_level_outer_default:
2556       for (int i = 0; i < dest_components; i++) {
2557          dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
2558       }
2559       break;
2560    case nir_intrinsic_load_tess_level_inner_default:
2561       for (int i = 0; i < dest_components; i++) {
2562          dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
2563       }
2564       break;
2565    case nir_intrinsic_load_frag_invocation_count:
2566       dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
2567       break;
2568    case nir_intrinsic_load_frag_size_ir3:
2569    case nir_intrinsic_load_frag_offset_ir3: {
2570       enum ir3_driver_param param =
2571          intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
2572          IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
2573       if (nir_src_is_const(intr->src[0])) {
2574          uint32_t view = nir_src_as_uint(intr->src[0]);
2575          for (int i = 0; i < dest_components; i++) {
2576             dst[i] = create_driver_param(ctx, param + 4 * view + i);
2577          }
2578       } else {
2579          struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
2580          for (int i = 0; i < dest_components; i++) {
2581             dst[i] = create_driver_param_indirect(ctx, param + i,
2582                                                   ir3_get_addr0(ctx, view, 4));
2583          }
2584          ctx->so->constlen =
2585             MAX2(ctx->so->constlen,
2586                  const_state->offsets.driver_param + param / 4 +
2587                  nir_intrinsic_range(intr));
2588       }
2589       break;
2590    }
2591    case nir_intrinsic_discard_if:
2592    case nir_intrinsic_discard:
2593    case nir_intrinsic_demote:
2594    case nir_intrinsic_demote_if:
2595    case nir_intrinsic_terminate:
2596    case nir_intrinsic_terminate_if: {
2597       struct ir3_instruction *cond, *kill;
2598 
2599       if (intr->intrinsic == nir_intrinsic_discard_if ||
2600           intr->intrinsic == nir_intrinsic_demote_if ||
2601           intr->intrinsic == nir_intrinsic_terminate_if) {
2602          /* conditional discard: */
2603          src = ir3_get_src(ctx, &intr->src[0]);
2604          cond = src[0];
2605       } else {
2606          /* unconditional discard: */
2607          cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
2608       }
2609 
2610       /* NOTE: only cmps.*.* can write p0.x: */
2611       struct ir3_instruction *zero =
2612             create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
2613       cond = ir3_CMPS_S(b, cond, 0, zero, 0);
2614       cond->cat2.condition = IR3_COND_NE;
2615 
2616       /* condition always goes in predicate register: */
2617       cond->dsts[0]->num = regid(REG_P0, 0);
2618       cond->dsts[0]->flags &= ~IR3_REG_SSA;
2619 
2620       if (intr->intrinsic == nir_intrinsic_demote ||
2621           intr->intrinsic == nir_intrinsic_demote_if) {
2622          kill = ir3_DEMOTE(b, cond, 0);
2623       } else {
2624          kill = ir3_KILL(b, cond, 0);
2625       }
2626 
2627       /* - Side-effects should not be moved to a different side of the kill
2628        * - Instructions that depend on active fibers should not be reordered
2629        */
2630       kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2631                             IR3_BARRIER_ACTIVE_FIBERS_W;
2632       kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2633                                IR3_BARRIER_ACTIVE_FIBERS_R;
2634       kill->srcs[0]->num = regid(REG_P0, 0);
2635       array_insert(ctx->ir, ctx->ir->predicates, kill);
2636 
2637       array_insert(b, b->keeps, kill);
2638       ctx->so->has_kill = true;
2639 
2640       break;
2641    }
2642 
2643    case nir_intrinsic_cond_end_ir3: {
2644       struct ir3_instruction *cond, *kill;
2645 
2646       src = ir3_get_src(ctx, &intr->src[0]);
2647       cond = src[0];
2648 
2649       /* NOTE: only cmps.*.* can write p0.x: */
2650       struct ir3_instruction *zero =
2651             create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
2652       cond = ir3_CMPS_S(b, cond, 0, zero, 0);
2653       cond->cat2.condition = IR3_COND_NE;
2654 
2655       /* condition always goes in predicate register: */
2656       cond->dsts[0]->num = regid(REG_P0, 0);
2657 
2658       kill = ir3_PREDT(b, cond, 0);
2659 
2660       kill->barrier_class = IR3_BARRIER_EVERYTHING;
2661       kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
2662 
2663       array_insert(ctx->ir, ctx->ir->predicates, kill);
2664       array_insert(b, b->keeps, kill);
2665       break;
2666    }
2667 
2668    case nir_intrinsic_vote_any:
2669    case nir_intrinsic_vote_all: {
2670       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2671       struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2672       if (intr->intrinsic == nir_intrinsic_vote_any)
2673          dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
2674       else
2675          dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
2676       dst[0]->srcs[0]->num = regid(REG_P0, 0);
2677       array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2678       break;
2679    }
2680    case nir_intrinsic_elect:
2681       dst[0] = ir3_ELECT_MACRO(ctx->block);
2682       break;
2683    case nir_intrinsic_preamble_start_ir3:
2684       dst[0] = ir3_SHPS_MACRO(ctx->block);
2685       break;
2686 
2687    case nir_intrinsic_read_invocation_cond_ir3: {
2688       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2689       struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
2690       dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
2691                                    src, 0);
2692       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2693       dst[0]->srcs[0]->num = regid(REG_P0, 0);
2694       array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2695       break;
2696    }
2697 
2698    case nir_intrinsic_read_first_invocation: {
2699       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2700       dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
2701       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2702       break;
2703    }
2704 
2705    case nir_intrinsic_ballot: {
2706       struct ir3_instruction *ballot;
2707       unsigned components = intr->def.num_components;
2708       if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
2709          /* ballot(true) is just MOVMSK */
2710          ballot = ir3_MOVMSK(ctx->block, components);
2711       } else {
2712          struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2713          struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2714          ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
2715          ballot->srcs[0]->num = regid(REG_P0, 0);
2716          array_insert(ctx->ir, ctx->ir->predicates, ballot);
2717       }
2718 
2719       ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
2720       ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W;
2721 
2722       ir3_split_dest(ctx->block, dst, ballot, 0, components);
2723       break;
2724    }
2725 
2726    case nir_intrinsic_quad_broadcast: {
2727       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2728       struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
2729 
2730       type_t dst_type = type_uint_size(intr->def.bit_size);
2731 
2732       if (dst_type != TYPE_U32)
2733          idx = ir3_COV(ctx->block, idx, TYPE_U32, dst_type);
2734 
2735       dst[0] = ir3_QUAD_SHUFFLE_BRCST(ctx->block, src, 0, idx, 0);
2736       dst[0]->cat5.type = dst_type;
2737       break;
2738    }
2739 
2740    case nir_intrinsic_quad_swap_horizontal: {
2741       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2742       dst[0] = ir3_QUAD_SHUFFLE_HORIZ(ctx->block, src, 0);
2743       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
2744       break;
2745    }
2746 
2747    case nir_intrinsic_quad_swap_vertical: {
2748       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2749       dst[0] = ir3_QUAD_SHUFFLE_VERT(ctx->block, src, 0);
2750       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
2751       break;
2752    }
2753 
2754    case nir_intrinsic_quad_swap_diagonal: {
2755       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2756       dst[0] = ir3_QUAD_SHUFFLE_DIAG(ctx->block, src, 0);
2757       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
2758       break;
2759    }
2760 
2761    case nir_intrinsic_load_shared_ir3:
2762       emit_intrinsic_load_shared_ir3(ctx, intr, dst);
2763       break;
2764    case nir_intrinsic_store_shared_ir3:
2765       emit_intrinsic_store_shared_ir3(ctx, intr);
2766       break;
2767    case nir_intrinsic_bindless_resource_ir3:
2768       dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
2769       break;
2770    case nir_intrinsic_global_atomic_ir3:
2771    case nir_intrinsic_global_atomic_swap_ir3: {
2772       dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
2773       break;
2774    }
2775 
2776    case nir_intrinsic_reduce:
2777    case nir_intrinsic_inclusive_scan:
2778    case nir_intrinsic_exclusive_scan:
2779       dst[0] = emit_intrinsic_reduce(ctx, intr);
2780       break;
2781 
2782    case nir_intrinsic_reduce_clusters_ir3:
2783    case nir_intrinsic_inclusive_scan_clusters_ir3:
2784    case nir_intrinsic_exclusive_scan_clusters_ir3:
2785       dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
2786       break;
2787 
2788    case nir_intrinsic_brcst_active_ir3:
2789       dst[0] = emit_intrinsic_brcst_active(ctx, intr);
2790       break;
2791 
2792    case nir_intrinsic_preamble_end_ir3: {
2793       struct ir3_instruction *instr = ir3_SHPE(ctx->block);
2794       instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
2795       array_insert(b, b->keeps, instr);
2796       break;
2797    }
2798    case nir_intrinsic_store_uniform_ir3: {
2799       unsigned components = nir_src_num_components(intr->src[0]);
2800       unsigned dst = nir_intrinsic_base(intr);
2801       unsigned dst_lo = dst & 0xff;
2802       unsigned dst_hi = dst >> 8;
2803 
2804       struct ir3_instruction *src =
2805          ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), components);
2806       struct ir3_instruction *a1 = NULL;
2807       if (dst_hi) {
2808          /* Encode only the high part of the destination in a1.x to increase the
2809           * chance that we can reuse the a1.x value in subsequent stc
2810           * instructions.
2811           */
2812          a1 = ir3_get_addr1(ctx, dst_hi << 8);
2813       }
2814 
2815       struct ir3_instruction *stc =
2816          ir3_STC(ctx->block, create_immed(b, dst_lo),  0, src, 0);
2817       stc->cat6.iim_val = components;
2818       stc->cat6.type = TYPE_U32;
2819       stc->barrier_conflict = IR3_BARRIER_CONST_W;
2820       if (a1) {
2821          ir3_instr_set_address(stc, a1);
2822          stc->flags |= IR3_INSTR_A1EN;
2823       }
2824       array_insert(b, b->keeps, stc);
2825       break;
2826    }
2827    case nir_intrinsic_copy_push_const_to_uniform_ir3: {
2828       struct ir3_instruction *load =
2829          ir3_instr_create(ctx->block, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
2830       array_insert(b, b->keeps, load);
2831 
2832       load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
2833       load->push_consts.src_base = nir_intrinsic_base(intr);
2834       load->push_consts.src_size = nir_intrinsic_range(intr);
2835 
2836       ctx->so->constlen =
2837          MAX2(ctx->so->constlen,
2838               DIV_ROUND_UP(
2839                  load->push_consts.dst_base + load->push_consts.src_size, 4));
2840       break;
2841    }
2842    default:
2843       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
2844                         nir_intrinsic_infos[intr->intrinsic].name);
2845       break;
2846    }
2847 
2848    if (info->has_dest)
2849       ir3_put_def(ctx, &intr->def);
2850 }
2851 
2852 static void
2853 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
2854 {
2855    struct ir3_instruction **dst =
2856       ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
2857    unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size);
2858 
2859    if (bit_size <= 8) {
2860       for (int i = 0; i < instr->def.num_components; i++)
2861          dst[i] = create_immed_typed(ctx->block, instr->value[i].u8, TYPE_U8);
2862    } else if (bit_size <= 16) {
2863       for (int i = 0; i < instr->def.num_components; i++)
2864          dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
2865    } else {
2866       for (int i = 0; i < instr->def.num_components; i++)
2867          dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
2868    }
2869 }
2870 
2871 static void
2872 emit_undef(struct ir3_context *ctx, nir_undef_instr *undef)
2873 {
2874    struct ir3_instruction **dst =
2875       ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
2876    type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size));
2877 
2878    /* backend doesn't want undefined instructions, so just plug
2879     * in 0.0..
2880     */
2881    for (int i = 0; i < undef->def.num_components; i++)
2882       dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
2883 }
2884 
2885 /*
2886  * texture fetch/sample instructions:
2887  */
2888 
2889 static type_t
2890 get_tex_dest_type(nir_tex_instr *tex)
2891 {
2892    type_t type;
2893 
2894    switch (tex->dest_type) {
2895    case nir_type_float32:
2896       return TYPE_F32;
2897    case nir_type_float16:
2898       return TYPE_F16;
2899    case nir_type_int32:
2900       return TYPE_S32;
2901    case nir_type_int16:
2902       return TYPE_S16;
2903    case nir_type_bool32:
2904    case nir_type_uint32:
2905       return TYPE_U32;
2906    case nir_type_bool16:
2907    case nir_type_uint16:
2908       return TYPE_U16;
2909    case nir_type_invalid:
2910    default:
2911       unreachable("bad dest_type");
2912    }
2913 
2914    return type;
2915 }
2916 
2917 static void
2918 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
2919 {
2920    unsigned coords =
2921       glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
2922    unsigned flags = 0;
2923 
2924    /* note: we would use tex->coord_components, except for txs.  Also,
2925     * since the array index goes after the shadow ref, we don't want to
2926     * count it here:
2927     */
2928    if (coords == 3)
2929       flags |= IR3_INSTR_3D;
2930 
2931    if (tex->is_shadow && tex->op != nir_texop_lod)
2932       flags |= IR3_INSTR_S;
2933 
2934    if (tex->is_array && tex->op != nir_texop_lod)
2935       flags |= IR3_INSTR_A;
2936 
2937    *flagsp = flags;
2938    *coordsp = coords;
2939 }
2940 
2941 /* Gets the sampler/texture index as an hvec2, which can either be dynamic
2942  * or immediate (in which case it will get lowered later to a non-.s2en
2943  * version of the tex instruction that encodes tex/samp as immediates):
2944  */
2945 static struct tex_src_info
2946 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
2947 {
2948    struct ir3_block *b = ctx->block;
2949    struct tex_src_info info = {0};
2950    int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
2951    int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2952    struct ir3_instruction *texture, *sampler;
2953 
2954    if (texture_idx >= 0 || sampler_idx >= 0) {
2955       /* Bindless case */
2956       info.flags |= IR3_INSTR_B;
2957 
2958       if (tex->texture_non_uniform || tex->sampler_non_uniform)
2959          info.flags |= IR3_INSTR_NONUNIF;
2960 
2961       /* Gather information required to determine which encoding to
2962        * choose as well as for prefetch.
2963        */
2964       nir_intrinsic_instr *bindless_tex = NULL;
2965       bool tex_const;
2966       if (texture_idx >= 0) {
2967          ctx->so->bindless_tex = true;
2968          bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
2969          assert(bindless_tex);
2970          info.tex_base = nir_intrinsic_desc_set(bindless_tex);
2971          tex_const = nir_src_is_const(bindless_tex->src[0]);
2972          if (tex_const)
2973             info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
2974       } else {
2975          /* To simplify some of the logic below, assume the index is
2976           * constant 0 when it's not enabled.
2977           */
2978          tex_const = true;
2979          info.tex_idx = 0;
2980       }
2981       nir_intrinsic_instr *bindless_samp = NULL;
2982       bool samp_const;
2983       if (sampler_idx >= 0) {
2984          ctx->so->bindless_samp = true;
2985          bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
2986          assert(bindless_samp);
2987          info.samp_base = nir_intrinsic_desc_set(bindless_samp);
2988          samp_const = nir_src_is_const(bindless_samp->src[0]);
2989          if (samp_const)
2990             info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
2991       } else {
2992          samp_const = true;
2993          info.samp_idx = 0;
2994       }
2995 
2996       /* Choose encoding. */
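           /* Three possibilities: tex/samp fit entirely in the instruction,
            * they are constant but need the overflow carried in a1.x, or they
            * are dynamic and use the .s2en path below.
            */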
2997       if (tex_const && samp_const && info.tex_idx < 256 &&
2998           info.samp_idx < 256) {
2999          if (info.tex_idx < 16 && info.samp_idx < 16 &&
3000              (!bindless_tex || !bindless_samp ||
3001               info.tex_base == info.samp_base)) {
3002             /* Everything fits within the instruction */
3003             info.base = info.tex_base;
3004          } else {
3005             info.base = info.tex_base;
3006             if (ctx->compiler->gen <= 6) {
3007                info.a1_val = info.tex_idx << 3 | info.samp_base;
3008             } else {
3009                info.a1_val = info.samp_idx << 3 | info.samp_base;
3010             }
3011 
3012             info.flags |= IR3_INSTR_A1EN;
3013          }
3014          info.samp_tex = NULL;
3015       } else {
3016          info.flags |= IR3_INSTR_S2EN;
3017          /* In the indirect case, we only use a1.x to store the sampler
3018           * base if it differs from the texture base.
3019           */
3020          if (!bindless_tex || !bindless_samp ||
3021              info.tex_base == info.samp_base) {
3022             info.base = info.tex_base;
3023          } else {
3024             info.base = info.tex_base;
3025             info.a1_val = info.samp_base;
3026             info.flags |= IR3_INSTR_A1EN;
3027          }
3028 
3029          /* Note: the indirect source is now a vec2 instead of hvec2, and
3030           * for some reason the texture and sampler are swapped.
3031           */
3032          struct ir3_instruction *texture, *sampler;
3033 
3034          if (bindless_tex) {
3035             texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
3036          } else {
3037             texture = create_immed(b, 0);
3038          }
3039 
3040          if (bindless_samp) {
3041             sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
3042          } else {
3043             sampler = create_immed(b, 0);
3044          }
3045          info.samp_tex = ir3_collect(b, texture, sampler);
3046       }
3047    } else {
3048       info.flags |= IR3_INSTR_S2EN;
3049       texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
3050       sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
3051       if (texture_idx >= 0) {
3052          texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
3053          texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
3054       } else {
3055          /* TODO what to do for dynamic case? I guess we only need the
3056           * max index for astc srgb workaround so maybe not a problem
3057           * to worry about if we don't enable indirect samplers for
3058           * a4xx?
3059           */
3060          ctx->max_texture_index =
3061             MAX2(ctx->max_texture_index, tex->texture_index);
3062          texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
3063          info.tex_idx = tex->texture_index;
3064       }
3065 
3066       if (sampler_idx >= 0) {
3067          sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
3068          sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
3069       } else {
3070          sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
3071          info.samp_idx = tex->texture_index;
3072       }
3073 
3074       info.samp_tex = ir3_collect(b, sampler, texture);
3075    }
3076 
3077    return info;
3078 }
3079 
3080 static void
3081 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
3082 {
3083    struct ir3_block *b = ctx->block;
3084    struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
3085    struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
3086    struct ir3_instruction *lod, *compare, *proj, *sample_index;
3087    struct tex_src_info info = {0};
3088    bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
3089    unsigned i, coords, flags, ncomp;
3090    unsigned nsrc0 = 0, nsrc1 = 0;
3091    type_t type;
3092    opc_t opc = 0;
3093 
3094    ncomp = tex->def.num_components;
3095 
3096    coord = off = ddx = ddy = NULL;
3097    lod = proj = compare = sample_index = NULL;
3098 
3099    dst = ir3_get_def(ctx, &tex->def, ncomp);
3100 
3101    for (unsigned i = 0; i < tex->num_srcs; i++) {
3102       switch (tex->src[i].src_type) {
3103       case nir_tex_src_coord:
3104          coord = ir3_get_src(ctx, &tex->src[i].src);
3105          break;
3106       case nir_tex_src_bias:
3107          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3108          has_bias = true;
3109          break;
3110       case nir_tex_src_lod:
3111          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3112          has_lod = true;
3113          break;
3114       case nir_tex_src_comparator: /* shadow comparator */
3115          compare = ir3_get_src(ctx, &tex->src[i].src)[0];
3116          break;
3117       case nir_tex_src_projector:
3118          proj = ir3_get_src(ctx, &tex->src[i].src)[0];
3119          has_proj = true;
3120          break;
3121       case nir_tex_src_offset:
3122          off = ir3_get_src(ctx, &tex->src[i].src);
3123          has_off = true;
3124          break;
3125       case nir_tex_src_ddx:
3126          ddx = ir3_get_src(ctx, &tex->src[i].src);
3127          break;
3128       case nir_tex_src_ddy:
3129          ddy = ir3_get_src(ctx, &tex->src[i].src);
3130          break;
3131       case nir_tex_src_ms_index:
3132          sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
3133          break;
3134       case nir_tex_src_texture_offset:
3135       case nir_tex_src_sampler_offset:
3136       case nir_tex_src_texture_handle:
3137       case nir_tex_src_sampler_handle:
3138          /* handled in get_tex_samp_tex_src() */
3139          break;
3140       default:
3141          ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
3142                            tex->src[i].src_type);
3143          return;
3144       }
3145    }
3146 
3147    switch (tex->op) {
3148    case nir_texop_tex_prefetch:
3149       compile_assert(ctx, !has_bias);
3150       compile_assert(ctx, !has_lod);
3151       compile_assert(ctx, !compare);
3152       compile_assert(ctx, !has_proj);
3153       compile_assert(ctx, !has_off);
3154       compile_assert(ctx, !ddx);
3155       compile_assert(ctx, !ddy);
3156       compile_assert(ctx, !sample_index);
3157       compile_assert(
3158          ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
3159       compile_assert(
3160          ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
3161 
3162       if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
3163          opc = OPC_META_TEX_PREFETCH;
3164          ctx->so->num_sampler_prefetch++;
3165          break;
3166       }
3167       FALLTHROUGH;
3168    case nir_texop_tex:
3169       opc = has_lod ? OPC_SAML : OPC_SAM;
3170       break;
3171    case nir_texop_txb:
3172       opc = OPC_SAMB;
3173       break;
3174    case nir_texop_txl:
3175       opc = OPC_SAML;
3176       break;
3177    case nir_texop_txd:
3178       opc = OPC_SAMGQ;
3179       break;
3180    case nir_texop_txf:
3181       opc = OPC_ISAML;
3182       break;
3183    case nir_texop_lod:
3184       opc = OPC_GETLOD;
3185       break;
3186    case nir_texop_tg4:
3187       switch (tex->component) {
3188       case 0:
3189          opc = OPC_GATHER4R;
3190          break;
3191       case 1:
3192          opc = OPC_GATHER4G;
3193          break;
3194       case 2:
3195          opc = OPC_GATHER4B;
3196          break;
3197       case 3:
3198          opc = OPC_GATHER4A;
3199          break;
3200       }
3201       break;
3202    case nir_texop_txf_ms_fb:
3203    case nir_texop_txf_ms:
3204       opc = OPC_ISAMM;
3205       break;
3206    default:
3207       ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
3208       return;
3209    }
3210 
3211    tex_info(tex, &flags, &coords);
3212 
3213    /*
3214     * lay out the first argument in the proper order:
3215     *  - actual coordinates first
3216     *  - shadow reference
3217     *  - array index
3218     *  - projection w
3219     *  - starting at offset 4, dpdx.xy, dpdy.xy
3220     *
3221     * bias/lod go into the second arg
3222     */
3223 
3224    /* insert tex coords: */
3225    for (i = 0; i < coords; i++)
3226       src0[i] = coord[i];
3227 
3228    nsrc0 = i;
3229 
3230    type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32;
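        /* Any padding immediates added below need to match the coord register
         * size (half vs full), hence coord_pad_type.
         */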
3231    /* scale up integer coords for TXF based on the LOD */
3232    if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
3233       assert(has_lod);
3234       for (i = 0; i < coords; i++)
3235          src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
3236    }
3237 
3238    if (coords == 1) {
3239       /* hw doesn't do 1d, so we treat it as 2d with
3240        * height of 1, and patch up the y coord.
3241        */
3242       if (is_isam(opc)) {
3243          src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type);
3244       } else if (is_half(coord[0])) {
3245          src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type);
3246       } else {
3247          src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type);
3248       }
3249    }
3250 
3251    if (tex->is_shadow && tex->op != nir_texop_lod)
3252       src0[nsrc0++] = compare;
3253 
3254    if (tex->is_array && tex->op != nir_texop_lod)
3255       src0[nsrc0++] = coord[coords];
3256 
3257    if (has_proj) {
3258       src0[nsrc0++] = proj;
3259       flags |= IR3_INSTR_P;
3260    }
3261 
3262    /* pad to 4, then ddx/ddy: */
3263    if (tex->op == nir_texop_txd) {
3264       while (nsrc0 < 4)
3265          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3266       for (i = 0; i < coords; i++)
3267          src0[nsrc0++] = ddx[i];
3268       if (coords < 2)
3269          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3270       for (i = 0; i < coords; i++)
3271          src0[nsrc0++] = ddy[i];
3272       if (coords < 2)
3273          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3274    }
3275 
3276    /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
3277     * with scaled x coord according to requested sample:
3278     */
3279    if (opc == OPC_ISAMM) {
3280       if (ctx->compiler->txf_ms_with_isaml) {
3281          /* the samples are laid out in x dimension as
3282           *     0 1 2 3
3283           * x_ms = (x << ms) + sample_index;
3284           */
3285          struct ir3_instruction *ms;
3286          ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
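              /* ctx->samples packs a 2-bit log2(sample count) per texture;
               * shift by 2 * texture_index to extract this texture's field.
               */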
3287 
3288          src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
3289          src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
3290 
3291          opc = OPC_ISAML;
3292       } else {
3293          src0[nsrc0++] = sample_index;
3294       }
3295    }
3296 
3297    /*
3298     * second argument (if applicable):
3299     *  - offsets
3300     *  - lod
3301     *  - bias
3302     */
3303    if (has_off | has_lod | has_bias) {
3304       if (has_off) {
3305          unsigned off_coords = coords;
3306          if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3307             off_coords--;
3308          for (i = 0; i < off_coords; i++)
3309             src1[nsrc1++] = off[i];
3310          if (off_coords < 2)
3311             src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3312          flags |= IR3_INSTR_O;
3313       }
3314 
3315       if (has_lod | has_bias)
3316          src1[nsrc1++] = lod;
3317    }
3318 
3319    type = get_tex_dest_type(tex);
3320 
3321    if (opc == OPC_GETLOD)
3322       type = TYPE_S32;
3323 
3324    if (tex->op == nir_texop_txf_ms_fb) {
3325       compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
3326 
3327       ctx->so->fb_read = true;
3328       if (ctx->compiler->options.bindless_fb_read_descriptor >= 0) {
3329          ctx->so->bindless_tex = true;
3330          info.flags = IR3_INSTR_B;
3331          info.base = ctx->compiler->options.bindless_fb_read_descriptor;
3332          struct ir3_instruction *texture, *sampler;
3333 
3334          int base_index =
3335             nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3336          nir_src tex_src = tex->src[base_index].src;
3337 
3338          if (nir_src_is_const(tex_src)) {
3339             texture = create_immed_typed(b,
3340                nir_src_as_uint(tex_src) + ctx->compiler->options.bindless_fb_read_slot,
3341                TYPE_U32);
3342          } else {
3343             texture = create_immed_typed(
3344                ctx->block, ctx->compiler->options.bindless_fb_read_slot, TYPE_U32);
3345             struct ir3_instruction *base =
3346                ir3_get_src(ctx, &tex->src[base_index].src)[0];
3347             texture = ir3_ADD_U(b, texture, 0, base, 0);
3348          }
3349          sampler = create_immed_typed(ctx->block, 0, TYPE_U32);
3350          info.samp_tex = ir3_collect(b, texture, sampler);
3351          info.flags |= IR3_INSTR_S2EN;
3352          if (tex->texture_non_uniform) {
3353             info.flags |= IR3_INSTR_NONUNIF;
3354          }
3355       } else {
3356          /* Otherwise append a sampler to be patched into the texture
3357           * state:
3358           */
3359          info.samp_tex = ir3_collect(
3360                b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
3361                create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
3362          info.flags = IR3_INSTR_S2EN;
3363       }
3364 
3365       ctx->so->num_samp++;
3366    } else {
3367       info = get_tex_samp_tex_src(ctx, tex);
3368    }
3369 
3370    bool tg4_swizzle_fixup = false;
3371    if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 &&
3372          ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) {
3373       uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index];
3374       uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7;
3375       if (swizzle > 3) {
3376          /* this would mean that we can just return 0 / 1, no texturing
3377           * necessary
3378           */
3379          struct ir3_instruction *imm = create_immed(b,
3380                type_float(type) ? fui(swizzle - 4) : (swizzle - 4));
3381          for (int i = 0; i < 4; i++)
3382             dst[i] = imm;
3383          ir3_put_def(ctx, &tex->def);
3384          return;
3385       }
3386       opc = OPC_GATHER4R + swizzle;
3387       tg4_swizzle_fixup = true;
3388    }
3389 
3390    struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
3391    struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
3392 
3393    if (opc == OPC_META_TEX_PREFETCH) {
3394       int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3395 
3396 
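           /* The prefetch variant is emitted into the input block, using the
            * pixel barycentrics and a packed input offset rather than a normal
            * coord source; tex/samp come from the prefetch fields below.
            */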
3397       sam = ir3_SAM(ctx->in_block, opc, type, MASK(ncomp), 0, NULL,
3398                     get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
3399       sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
3400       /* make sure not to add irrelevant flags like S2EN */
3401       sam->flags = flags | (info.flags & IR3_INSTR_B);
3402       sam->prefetch.tex = info.tex_idx;
3403       sam->prefetch.samp = info.samp_idx;
3404       sam->prefetch.tex_base = info.tex_base;
3405       sam->prefetch.samp_base = info.samp_base;
3406    } else {
3407       info.flags |= flags;
3408       sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
3409    }
3410 
3411    if (tg4_swizzle_fixup) {
3412       /* TODO: fix-up for ASTC when alpha is selected? */
3413       array_insert(ctx->ir, ctx->ir->tg4, sam);
3414 
3415       ir3_split_dest(b, dst, sam, 0, 4);
3416 
3417       uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12;
3418       if (!type_float(type) && tex_bits != 3 /* 32bpp */ &&
3419             tex_bits != 0 /* key unset */) {
3420          uint8_t bits = 0;
3421          switch (tex_bits) {
3422          case 1: /* 8bpp */
3423             bits = 8;
3424             break;
3425          case 2: /* 16bpp */
3426             bits = 16;
3427             break;
3428          case 4: /* 10bpp or 2bpp for alpha */
3429             if (opc == OPC_GATHER4A)
3430                bits = 2;
3431             else
3432                bits = 10;
3433             break;
3434          default:
3435             assert(0);
3436          }
3437 
3438          sam->cat5.type = TYPE_F32;
3439          for (int i = 0; i < 4; i++) {
3440             /* scale and offset the unorm data */
3441             dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0);
3442             /* convert the scaled value to integer */
3443             dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32);
3444             /* sign extend for signed values */
3445             if (type == TYPE_S32) {
3446                dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3447                dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3448             }
3449          }
3450       }
3451    } else if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
3452        tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */
3453        !nir_tex_instr_is_query(tex)) {
3454       assert(opc != OPC_META_TEX_PREFETCH);
3455 
3456       /* only need first 3 components: */
3457       sam->dsts[0]->wrmask = 0x7;
3458       ir3_split_dest(b, dst, sam, 0, 3);
3459 
3460       /* we need to sample the alpha separately with a non-SRGB
3461        * texture state:
3462        */
3463       sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
3464                     col0, col1);
3465 
3466       array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
3467 
3468       /* fixup .w component: */
3469       ir3_split_dest(b, &dst[3], sam, 3, 1);
3470    } else {
3471       /* normal (non-workaround) case: */
3472       ir3_split_dest(b, dst, sam, 0, ncomp);
3473    }
3474 
3475    /* GETLOD returns results in 4.8 fixed point */
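        /* i.e. multiply by 1.0/256 to shift out the 8 fractional bits. */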
3476    if (opc == OPC_GETLOD) {
3477       bool half = tex->def.bit_size == 16;
3478       struct ir3_instruction *factor =
3479          half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16)
3480               : create_immed(b, fui(1.0 / 256));
3481 
3482       for (i = 0; i < 2; i++) {
3483          dst[i] = ir3_MUL_F(
3484             b, ir3_COV(b, dst[i], TYPE_S32, half ? TYPE_F16 : TYPE_F32), 0,
3485             factor, 0);
3486       }
3487    }
3488 
3489    ir3_put_def(ctx, &tex->def);
3490 }
3491 
3492 static void
3493 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
3494 {
3495    struct ir3_block *b = ctx->block;
3496    struct ir3_instruction **dst, *sam;
3497    type_t dst_type = get_tex_dest_type(tex);
3498    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3499 
3500    dst = ir3_get_def(ctx, &tex->def, 1);
3501 
3502    sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
3503 
3504    /* even though there is only one component, since it ends
3505     * up in .y/.z/.w rather than .x, we need a split_dest()
3506     */
3507    ir3_split_dest(b, dst, sam, idx, 1);
3508 
3509    /* The # of levels comes from getinfo.z. We need to add 1 to it, since
3510     * the value in TEX_CONST_0 is zero-based.
3511     */
3512    if (ctx->compiler->levels_add_one)
3513       dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
3514 
3515    ir3_put_def(ctx, &tex->def);
3516 }
3517 
3518 static void
3519 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
3520 {
3521    struct ir3_block *b = ctx->block;
3522    struct ir3_instruction **dst, *sam;
3523    struct ir3_instruction *lod;
3524    unsigned flags, coords;
3525    type_t dst_type = get_tex_dest_type(tex);
3526    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3527 
3528    tex_info(tex, &flags, &coords);
3529    info.flags |= flags;
3530 
3531    /* Actually we want the number of dimensions, not coordinates. This
3532     * distinction only matters for cubes.
3533     */
3534    if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3535       coords = 2;
3536 
3537    dst = ir3_get_def(ctx, &tex->def, 4);
3538 
3539    int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
3540    compile_assert(ctx, lod_idx >= 0);
3541 
3542    lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
3543 
3544    if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
3545       sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
3546    } else {
3547       /*
3548        * The maximum value that OPC_GETSIZE can return for one dimension
3549        * is 0x007ff0, however a sampler buffer can be much bigger.
3550        * The blob uses OPC_GETBUF for them.
3551        */
3552       sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
3553    }
3554 
3555    ir3_split_dest(b, dst, sam, 0, 4);
3556 
3557    /* Array size actually ends up in .w rather than .z. This doesn't
3558     * matter for miplevel 0, but for higher mips the value in z is
3559     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
3560     * returned, which means that we have to add 1 to it for arrays.
3561     */
3562    if (tex->is_array) {
3563       if (ctx->compiler->levels_add_one) {
3564          dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
3565       } else {
3566          dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
3567       }
3568    }
3569 
3570    ir3_put_def(ctx, &tex->def);
3571 }
3572 
3573 /* phi instructions are left partially constructed.  We don't resolve
3574  * their srcs until the end of the shader, since (e.g. with loops) one of
3575  * the phi's srcs might be defined after the phi due to back edges in
3576  * the CFG.
3577  */
3578 static void
3579 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
3580 {
3581    struct ir3_instruction *phi, **dst;
3582 
3583    /* NOTE: phi's should be lowered to scalar at this point */
3584    compile_assert(ctx, nphi->def.num_components == 1);
3585 
3586    dst = ir3_get_def(ctx, &nphi->def, 1);
3587 
3588    phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
3589                           exec_list_length(&nphi->srcs));
3590    __ssa_dst(phi);
3591    phi->phi.nphi = nphi;
3592 
3593    dst[0] = phi;
3594 
3595    ir3_put_def(ctx, &nphi->def);
3596 }
3597 
3598 static struct ir3_block *get_block(struct ir3_context *ctx,
3599                                    const nir_block *nblock);
3600 
3601 static struct ir3_instruction *
3602 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
3603              struct ir3_instruction *phi, nir_phi_instr *nphi)
3604 {
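        /* A block without an nblock is a continue block created by us; it has
         * no corresponding NIR phi srcs, so recurse into its predecessors and
         * stitch the values together with a new phi.
         */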
3605    if (!blk->nblock) {
3606       struct ir3_instruction *continue_phi =
3607          ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
3608       __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
3609 
3610       for (unsigned i = 0; i < blk->predecessors_count; i++) {
3611          struct ir3_instruction *src =
3612             read_phi_src(ctx, blk->predecessors[i], phi, nphi);
3613          if (src)
3614             __ssa_src(continue_phi, src, 0);
3615          else
3616             ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
3617       }
3618 
3619       return continue_phi;
3620    }
3621 
3622    nir_foreach_phi_src (nsrc, nphi) {
3623       if (blk->nblock == nsrc->pred) {
3624          if (nsrc->src.ssa->parent_instr->type == nir_instr_type_undef) {
3625             /* Create an ir3 undef */
3626             return NULL;
3627          } else {
3628             return ir3_get_src(ctx, &nsrc->src)[0];
3629          }
3630       }
3631    }
3632 
3633    unreachable("couldn't find phi node ir3 block");
3634    return NULL;
3635 }
3636 
3637 static void
3638 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
3639 {
3640    foreach_instr (phi, &block->instr_list) {
3641       if (phi->opc != OPC_META_PHI)
3642          break;
3643 
3644       nir_phi_instr *nphi = phi->phi.nphi;
3645 
3646       if (!nphi) /* skip continue phis created above */
3647          continue;
3648 
3649       for (unsigned i = 0; i < block->predecessors_count; i++) {
3650          struct ir3_block *pred = block->predecessors[i];
3651          struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
3652          if (src) {
3653             __ssa_src(phi, src, 0);
3654          } else {
3655             /* Create an ir3 undef */
3656             ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
3657          }
3658       }
3659    }
3660 }
3661 
3662 static void
3663 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
3664 {
3665    switch (jump->type) {
3666    case nir_jump_break:
3667    case nir_jump_continue:
3668    case nir_jump_return:
3669       /* I *think* we can simply just ignore this, and use the
3670        * successor block link to figure out where we need to
3671        * jump to for break/continue
3672        */
3673       break;
3674    default:
3675       ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
3676       break;
3677    }
3678 }
3679 
3680 static void
3681 emit_instr(struct ir3_context *ctx, nir_instr *instr)
3682 {
3683    switch (instr->type) {
3684    case nir_instr_type_alu:
3685       emit_alu(ctx, nir_instr_as_alu(instr));
3686       break;
3687    case nir_instr_type_deref:
3688       /* ignored, handled as part of the intrinsic they are src to */
3689       break;
3690    case nir_instr_type_intrinsic:
3691       emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
3692       break;
3693    case nir_instr_type_load_const:
3694       emit_load_const(ctx, nir_instr_as_load_const(instr));
3695       break;
3696    case nir_instr_type_undef:
3697       emit_undef(ctx, nir_instr_as_undef(instr));
3698       break;
3699    case nir_instr_type_tex: {
3700       nir_tex_instr *tex = nir_instr_as_tex(instr);
3701       /* a couple of tex instructions get special-cased:
3702        */
3703       switch (tex->op) {
3704       case nir_texop_txs:
3705          emit_tex_txs(ctx, tex);
3706          break;
3707       case nir_texop_query_levels:
3708          emit_tex_info(ctx, tex, 2);
3709          break;
3710       case nir_texop_texture_samples:
3711          emit_tex_info(ctx, tex, 3);
3712          break;
3713       default:
3714          emit_tex(ctx, tex);
3715          break;
3716       }
3717       break;
3718    }
3719    case nir_instr_type_jump:
3720       emit_jump(ctx, nir_instr_as_jump(instr));
3721       break;
3722    case nir_instr_type_phi:
3723       emit_phi(ctx, nir_instr_as_phi(instr));
3724       break;
3725    case nir_instr_type_call:
3726    case nir_instr_type_parallel_copy:
3727       ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
3728                         instr->type);
3729       break;
3730    }
3731 }
3732 
3733 static struct ir3_block *
3734 get_block(struct ir3_context *ctx, const nir_block *nblock)
3735 {
3736    struct ir3_block *block;
3737    struct hash_entry *hentry;
3738 
3739    hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
3740    if (hentry)
3741       return hentry->data;
3742 
3743    block = ir3_block_create(ctx->ir);
3744    block->nblock = nblock;
3745    _mesa_hash_table_insert(ctx->block_ht, nblock, block);
3746 
3747    return block;
3748 }
3749 
3750 static struct ir3_block *
3751 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
3752 {
3753    struct hash_entry *hentry;
3754 
3755    hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
3756    if (hentry)
3757       return hentry->data;
3758 
3759    return get_block(ctx, nblock);
3760 }
3761 
3762 static struct ir3_block *
3763 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
3764 {
3765    struct ir3_block *block = ir3_block_create(ctx->ir);
3766    block->nblock = NULL;
3767    _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
3768    return block;
3769 }
3770 
3771 static void
3772 emit_block(struct ir3_context *ctx, nir_block *nblock)
3773 {
3774    ctx->block = get_block(ctx, nblock);
3775 
3776    list_addtail(&ctx->block->node, &ctx->ir->block_list);
3777 
3778    ctx->block->loop_id = ctx->loop_id;
3779    ctx->block->loop_depth = ctx->loop_depth;
3780 
3781    /* re-emit addr register in each block if needed: */
3782    for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
3783       _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
3784       ctx->addr0_ht[i] = NULL;
3785    }
3786 
3787    _mesa_hash_table_u64_destroy(ctx->addr1_ht);
3788    ctx->addr1_ht = NULL;
3789 
3790    nir_foreach_instr (instr, nblock) {
3791       ctx->cur_instr = instr;
3792       emit_instr(ctx, instr);
3793       ctx->cur_instr = NULL;
3794       if (ctx->error)
3795          return;
3796    }
3797 
3798    for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
3799       if (nblock->successors[i]) {
3800          ctx->block->successors[i] =
3801             get_block_or_continue(ctx, nblock->successors[i]);
3802       }
3803    }
3804 
3805    _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
3806 }
3807 
3808 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
3809 
3810 static void
3811 emit_if(struct ir3_context *ctx, nir_if *nif)
3812 {
3813    struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
3814 
3815    if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
3816       ctx->block->condition = ssa(condition->srcs[0]);
3817       ctx->block->brtype = IR3_BRANCH_ANY;
3818    } else if (condition->opc == OPC_ALL_MACRO &&
3819               condition->block == ctx->block) {
3820       ctx->block->condition = ssa(condition->srcs[0]);
3821       ctx->block->brtype = IR3_BRANCH_ALL;
3822    } else if (condition->opc == OPC_ELECT_MACRO &&
3823               condition->block == ctx->block) {
3824       ctx->block->condition = NULL;
3825       ctx->block->brtype = IR3_BRANCH_GETONE;
3826    } else if (condition->opc == OPC_SHPS_MACRO &&
3827               condition->block == ctx->block) {
3828       /* TODO: technically this only works if the block is the only user of the
3829        * shps, but we only use it in very constrained scenarios so this should
3830        * be ok.
3831        */
3832       ctx->block->condition = NULL;
3833       ctx->block->brtype = IR3_BRANCH_SHPS;
3834    } else {
3835       ctx->block->condition = ir3_get_predicate(ctx, condition);
3836       ctx->block->brtype = IR3_BRANCH_COND;
3837    }
3838 
3839    emit_cf_list(ctx, &nif->then_list);
3840    emit_cf_list(ctx, &nif->else_list);
3841 }
3842 
3843 static void
3844 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
3845 {
3846    assert(!nir_loop_has_continue_construct(nloop));
3847    unsigned old_loop_id = ctx->loop_id;
3848    ctx->loop_id = ctx->so->loops + 1;
3849    ctx->loop_depth++;
3850 
3851    struct nir_block *nstart = nir_loop_first_block(nloop);
3852    struct ir3_block *continue_blk = NULL;
3853 
3854    /* There's always one incoming edge from outside the loop, and if there
3855     * is more than one backedge from inside the loop (so more than 2 total
3856     * edges) then we need to create a continue block after the loop to ensure
3857     * that control reconverges at the end of each loop iteration.
3858     */
3859    if (nstart->predecessors->entries > 2) {
3860       continue_blk = create_continue_block(ctx, nstart);
3861    }
3862 
3863    emit_cf_list(ctx, &nloop->body);
3864 
3865    if (continue_blk) {
3866       struct ir3_block *start = get_block(ctx, nstart);
3867       continue_blk->successors[0] = start;
3868       continue_blk->loop_id = ctx->loop_id;
3869       continue_blk->loop_depth = ctx->loop_depth;
3870       list_addtail(&continue_blk->node, &ctx->ir->block_list);
3871    }
3872 
3873    ctx->so->loops++;
3874    ctx->loop_depth--;
3875    ctx->loop_id = old_loop_id;
3876 }
3877 
3878 static void
3879 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
3880 {
3881    foreach_list_typed (nir_cf_node, node, node, list) {
3882       switch (node->type) {
3883       case nir_cf_node_block:
3884          emit_block(ctx, nir_cf_node_as_block(node));
3885          break;
3886       case nir_cf_node_if:
3887          emit_if(ctx, nir_cf_node_as_if(node));
3888          break;
3889       case nir_cf_node_loop:
3890          emit_loop(ctx, nir_cf_node_as_loop(node));
3891          break;
3892       case nir_cf_node_function:
3893          ir3_context_error(ctx, "TODO\n");
3894          break;
3895       }
3896    }
3897 }
3898 
3899 /* emit stream-out code.  At this point, the current block is the original
3900  * (nir) end block, and nir ensures that all flow control paths terminate
3901  * into the end block.  We re-purpose the original end block to generate
3902  * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
3903  * block holding stream-out write instructions, followed by the new end
3904  * block:
3905  *
3906  *   blockOrigEnd {
3907  *      p0.x = (vtxcnt < maxvtxcnt)
3908  *      // succs: blockStreamOut, blockNewEnd
3909  *   }
3910  *   blockStreamOut {
3911  *      // preds: blockOrigEnd
3912  *      ... stream-out instructions ...
3913  *      // succs: blockNewEnd
3914  *   }
3915  *   blockNewEnd {
3916  *      // preds: blockOrigEnd, blockStreamOut
3917  *   }
3918  */
3919 static void
3920 emit_stream_out(struct ir3_context *ctx)
3921 {
3922    struct ir3 *ir = ctx->ir;
3923    struct ir3_stream_output_info *strmout = &ctx->so->stream_output;
3924    struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
3925    struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
3926    struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
3927 
3928    /* create vtxcnt input in input block at top of shader,
3929     * so that it is seen as live over the entire duration
3930     * of the shader:
3931     */
3932    vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
3933    maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
3934 
3935    /* at this point, we are at the original 'end' block,
3936     * re-purpose this block to stream-out condition, then
3937     * append stream-out block and new-end block
3938     */
3939    orig_end_block = ctx->block;
3940 
3941    // maybe w/ store_global intrinsic, we could do this
3942    // stuff in nir->nir pass
3943 
3944    stream_out_block = ir3_block_create(ir);
3945    list_addtail(&stream_out_block->node, &ir->block_list);
3946 
3947    new_end_block = ir3_block_create(ir);
3948    list_addtail(&new_end_block->node, &ir->block_list);
3949 
3950    orig_end_block->successors[0] = stream_out_block;
3951    orig_end_block->successors[1] = new_end_block;
3952 
3953    stream_out_block->successors[0] = new_end_block;
3954 
3955    /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
3956    cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
3957    cond->dsts[0]->num = regid(REG_P0, 0);
3958    cond->dsts[0]->flags &= ~IR3_REG_SSA;
3959    cond->cat2.condition = IR3_COND_LT;
3960 
3961    /* condition goes on previous block to the conditional,
3962     * since it is used to pick which of the two successor
3963     * paths to take:
3964     */
3965    orig_end_block->condition = cond;
3966 
3967    /* switch to stream_out_block to generate the stream-out
3968     * instructions:
3969     */
3970    ctx->block = stream_out_block;
3971 
3972    /* Calculate base addresses based on vtxcnt.  Instructions
3973     * generated for bases not used in following loop will be
3974     * stripped out in the backend.
3975     */
3976    for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3977       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
3978       unsigned stride = strmout->stride[i];
3979       struct ir3_instruction *base, *off;
3980 
3981       base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
3982 
3983       /* 24-bit should be enough: */
3984       off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
3985                         create_immed(ctx->block, stride * 4), 0);
3986 
3987       bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
3988    }
3989 
3990    /* Generate the per-output store instructions: */
3991    for (unsigned i = 0; i < strmout->num_outputs; i++) {
3992       for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
3993          unsigned c = j + strmout->output[i].start_component;
3994          struct ir3_instruction *base, *out, *stg;
3995 
3996          base = bases[strmout->output[i].output_buffer];
3997          out = ctx->outputs[regid(strmout->output[i].register_index, c)];
3998 
3999          stg = ir3_STG(
4000             ctx->block, base, 0,
4001             create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
4002             0, out, 0, create_immed(ctx->block, 1), 0);
4003          stg->cat6.type = TYPE_U32;
4004 
4005          array_insert(ctx->block, ctx->block->keeps, stg);
4006       }
4007    }
4008 
4009    /* and finally switch to the new_end_block: */
4010    ctx->block = new_end_block;
4011 }
4012 
4013 static void
4014 setup_predecessors(struct ir3 *ir)
4015 {
4016    foreach_block (block, &ir->block_list) {
4017       for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
4018          if (block->successors[i])
4019             ir3_block_add_predecessor(block->successors[i], block);
4020       }
4021    }
4022 }
4023 
4024 static void
4025 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
4026 {
4027    nir_metadata_require(impl, nir_metadata_block_index);
4028 
4029    emit_cf_list(ctx, &impl->body);
4030    emit_block(ctx, impl->end_block);
4031 
4032    /* at this point, we should have a single empty block,
4033     * into which we emit the 'end' instruction.
4034     */
4035    compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
4036 
4037    /* If stream-out (aka transform-feedback) enabled, emit the
4038     * stream-out instructions, followed by a new empty block (into
4039     * which the 'end' instruction lands).
4040     *
4041     * NOTE: it is done in this order, rather than inserting before
4042     * we emit end_block, because NIR guarantees that all blocks
4043     * flow into end_block, and that end_block has no successors.
4044     * So by re-purposing end_block as the first block of stream-
4045     * out, we guarantee that all exit paths flow into the stream-
4046     * out instructions.
4047     */
4048    if ((ctx->compiler->gen < 5) &&
4049        (ctx->so->stream_output.num_outputs > 0) &&
4050        !ctx->so->binning_pass) {
4051       assert(ctx->so->type == MESA_SHADER_VERTEX);
4052       emit_stream_out(ctx);
4053    }
4054 
4055    setup_predecessors(ctx->ir);
4056    foreach_block (block, &ctx->ir->block_list) {
4057       resolve_phis(ctx, block);
4058    }
4059 }
4060 
4061 static void
4062 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4063 {
4064    struct ir3_shader_variant *so = ctx->so;
4065    struct ir3_instruction *coord = NULL;
4066 
4067    if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
4068       coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);
4069 
4070    compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
4071 
4072    unsigned frac = nir_intrinsic_component(intr);
4073    unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
4074    unsigned ncomp = nir_intrinsic_dest_components(intr);
4075    unsigned n = nir_intrinsic_base(intr) + offset;
4076    unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
4077    unsigned compmask;
4078 
4079    /* Inputs are loaded using ldlw or ldg for other stages. */
4080    compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
4081                           ctx->so->type == MESA_SHADER_VERTEX);
4082 
4083    if (ctx->so->type == MESA_SHADER_FRAGMENT)
4084       compmask = BITFIELD_MASK(ncomp) << frac;
4085    else
4086       compmask = BITFIELD_MASK(ncomp + frac);
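        /* FS inputs are fetched per-component via bary.f, so only the
         * components actually read are marked; vertex fetch starts at
         * component 0, so the leading components up to frac are included.
         */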
4087 
4088    /* for a4xx+ rasterflat */
4089    if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
4090       coord = NULL;
4091 
4092    so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
4093 
4094    so->inputs[n].slot = slot;
4095    so->inputs[n].compmask |= compmask;
4096    so->inputs_count = MAX2(so->inputs_count, n + 1);
4097    compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
4098    so->inputs[n].flat = !coord;
4099 
4100    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4101       compile_assert(ctx, slot != VARYING_SLOT_POS);
4102 
4103       so->inputs[n].bary = true;
4104 
4105       for (int i = 0; i < ncomp; i++) {
4106          unsigned idx = (n * 4) + i + frac;
4107          ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
4108       }
4109 
4110       if (slot == VARYING_SLOT_PRIMITIVE_ID)
4111          so->reads_primid = true;
4112    } else {
4113       struct ir3_instruction *input = NULL;
4114 
4115       foreach_input (in, ctx->ir) {
4116          if (in->input.inidx == n) {
4117             input = in;
4118             break;
4119          }
4120       }
4121 
4122       if (!input) {
4123          input = create_input(ctx, compmask);
4124          input->input.inidx = n;
4125       } else {
4126          /* For aliased inputs, just append to the wrmask, i.e. if we
4127           * first see a vec2 index at slot N, and then later a vec4,
4128           * the wrmask of the resulting overlapped vec2 and vec4 is 0xf.
4129           */
4130          input->dsts[0]->wrmask |= compmask;
4131       }
4132 
4133       for (int i = 0; i < ncomp + frac; i++) {
4134          unsigned idx = (n * 4) + i;
4135          compile_assert(ctx, idx < ctx->ninputs);
4136 
4137          /* fixup the src wrmask to avoid validation fail */
4138          if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
4139             ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
4140             continue;
4141          }
4142 
4143          ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
4144       }
4145 
4146       for (int i = 0; i < ncomp; i++) {
4147          unsigned idx = (n * 4) + i + frac;
4148          ctx->last_dst[i] = ctx->inputs[idx];
4149       }
4150    }
4151 }
4152 
4153 /* Initially we assign non-packed inloc's for varyings, as we don't really
4154  * know up-front which components will be unused.  After all the compilation
4155  * stages we scan the shader to see which components are actually used, and
4156  * re-pack the inlocs to eliminate unneeded varyings.
4157  */
4158 static void
4159 pack_inlocs(struct ir3_context *ctx)
4160 {
4161    struct ir3_shader_variant *so = ctx->so;
4162    uint8_t used_components[so->inputs_count];
4163 
4164    memset(used_components, 0, sizeof(used_components));
4165 
4166    /*
4167     * First Step: scan shader to find which bary.f/ldlv remain:
4168     */
4169 
4170    foreach_block (block, &ctx->ir->block_list) {
4171       foreach_instr (instr, &block->instr_list) {
4172          if (is_input(instr)) {
4173             unsigned inloc = instr->srcs[0]->iim_val;
4174             unsigned i = inloc / 4;
4175             unsigned j = inloc % 4;
4176 
4177             compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
4178             compile_assert(ctx, i < so->inputs_count);
4179 
4180             used_components[i] |= 1 << j;
4181          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4182             for (int n = 0; n < 2; n++) {
4183                unsigned inloc = instr->prefetch.input_offset + n;
4184                unsigned i = inloc / 4;
4185                unsigned j = inloc % 4;
4186 
4187                compile_assert(ctx, i < so->inputs_count);
4188 
4189                used_components[i] |= 1 << j;
4190             }
4191          }
4192       }
4193    }
4194 
4195    /*
4196     * Second Step: reassign varying inloc/slots:
4197     */
4198 
4199    unsigned inloc = 0;
4200 
4201    /* for clip+cull distances, unused components can't be eliminated because
4202     * they're read by fixed-function, even if there's a hole.  Note that
4203     * clip/cull distance arrays must be declared in the FS, so we can just
4204     * use the NIR clip/cull distances to avoid reading ucp_enables in the
4205     * shader key.
4206     */
4207    unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4208 
4209    for (unsigned i = 0; i < so->inputs_count; i++) {
4210       unsigned compmask = 0, maxcomp = 0;
4211 
4212       so->inputs[i].inloc = inloc;
4213       so->inputs[i].bary = false;
4214 
4215       if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
4216           so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
4217          if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
4218             compmask = clip_cull_mask & 0xf;
4219          else
4220             compmask = clip_cull_mask >> 4;
4221          used_components[i] = compmask;
4222       }
4223 
4224       for (unsigned j = 0; j < 4; j++) {
4225          if (!(used_components[i] & (1 << j)))
4226             continue;
4227 
4228          compmask |= (1 << j);
4229          maxcomp = j + 1;
4230 
4231          /* at this point, since used_components[i] mask is only
4232           * considering varyings (ie. not sysvals) we know this
4233           * is a varying:
4234           */
4235          so->inputs[i].bary = true;
4236       }
4237 
4238       if (so->inputs[i].bary) {
4239          so->varying_in++;
4240          so->inputs[i].compmask = (1 << maxcomp) - 1;
4241          inloc += maxcomp;
4242       }
4243    }
4244 
4245    /*
4246     * Third Step: reassign packed inloc's:
4247     */
4248 
4249    foreach_block (block, &ctx->ir->block_list) {
4250       foreach_instr (instr, &block->instr_list) {
4251          if (is_input(instr)) {
4252             unsigned inloc = instr->srcs[0]->iim_val;
4253             unsigned i = inloc / 4;
4254             unsigned j = inloc % 4;
4255 
4256             instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
4257             if (instr->opc == OPC_FLAT_B)
4258                instr->srcs[1]->iim_val = instr->srcs[0]->iim_val;
4259          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4260             unsigned i = instr->prefetch.input_offset / 4;
4261             unsigned j = instr->prefetch.input_offset % 4;
4262             instr->prefetch.input_offset = so->inputs[i].inloc + j;
4263          }
4264       }
4265    }
4266 }
4267 
4268 static void
4269 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4270 {
4271    struct ir3_shader_variant *so = ctx->so;
4272    nir_io_semantics io = nir_intrinsic_io_semantics(intr);
4273 
4274    compile_assert(ctx, nir_src_is_const(intr->src[1]));
4275 
4276    unsigned offset = nir_src_as_uint(intr->src[1]);
4277    unsigned n = nir_intrinsic_base(intr) + offset;
4278    unsigned frac = nir_intrinsic_component(intr);
4279    unsigned ncomp = nir_intrinsic_src_components(intr, 0);
4280 
4281    /* For per-view variables, each user-facing slot corresponds to multiple
4282     * views, each with a corresponding driver_location, and the offset is for
4283     * the driver_location. To properly figure out of the slot, we'd need to
4284     * plumb through the number of views. However, for now we only use
4285     * per-view with gl_Position, so we assume that the variable is not an
4286     * array or matrix (so there are no indirect accesses to the variable
4287     * itself) and the indirect offset corresponds to the view.
4288     */
4289    unsigned slot = io.location + (io.per_view ? 0 : offset);
4290 
4291    if (io.per_view && offset > 0)
4292       so->multi_pos_output = true;
4293 
4294    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4295       switch (slot) {
4296       case FRAG_RESULT_DEPTH:
4297          so->writes_pos = true;
4298          break;
4299       case FRAG_RESULT_COLOR:
4300          if (!ctx->s->info.fs.color_is_dual_source) {
4301             so->color0_mrt = 1;
4302          } else {
4303             slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
4304             if (io.dual_source_blend_index > 0)
4305                so->dual_src_blend = true;
4306          }
4307          break;
4308       case FRAG_RESULT_SAMPLE_MASK:
4309          so->writes_smask = true;
4310          break;
4311       case FRAG_RESULT_STENCIL:
4312          so->writes_stencilref = true;
4313          break;
4314       default:
4315          slot += io.dual_source_blend_index; /* For dual-src blend */
4316          if (io.dual_source_blend_index > 0)
4317             so->dual_src_blend = true;
4318          if (slot >= FRAG_RESULT_DATA0)
4319             break;
4320          ir3_context_error(ctx, "unknown FS output name: %s\n",
4321                            gl_frag_result_name(slot));
4322       }
4323    } else if (ctx->so->type == MESA_SHADER_VERTEX ||
4324               ctx->so->type == MESA_SHADER_TESS_EVAL ||
4325               ctx->so->type == MESA_SHADER_GEOMETRY) {
4326       switch (slot) {
4327       case VARYING_SLOT_POS:
4328          so->writes_pos = true;
4329          break;
4330       case VARYING_SLOT_PSIZ:
4331          so->writes_psize = true;
4332          break;
4333       case VARYING_SLOT_VIEWPORT:
4334          so->writes_viewport = true;
4335          break;
4336       case VARYING_SLOT_PRIMITIVE_ID:
4337       case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
4338          assert(ctx->so->type == MESA_SHADER_GEOMETRY);
4339          FALLTHROUGH;
4340       case VARYING_SLOT_COL0:
4341       case VARYING_SLOT_COL1:
4342       case VARYING_SLOT_BFC0:
4343       case VARYING_SLOT_BFC1:
4344       case VARYING_SLOT_FOGC:
4345       case VARYING_SLOT_CLIP_DIST0:
4346       case VARYING_SLOT_CLIP_DIST1:
4347       case VARYING_SLOT_CLIP_VERTEX:
4348       case VARYING_SLOT_LAYER:
4349          break;
4350       default:
4351          if (slot >= VARYING_SLOT_VAR0)
4352             break;
4353          if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
4354             break;
4355          ir3_context_error(ctx, "unknown %s shader output name: %s\n",
4356                            _mesa_shader_stage_to_string(ctx->so->type),
4357                            gl_varying_slot_name_for_stage(slot, ctx->so->type));
4358       }
4359    } else {
4360       ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
4361    }
4362 
4363    so->outputs_count = MAX2(so->outputs_count, n + 1);
4364    compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs));
4365 
4366    so->outputs[n].slot = slot;
4367    if (io.per_view)
4368       so->outputs[n].view = offset;
4369 
4370    for (int i = 0; i < ncomp; i++) {
4371       unsigned idx = (n * 4) + i + frac;
4372       compile_assert(ctx, idx < ctx->noutputs);
4373       ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4374    }
4375 
4376    /* if varying packing doesn't happen, we could end up in a situation
4377     * with "holes" in the output, and since the per-generation code that
4378     * sets up varying linkage registers doesn't expect to have more than
4379     * one varying per vec4 slot, pad the holes.
4380     *
4381     * Note that this should probably generate a performance warning of
4382     * some sort.
4383     */
4384    for (int i = 0; i < frac; i++) {
4385       unsigned idx = (n * 4) + i;
4386       if (!ctx->outputs[idx]) {
4387          ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4388       }
4389    }
4390 
4391    struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
4392    for (int i = 0; i < ncomp; i++) {
4393       unsigned idx = (n * 4) + i + frac;
4394       ctx->outputs[idx] = src[i];
4395    }
4396 }
4397 
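/* Whether this stage consumes its inputs via load_input intrinsics (only VS
 * vertex attributes and FS varyings do); other stages receive their inputs
 * through sysvals or memory loads.
 */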
4398 static bool
4399 uses_load_input(struct ir3_shader_variant *so)
4400 {
4401    return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
4402 }
4403 
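/* Whether this stage writes its outputs via store_output, i.e. it is the last
 * geometry stage feeding the rasterizer, or the FS itself; earlier stages have
 * store_output lowered to memory writes instead.
 */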
4404 static bool
4405 uses_store_output(struct ir3_shader_variant *so)
4406 {
4407    switch (so->type) {
4408    case MESA_SHADER_VERTEX:
4409       return !so->key.has_gs && !so->key.tessellation;
4410    case MESA_SHADER_TESS_EVAL:
4411       return !so->key.has_gs;
4412    case MESA_SHADER_GEOMETRY:
4413    case MESA_SHADER_FRAGMENT:
4414       return true;
4415    case MESA_SHADER_TESS_CTRL:
4416    case MESA_SHADER_COMPUTE:
4417    case MESA_SHADER_KERNEL:
4418       return false;
4419    default:
4420       unreachable("unknown stage");
4421    }
4422 }
4423 
4424 static void
4425 emit_instructions(struct ir3_context *ctx)
4426 {
4427    MESA_TRACE_FUNC();
4428 
4429    nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
4430 
4431    /* some varying setup which can't be done in setup_input(): */
4432    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4433       nir_foreach_shader_in_variable (var, ctx->s) {
4434          /* set rasterflat flag for front/back color */
4435          if (var->data.interpolation == INTERP_MODE_NONE) {
4436             switch (var->data.location) {
4437             case VARYING_SLOT_COL0:
4438             case VARYING_SLOT_COL1:
4439             case VARYING_SLOT_BFC0:
4440             case VARYING_SLOT_BFC1:
4441                ctx->so->inputs[var->data.driver_location].rasterflat = true;
4442                break;
4443             default:
4444                break;
4445             }
4446          }
4447       }
4448    }
4449 
4450    if (uses_load_input(ctx->so)) {
4451       ctx->so->inputs_count = ctx->s->num_inputs;
4452       compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
4453       ctx->ninputs = ctx->s->num_inputs * 4;
4454       ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
4455    } else {
4456       ctx->ninputs = 0;
4457       ctx->so->inputs_count = 0;
4458    }
4459 
4460    if (uses_store_output(ctx->so)) {
4461       ctx->noutputs = ctx->s->num_outputs * 4;
4462       ctx->outputs =
4463          rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
4464    } else {
4465       ctx->noutputs = 0;
4466    }
4467 
4468    ctx->ir = ir3_create(ctx->compiler, ctx->so);
4469 
4470    /* Create inputs in first block: */
4471    ctx->block = get_block(ctx, nir_start_block(fxn));
4472    ctx->in_block = ctx->block;
4473 
4474    /* for fragment shader, the vcoord input register is used as the
4475     * base for bary.f varying fetch instrs:
4476     *
4477     * TODO defer creating ctx->ij_pixel and corresponding sysvals
4478     * until emit_intrinsic when we know they are actually needed.
4479     * For now, we defer creating ctx->ij_centroid, etc, since we
4480     * only need ij_pixel for "old style" varying inputs (ie.
4481     * tgsi_to_nir)
4482     */
4483    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4484       ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
4485    }
4486 
4487    /* Defer add_sysval_input() stuff until after setup_inputs(),
4488     * because sysvals need to be appended after varyings:
4489     */
4490    if (ctx->ij[IJ_PERSP_PIXEL]) {
4491       add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
4492                                 ctx->ij[IJ_PERSP_PIXEL]);
4493    }
4494 
4495    /* Tessellation shaders always need primitive ID for indexing the
4496     * BO. Geometry shaders don't always need it, but when they do it has to
4497     * be delivered and unclobbered in the VS. To make things easy, we always
4498     * make room for it in VS/DS.
4499     */
4500    bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
4501    bool has_gs = ctx->so->key.has_gs;
4502    switch (ctx->so->type) {
4503    case MESA_SHADER_VERTEX:
4504       if (has_tess) {
4505          ctx->tcs_header =
4506             create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
4507          ctx->rel_patch_id =
4508             create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4509          ctx->primitive_id =
4510             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4511       } else if (has_gs) {
4512          ctx->gs_header =
4513             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4514          ctx->primitive_id =
4515             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4516       }
4517       break;
4518    case MESA_SHADER_TESS_CTRL:
4519       ctx->tcs_header =
4520          create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
4521       ctx->rel_patch_id =
4522          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4523       break;
4524    case MESA_SHADER_TESS_EVAL:
4525       if (has_gs) {
4526          ctx->gs_header =
4527             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4528          ctx->primitive_id =
4529             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4530       }
4531       ctx->rel_patch_id =
4532          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4533       break;
4534    case MESA_SHADER_GEOMETRY:
4535       ctx->gs_header =
4536          create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4537       break;
4538    default:
4539       break;
4540    }
4541 
4542    /* Find # of samplers. Just assume that we'll be reading from images... if
4543     * an image is write-only we don't have to count it, but after lowering
4544     * derefs it is too late to compact indices for that.
4545     */
4546    ctx->so->num_samp =
4547       BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
4548 
4549    /* Save off clip+cull information. Note that in OpenGL clip planes may
4550     * be individually enabled/disabled, and some gens handle lowering in
4551     * the backend, so we also need to consider the shader key:
4552     */
4553    ctx->so->clip_mask = ctx->so->key.ucp_enables |
4554                         MASK(ctx->s->info.clip_distance_array_size);
4555    ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
4556                         << ctx->s->info.clip_distance_array_size;
4557 
4558    ctx->so->pvtmem_size = ctx->s->scratch_size;
4559    ctx->so->shared_size = ctx->s->info.shared_size;
4560 
4561    /* NOTE: need to do something more clever when we support >1 fxn */
4562    nir_foreach_reg_decl (decl, fxn) {
4563       ir3_declare_array(ctx, decl);
4564    }
4565 
4566    if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
4567        ctx->compiler->tess_use_shared) {
4568       struct ir3_instruction *barrier = ir3_BAR(ctx->block);
4569       barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
4570       barrier->barrier_class = IR3_BARRIER_EVERYTHING;
4571       array_insert(ctx->block, ctx->block->keeps, barrier);
4572       ctx->so->has_barrier = true;
4573    }
4574 
4575    /* And emit the body: */
4576    ctx->impl = fxn;
4577    emit_function(ctx, fxn);
4578 }
4579 
4580 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
4581  * need to assign the tex state indexes for these after we know the
4582  * max tex index.
4583  */
4584 static void
4585 fixup_astc_srgb(struct ir3_context *ctx)
4586 {
4587    struct ir3_shader_variant *so = ctx->so;
4588    /* indexed by original tex idx, value is newly assigned alpha sampler
4589     * state tex idx.  Zero is invalid since there is at least one sampler
4590     * if we get here.
4591     */
4592    unsigned alt_tex_state[16] = {0};
4593    unsigned tex_idx = ctx->max_texture_index + 1;
4594    unsigned idx = 0;
4595 
4596    so->astc_srgb.base = tex_idx;
4597 
4598    for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
4599       struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
4600 
4601       compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
4602 
4603       if (alt_tex_state[sam->cat5.tex] == 0) {
4604          /* assign new alternate/alpha tex state slot: */
4605          alt_tex_state[sam->cat5.tex] = tex_idx++;
4606          so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
4607          so->astc_srgb.count++;
4608       }
4609 
4610       sam->cat5.tex = alt_tex_state[sam->cat5.tex];
4611    }
4612 }
4613 
4614 /* Fixup tex sampler state for tg4 workaround instructions.  We
4615  * need to assign the tex state indexes for these after we know the
4616  * max tex index.
4617  */
4618 static void
4619 fixup_tg4(struct ir3_context *ctx)
4620 {
4621    struct ir3_shader_variant *so = ctx->so;
4622    /* indexed by original tex idx, value is newly assigned alpha sampler
4623     * state tex idx.  Zero is invalid since there is at least one sampler
4624     * if we get here.
4625     */
4626    unsigned alt_tex_state[16] = {0};
4627    unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1;
4628    unsigned idx = 0;
4629 
4630    so->tg4.base = tex_idx;
4631 
4632    for (unsigned i = 0; i < ctx->ir->tg4_count; i++) {
4633       struct ir3_instruction *sam = ctx->ir->tg4[i];
4634 
4635       compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
4636 
4637       if (alt_tex_state[sam->cat5.tex] == 0) {
4638          /* assign new alternate/alpha tex state slot: */
4639          alt_tex_state[sam->cat5.tex] = tex_idx++;
4640          so->tg4.orig_idx[idx++] = sam->cat5.tex;
4641          so->tg4.count++;
4642       }
4643 
4644       sam->cat5.tex = alt_tex_state[sam->cat5.tex];
4645    }
4646 }
4647 
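/* Only position/point-size/clip-dist/viewport outputs are consumed by the
 * binning pass; everything else can be dropped from the binning variant.
 */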
4648 static bool
4649 output_slot_used_for_binning(gl_varying_slot slot)
4650 {
4651    return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
4652           slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
4653           slot == VARYING_SLOT_VIEWPORT;
4654 }
4655 
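/* Find the shader's terminating end/chmask instruction (the last one emitted,
 * searching blocks in reverse).
 */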
4656 static struct ir3_instruction *
4657 find_end(struct ir3 *ir)
4658 {
4659    foreach_block_rev (block, &ir->block_list) {
4660       foreach_instr_rev (instr, &block->instr_list) {
4661          if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
4662             return instr;
4663       }
4664    }
4665    unreachable("couldn't find end instruction");
4666 }
4667 
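/* For the binning-pass variant, drop outputs that the binning pass does not
 * need, both from the end instruction's sources and from the variant's
 * outputs table.
 */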
4668 static void
4669 fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
4670 {
4671    struct ir3_shader_variant *so = ctx->so;
4672    unsigned i, j;
4673 
4674    /* first pass, remove unused outputs from the IR level outputs: */
4675    for (i = 0, j = 0; i < end->srcs_count; i++) {
4676       unsigned outidx = end->end.outidxs[i];
4677       unsigned slot = so->outputs[outidx].slot;
4678 
4679       if (output_slot_used_for_binning(slot)) {
4680          end->srcs[j] = end->srcs[i];
4681          end->end.outidxs[j] = end->end.outidxs[i];
4682          j++;
4683       }
4684    }
4685    end->srcs_count = j;
4686 
4687    /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
4688     * table:
4689     */
4690    for (i = 0, j = 0; i < so->outputs_count; i++) {
4691       unsigned slot = so->outputs[i].slot;
4692 
4693       if (output_slot_used_for_binning(slot)) {
4694          so->outputs[j] = so->outputs[i];
4695 
4696          /* fixup outidx to point to new output table entry: */
4697          for (unsigned k = 0; k < end->srcs_count; k++) {
4698             if (end->end.outidxs[k] == i) {
4699                end->end.outidxs[k] = j;
4700                break;
4701             }
4702          }
4703 
4704          j++;
4705       }
4706    }
4707    so->outputs_count = j;
4708 }
4709 
4710 static void
4711 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
4712 {
4713    unsigned idx = 0;
4714 
4715    /* Collect sampling instructions eligible for pre-dispatch. */
4716    foreach_block (block, &ir->block_list) {
4717       foreach_instr_safe (instr, &block->instr_list) {
4718          if (instr->opc == OPC_META_TEX_PREFETCH) {
4719             assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
4720             struct ir3_sampler_prefetch *fetch =
4721                &ctx->so->sampler_prefetch[idx];
4722             idx++;
4723 
4724             fetch->bindless = instr->flags & IR3_INSTR_B;
4725             if (fetch->bindless) {
4726                /* In bindless mode, the index is actually the base */
4727                fetch->tex_id = instr->prefetch.tex_base;
4728                fetch->samp_id = instr->prefetch.samp_base;
4729                fetch->tex_bindless_id = instr->prefetch.tex;
4730                fetch->samp_bindless_id = instr->prefetch.samp;
4731             } else {
4732                fetch->tex_id = instr->prefetch.tex;
4733                fetch->samp_id = instr->prefetch.samp;
4734             }
4735             fetch->tex_opc = OPC_SAM;
4736             fetch->wrmask = instr->dsts[0]->wrmask;
4737             fetch->dst = instr->dsts[0]->num;
4738             fetch->src = instr->prefetch.input_offset;
4739 
4740             /* These are the limits on a5xx/a6xx, we might need to
4741              * revisit if SP_FS_PREFETCH[n] changes on later gens:
4742              */
4743             assert(fetch->dst <= 0x3f);
4744             assert(fetch->tex_id <= 0x1f);
4745             assert(fetch->samp_id <= 0xf);
4746 
4747             ctx->so->total_in =
4748                MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
4749 
4750             fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
4751 
4752             /* Remove the prefetch placeholder instruction: */
4753             list_delinit(&instr->node);
4754          }
4755       }
4756    }
4757 }
4758 
4759 int
4760 ir3_compile_shader_nir(struct ir3_compiler *compiler,
4761                        struct ir3_shader *shader,
4762                        struct ir3_shader_variant *so)
4763 {
4764    struct ir3_context *ctx;
4765    struct ir3 *ir;
4766    int ret = 0, max_bary;
4767    bool progress;
4768 
4769    MESA_TRACE_FUNC();
4770 
4771    assert(!so->ir);
4772 
4773    ctx = ir3_context_init(compiler, shader, so);
4774    if (!ctx) {
4775       DBG("INIT failed!");
4776       ret = -1;
4777       goto out;
4778    }
4779 
4780    emit_instructions(ctx);
4781 
4782    if (ctx->error) {
4783       DBG("EMIT failed!");
4784       ret = -1;
4785       goto out;
4786    }
4787 
4788    ir = so->ir = ctx->ir;
4789 
4790    if (gl_shader_stage_is_compute(so->type)) {
4791       so->local_size[0] = ctx->s->info.workgroup_size[0];
4792       so->local_size[1] = ctx->s->info.workgroup_size[1];
4793       so->local_size[2] = ctx->s->info.workgroup_size[2];
4794       so->local_size_variable = ctx->s->info.workgroup_size_variable;
4795    }
4796 
4797    /* Vertex shaders in a tessellation or geometry pipeline treat END as a
4798     * NOP and have an epilogue that writes the VS outputs to local storage, to
4799     * be read by the HS.  Then it resets execution mask (chmask) and chains
4800     * to the next shader (chsh). There are also a few output values which we
4801     * must send to the next stage via registers, and in order for both stages
4802     * to agree on the register used we must force these to be in specific
4803     * registers.
4804     */
4805    if ((so->type == MESA_SHADER_VERTEX &&
4806         (so->key.has_gs || so->key.tessellation)) ||
4807        (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
4808       struct ir3_instruction *outputs[3];
4809       unsigned outidxs[3];
4810       unsigned regids[3];
4811       unsigned outputs_count = 0;
4812 
4813       if (ctx->primitive_id) {
4814          unsigned n = so->outputs_count++;
4815          so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
4816 
4817          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id);
4818          outputs[outputs_count] = out;
4819          outidxs[outputs_count] = n;
4820          if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
4821             regids[outputs_count] = regid(0, 2);
4822          else
4823             regids[outputs_count] = regid(0, 1);
4824          outputs_count++;
4825       }
4826 
4827       if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
4828          unsigned n = so->outputs_count++;
4829          so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
4830          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id);
4831          outputs[outputs_count] = out;
4832          outidxs[outputs_count] = n;
4833          regids[outputs_count] = regid(0, 1);
4834          outputs_count++;
4835       }
4836 
4837       if (ctx->gs_header) {
4838          unsigned n = so->outputs_count++;
4839          so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
4840          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header);
4841          outputs[outputs_count] = out;
4842          outidxs[outputs_count] = n;
4843          regids[outputs_count] = regid(0, 0);
4844          outputs_count++;
4845       }
4846 
4847       if (ctx->tcs_header) {
4848          unsigned n = so->outputs_count++;
4849          so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
4850          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header);
4851          outputs[outputs_count] = out;
4852          outidxs[outputs_count] = n;
4853          regids[outputs_count] = regid(0, 0);
4854          outputs_count++;
4855       }
4856 
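      /* chmask consumes the values that must survive into the next stage;
       * each source is pinned to the register agreed upon above (regids[]).
       */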
4857       struct ir3_instruction *chmask =
4858          ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
4859       chmask->barrier_class = IR3_BARRIER_EVERYTHING;
4860       chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
4861 
4862       for (unsigned i = 0; i < outputs_count; i++)
4863          __ssa_src(chmask, outputs[i], 0)->num = regids[i];
4864 
4865       chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
4866       memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4867 
4868       array_insert(ctx->block, ctx->block->keeps, chmask);
4869 
4870       struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
4871       chsh->barrier_class = IR3_BARRIER_EVERYTHING;
4872       chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
4873    } else {
4874       assert((ctx->noutputs % 4) == 0);
4875       unsigned outidxs[ctx->noutputs / 4];
4876       struct ir3_instruction *outputs[ctx->noutputs / 4];
4877       unsigned outputs_count = 0;
4878 
4879       struct ir3_block *b = ctx->block;
4880       /* Insert these collects in the block before the end-block if
4881        * possible, so that any moves they generate can be shuffled around to
4882        * reduce nops:
4883        */
4884       if (ctx->block->predecessors_count == 1)
4885          b = ctx->block->predecessors[0];
4886 
4887       /* Setup IR level outputs, which are "collects" that gather
4888        * the scalar components of outputs.
4889        */
4890       for (unsigned i = 0; i < ctx->noutputs; i += 4) {
4891          unsigned ncomp = 0;
4892          /* figure out the # of components written:
4893           *
4894           * TODO do we need to handle holes, ie. if .x and .z
4895           * components written, but .y component not written?
4896           */
4897          for (unsigned j = 0; j < 4; j++) {
4898             if (!ctx->outputs[i + j])
4899                break;
4900             ncomp++;
4901          }
4902 
4903          /* Note that in some stages, like TCS, store_output is
4904           * lowered to memory writes, so no components of the output
4905           * are "written" from the PoV of traditional store-
4906           * output instructions:
4907           */
4908          if (!ncomp)
4909             continue;
4910 
4911          struct ir3_instruction *out =
4912             ir3_create_collect(b, &ctx->outputs[i], ncomp);
4913 
4914          int outidx = i / 4;
4915          assert(outidx < so->outputs_count);
4916 
4917          outidxs[outputs_count] = outidx;
4918          outputs[outputs_count] = out;
4919          outputs_count++;
4920       }
4921 
4922       /* for a6xx+, binning and draw pass VS use same VBO state, so we
4923        * need to make sure not to remove any inputs that are used by
4924        * the nonbinning VS.
4925        */
4926       if (ctx->compiler->gen >= 6 && so->binning_pass &&
4927           so->type == MESA_SHADER_VERTEX) {
4928          for (int i = 0; i < ctx->ninputs; i++) {
4929             struct ir3_instruction *in = ctx->inputs[i];
4930 
4931             if (!in)
4932                continue;
4933 
4934             unsigned n = i / 4;
4935             unsigned c = i % 4;
4936 
4937             assert(n < so->nonbinning->inputs_count);
4938 
4939             if (so->nonbinning->inputs[n].sysval)
4940                continue;
4941 
4942             /* be sure to keep inputs, even if only used in VS */
4943             if (so->nonbinning->inputs[n].compmask & (1 << c))
4944                array_insert(in->block, in->block->keeps, in);
4945          }
4946       }
4947 
4948       struct ir3_instruction *end =
4949          ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
4950 
4951       for (unsigned i = 0; i < outputs_count; i++) {
4952          __ssa_src(end, outputs[i], 0);
4953       }
4954 
4955       end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
4956       memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4957 
4958       array_insert(ctx->block, ctx->block->keeps, end);
4959 
4960       /* at this point, for binning pass, throw away unneeded outputs: */
4961       if (so->binning_pass && (ctx->compiler->gen < 6))
4962          fixup_binning_pass(ctx, end);
4963    }
4964 
4965    if (so->type == MESA_SHADER_FRAGMENT &&
4966        ctx->s->info.fs.needs_quad_helper_invocations) {
4967       so->need_pixlod = true;
4968       so->need_full_quad = true;
4969    }
4970 
4971    ir3_debug_print(ir, "AFTER: nir->ir3");
4972    ir3_validate(ir);
4973 
4974    IR3_PASS(ir, ir3_remove_unreachable);
4975 
4976    IR3_PASS(ir, ir3_array_to_ssa);
4977 
4978    ir3_calc_reconvergence(so);
4979 
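   /* Run the ir3 cleanup passes (folding, copy propagation, CSE, DCE) to a
    * fixed point:
    */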
4980    do {
4981       progress = false;
4982 
4983       /* the folding doesn't seem to work reliably on a4xx */
4984       if (ctx->compiler->gen != 4)
4985          progress |= IR3_PASS(ir, ir3_cf);
4986       progress |= IR3_PASS(ir, ir3_cp, so);
4987       progress |= IR3_PASS(ir, ir3_cse);
4988       progress |= IR3_PASS(ir, ir3_dce, so);
4989    } while (progress);
4990 
4991    /* at this point, for binning pass, throw away unneeded outputs:
4992     * Note that for a6xx and later, we do this after ir3_cp to ensure
4993     * that the uniform/constant layout for BS and VS matches, so that
4994     * we can re-use same VS_CONST state group.
4995     */
4996    if (so->binning_pass && (ctx->compiler->gen >= 6)) {
4997       fixup_binning_pass(ctx, find_end(ctx->so->ir));
4998       /* cleanup the result of removing unneeded outputs: */
4999       while (IR3_PASS(ir, ir3_dce, so)) {
5000       }
5001    }
5002 
5003    IR3_PASS(ir, ir3_sched_add_deps);
5004 
5005    /* At this point, all the dead code should be long gone: */
5006    assert(!IR3_PASS(ir, ir3_dce, so));
5007 
5008    ret = ir3_sched(ir);
5009    if (ret) {
5010       DBG("SCHED failed!");
5011       goto out;
5012    }
5013 
5014    ir3_debug_print(ir, "AFTER: ir3_sched");
5015 
5016    /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
5017     * with draw pass VS, so binning and draw pass can both use the
5018     * same VBO state.
5019     *
5020     * Note that VS inputs are expected to be full precision.
5021     */
5022    bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
5023                             (ir->type == MESA_SHADER_VERTEX) &&
5024                             so->binning_pass;
5025 
5026    if (pre_assign_inputs) {
5027       foreach_input (in, ir) {
5028          assert(in->opc == OPC_META_INPUT);
5029          unsigned inidx = in->input.inidx;
5030 
5031          in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
5032       }
5033    } else if (ctx->tcs_header) {
5034       /* We need to have these values in the same registers between VS and TCS
5035        * since the VS chains to TCS and doesn't get the sysvals redelivered.
5036        */
5037 
5038       ctx->tcs_header->dsts[0]->num = regid(0, 0);
5039       ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
5040       if (ctx->primitive_id)
5041          ctx->primitive_id->dsts[0]->num = regid(0, 2);
5042    } else if (ctx->gs_header) {
5043       /* We need to have these values in the same registers between producer
5044        * (VS or DS) and GS since the producer chains to GS and doesn't get
5045        * the sysvals redelivered.
5046        */
5047 
5048       ctx->gs_header->dsts[0]->num = regid(0, 0);
5049       if (ctx->primitive_id)
5050          ctx->primitive_id->dsts[0]->num = regid(0, 1);
5051    } else if (so->num_sampler_prefetch) {
5052       assert(so->type == MESA_SHADER_FRAGMENT);
5053       int idx = 0;
5054 
5055       foreach_input (instr, ir) {
5056          if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
5057             continue;
5058 
5059          assert(idx < 2);
5060          instr->dsts[0]->num = idx;
5061          idx++;
5062       }
5063    }
5064 
5065    ret = ir3_ra(so);
5066 
5067    if (ret) {
5068       mesa_loge("ir3_ra() failed!");
5069       goto out;
5070    }
5071 
5072    IR3_PASS(ir, ir3_postsched, so);
5073 
5074    IR3_PASS(ir, ir3_legalize_relative);
5075    IR3_PASS(ir, ir3_lower_subgroups);
5076 
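   /* For FS, assign the packed inloc offsets used by the bary.f varying
    * fetches (see the legalize note below):
    */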
5077    if (so->type == MESA_SHADER_FRAGMENT)
5078       pack_inlocs(ctx);
5079 
5080    /*
5081     * Fixup inputs/outputs to point to the actual registers assigned:
5082     *
5083     * 1) initialize to r63.x (invalid/unused)
5084     * 2) iterate IR level inputs/outputs and update the variants
5085     *    inputs/outputs table based on the assigned registers for
5086     *    the remaining inputs/outputs.
5087     */
5088 
5089    for (unsigned i = 0; i < so->inputs_count; i++)
5090       so->inputs[i].regid = INVALID_REG;
5091    for (unsigned i = 0; i < so->outputs_count; i++)
5092       so->outputs[i].regid = INVALID_REG;
5093 
5094    struct ir3_instruction *end = find_end(so->ir);
5095 
5096    for (unsigned i = 0; i < end->srcs_count; i++) {
5097       unsigned outidx = end->end.outidxs[i];
5098       struct ir3_register *reg = end->srcs[i];
5099 
5100       so->outputs[outidx].regid = reg->num;
5101       so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
5102    }
5103 
5104    foreach_input (in, ir) {
5105       assert(in->opc == OPC_META_INPUT);
5106       unsigned inidx = in->input.inidx;
5107 
5108       if (pre_assign_inputs && !so->inputs[inidx].sysval) {
5109          if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
5110             compile_assert(
5111                ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
5112             compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
5113                                    so->nonbinning->inputs[inidx].half);
5114          }
5115          so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
5116          so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
5117       } else {
5118          so->inputs[inidx].regid = in->dsts[0]->num;
5119          so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
5120       }
5121    }
5122 
5123    if (ctx->astc_srgb)
5124       fixup_astc_srgb(ctx);
5125 
5126    if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather)
5127       fixup_tg4(ctx);
5128 
5129    /* We need to do legalize after (for frag shader's) the "bary.f"
5130     * offsets (inloc) have been assigned.
5131     */
5132    IR3_PASS(ir, ir3_legalize, so, &max_bary);
5133 
5134    /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
5135     * know what we might have to wait on when coming in from VS chsh.
5136     */
5137    if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
5138       foreach_block (block, &ir->block_list) {
5139          foreach_instr (instr, &block->instr_list) {
5140             instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
5141             break;
5142          }
5143       }
5144    }
5145 
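   /* For a7xx+ compute shaders, bracket the end of the shader with a
    * lock/unlock pair:
    */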
5146    if (ctx->compiler->gen >= 7 && so->type == MESA_SHADER_COMPUTE) {
5147       struct ir3_instruction *end = find_end(so->ir);
5148       struct ir3_instruction *lock =
5149          ir3_instr_create(ctx->block, OPC_LOCK, 0, 0);
5150       /* TODO: These flags should be set by the scheduler only when needed */
5151       lock->flags = IR3_INSTR_SS | IR3_INSTR_SY | IR3_INSTR_JP;
5152       ir3_instr_move_before(lock, end);
5153       struct ir3_instruction *unlock =
5154          ir3_instr_create(ctx->block, OPC_UNLOCK, 0, 0);
5155       ir3_instr_move_before(unlock, end);
5156    }
5157 
5158    so->pvtmem_size = ALIGN(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
5159 
5160    /* Note that max_bary counts inputs that are not bary.f'd for FS: */
5161    if (so->type == MESA_SHADER_FRAGMENT)
5162       so->total_in = max_bary + 1;
5163 
5164    /* Collect sampling instructions eligible for pre-dispatch. */
5165    collect_tex_prefetches(ctx, ir);
5166 
5167    if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5168        !ctx->s->info.fs.early_fragment_tests)
5169       ctx->so->no_earlyz |= ctx->s->info.writes_memory;
5170 
5171    if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5172        ctx->s->info.fs.post_depth_coverage)
5173       so->post_depth_coverage = true;
5174 
5175    ctx->so->per_samp = ctx->s->info.fs.uses_sample_shading;
5176 
5177 out:
5178    if (ret) {
5179       if (so->ir)
5180          ir3_destroy(so->ir);
5181       so->ir = NULL;
5182    }
5183    ir3_context_free(ctx);
5184 
5185    return ret;
5186 }
5187