/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

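/* run the standard NIR optimization passes in a loop until none of them
 * reports any more progress
 */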
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* the trig workarounds pass is the same as ir3's, but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

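/* get a CONST src for an immediate value: reuse components of an already
 * allocated immediate vec4 when possible, otherwise allocate a new one
 */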
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

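/* track register liveness across loop nesting: a register accessed inside a
 * loop can only be freed at that loop's last block
 */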
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

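/* translate a nir_src into an ir2_src: constants become immediates, register
 * sources reference the ir2 register, and SSA sources reference the
 * instruction that produced the value
 */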
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

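/* bind an instruction's result to a nir_dest: SSA dests are recorded in the
 * ssa_map, register dests make the instruction write the ir2 register instead
 */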
static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
   /* no fsat, fneg, fabs since source mods deal with those */

   /* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

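/* create an ALU instruction that writes to a register with a write mask,
 * either sharing the register of a previous instruction or allocating a new
 * one
 */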
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}

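/* like make_src, but constant values are first copied into an SSA temporary
 * with a mov, for srcs that cannot reference the constant file directly
 */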
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

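/* translate a nir_alu_instr: swizzles are compressed against the write mask,
 * and some NIR opcodes need the fixups below to map onto a2xx instructions
 */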
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

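/* find the gl_varying_slot for a store_output intrinsic by matching its base
 * against the driver_location of the shader's output variables
 */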
static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(
      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* ideally we would always jump and let the backend clean things up,
    * but we don't, so there are two cases where a jump is needed:
    *  loops (successor index is lower)
    *  jumps (a jump instruction was seen in the block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

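/* if/else is implemented with the predicate bit: PRED_SETNE sets it from the
 * condition, PRED_SET_INV flips it for the else list, and PRED_SET_POP
 * restores the outer predicate when ifs are nested
 */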
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use the same register for the condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

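/* filter for nir_lower_alu_to_scalar: only scalarize the transcendental ops,
 * which only exist as scalar instructions on a2xx
 */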
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

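/* compile entry point: clone the shader, run the final lowering passes, set
 * up the inputs, then emit ir2 instructions for the shader body
 */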
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}