/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
   .force_indirect_unrolling_sampler = true,
   .max_unroll_iterations = 32,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

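/* Pass-running helpers: OPT() runs a NIR pass and evaluates to true if the
 * pass reported progress (so callers can loop to a fixed point), while
 * OPT_V() runs a pass without caring about its return value.  Roughly:
 *
 *    if (OPT(s, nir_opt_trivial_continues)) { ... }   // progress-tracking
 *    OPT_V(s, nir_lower_vars_to_ssa);                 // fire-and-forget
 */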
#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* the trig workarounds are the same as ir3's.. but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
      .lower_invalid_implicit_lod = true,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

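/* Allocate (or reuse) a slot in the shader's immediate constants for an
 * ncomp-wide float vector and return it as a CONST source.  Components are
 * packed and shared where possible: e.g. if {0.0, 1.0} is already stored,
 * a later request for {1.0} can (roughly speaking) be satisfied with just a
 * swizzle into the existing immediate instead of allocating a new one.
 */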
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

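/* Note when a register is defined/used so its live range is known: roughly,
 * a value that is still referenced inside a loop deeper than where it was
 * first written must stay allocated until the last block of that loop.
 */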
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
   /* no fsat, fneg, fabs since source mods deal with those */

   /* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

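/* Like instr_create_alu(), but the result goes to a shared (non-SSA)
 * register, writing only the components in write_mask.  Passing a previous
 * instruction as share_reg makes both write the same register, e.g. a
 * write_mask of 0x3 followed by one of 0xc fills r.xy and r.zw respectively.
 */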
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
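      /* (e.g. write_mask 0x5 (.xz) with swizzle yzwx keeps only the y and w
       * entries, repacked into the first two swizzle slots) */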
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
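   /* (the CUBE op presumably produces per-face s/t coords plus the major
    * axis in z and the face id in w; dividing by |z| and biasing by 1.5
    * moves the face coords into the range the texture fetch expects) */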
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(
      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

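   /* window coords from the clip-space position:
    *   w'       = max(pos.w, 0)       (avoid dividing by a negative w)
    *   sc       = pos * (1 / w')
    *   wincoord = sc * c66 + c65      (c65/c66 presumably hold the viewport
    *                                   offset/scale)
    */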
   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

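/* Emit a NIR block.  Returns true when the block had to end with an explicit
 * CF jump to its successor (loop back-edges, or a block containing a jump
 * instruction); otherwise execution simply falls through to the next block.
 */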
static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* we want to be smart and always jump and have the backend cleanup
    * but we are not, so there are two cases where jump is needed:
    *  loops (succs index lower)
    *  jumps (jump instruction seen in block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

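/* if/else is implemented with predicated execution rather than real
 * branching: the condition sets the predicate (PRED_SETNEs, or
 * PRED_SETNE_PUSHv when already inside another if), the then-list is emitted
 * under that predicate, the predicate is inverted for the else-list, and
 * PRED_SET_POPs restores the enclosing predicate afterwards.
 */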
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

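   /* a2xx ALUs are float-only: integer and boolean ops are lowered to float
    * before leaving SSA, source modifiers (abs/neg/sat) are folded into
    * sources, and only the scalar-unit ops (rcp, sqrt, sin, ...) are
    * scalarized via the filter callback above.
    */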
   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}