/*
 * Copyright © 2018 Jonathan Marek <jonathan@marek.ca>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"
#include "nir_legacy.h"

static const nir_shader_compiler_options options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .no_integers = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
   .force_indirect_unrolling_sampler = true,
   .max_unroll_iterations = 32,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

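/* Run the generic NIR cleanup/optimization passes repeatedly until none of
 * them reports any further progress.
 */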
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_loop)) {
         progress |= true;
         /* If nir_opt_loop makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* the trig workarounds are the same as ir3's, but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
      .lower_invalid_implicit_lod = true,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

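/* Get an ir2_src for a float constant by packing it into the shader's
 * immediate vector constants. Components are merged into an existing
 * immediate when the values already match (or still fit in its unused
 * components); otherwise a new immediate is allocated. The returned swizzle
 * selects the requested components.
 */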
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned idx, i, j;
   unsigned imm_ncomp = 0;
   unsigned swiz = 0;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

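/* Update when/where a register can be freed based on loop nesting: a value
 * created outside a loop but accessed inside it has to stay live until the
 * loop's last block, since the loop body may run again.
 */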
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for SSA
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

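/* Translate a nir_legacy_src into an ir2_src: constant values become
 * immediates, SSA defs are looked up through ssa_map, and NIR registers
 * reference ctx->reg directly. Also updates the source's live range.
 */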
static struct ir2_src
make_legacy_src(struct ir2_context *ctx, nir_legacy_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   /* Handle constants specially */
   if (src.is_ssa) {
      nir_const_value *const_value =
         nir_src_as_const_value(nir_src_for_ssa(src.ssa));

      if (const_value) {
         float c[src.ssa->num_components];
         nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
         return load_const(ctx, c, src.ssa->num_components);
      }
   }

   /* Otherwise translate the SSA def or register */
   if (!src.is_ssa) {
      res.num = src.reg.handle->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   return make_legacy_src(ctx, nir_legacy_chase_src(&src));
}

static void
set_legacy_index(struct ir2_context *ctx, nir_legacy_dest dst,
                 struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst.is_ssa) {
      ctx->ssa_map[dst.ssa->index] = instr->idx;
   } else {
      reg = &ctx->reg[dst.reg.handle->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static void
set_index(struct ir2_context *ctx, nir_def *def, struct ir2_instr *instr)
{
   set_legacy_index(ctx, nir_legacy_chase_dest(def), instr);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

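/* Create an ALU instruction for a NIR opcode, using the table below to pick
 * the a2xx scalar/vector opcodes (-1 means no direct mapping; emit_alu fixes
 * up the remaining cases).
 */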
static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
      /* no fsat, fneg, fabs since source mods deal with those */

/* so we can use this function with a non-NIR op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

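/* Create an ALU instruction that writes (part of) a non-SSA register; pass
 * share_reg to reuse the register of a previous instruction. Used when
 * several instructions build up a single value (e.g. fragcoord, cube coords).
 */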
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_def *def)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, def->num_components);
   set_index(ctx, def, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_def *def,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = def->num_components;
   set_index(ctx, def, instr);
   return instr;
}

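/* Like make_src(), but emits a mov first when the source is a constant, so
 * the consumer never sees a constant-file source (used for fetch instruction
 * sources, which apparently have to come from a register).
 */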
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

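/* Emit a NIR ALU instruction, applying the source modifiers/saturate that
 * nir_legacy folded in, and fixing up the ops that don't map 1:1 to a2xx
 * opcodes (fsub, slt, fcsel, fdot2, fsign, ...).
 */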
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_def *def = &alu->def;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* Don't emit modifiers that are totally folded */
   if (((alu->op == nir_op_fneg) || (alu->op == nir_op_fabs)) &&
       nir_legacy_float_mod_folds(alu))
      return;

   if ((alu->op == nir_op_fsat) && nir_legacy_fsat_folds(alu))
      return;

   /* get the number of dst components */
   ncomp = def->num_components;

   instr = instr_create_alu(ctx, alu->op, ncomp);

   nir_legacy_alu_dest legacy_dest =
      nir_legacy_chase_alu_dest(&alu->def);
   set_legacy_index(ctx, legacy_dest.dest, instr);
   instr->alu.saturate = legacy_dest.fsat;
   instr->alu.write_mask = legacy_dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(legacy_dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      nir_legacy_alu_src legacy_src =
         nir_legacy_chase_alu_src(src, true /* fuse_abs */);

      instr->src[i] = make_legacy_src(ctx, legacy_src.src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = legacy_src.fneg;
      instr->src[i].abs = legacy_src.fabs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

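/* Load a shader input. For vertex shaders this emits a VTX fetch whose
 * descriptor lives in the constant file (const_idx/const_idx_sel: three
 * descriptors per constant, starting at constant 20). For fragment shaders
 * the input register is used directly, with extra math for gl_FragCoord.
 */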
static void
load_input(struct ir2_context *ctx, nir_def *def, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, def, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, def);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, def);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

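/* Map a store_output intrinsic back to its varying/fragment-result slot by
 * looking up the shader variable with the matching driver_location.
 */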
static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

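/* Emit the export mov for an output. Vertex shaders export position to 62,
 * point size to 63, and varyings to the export index of the matching
 * fragment shader input; fragment shaders only export color.
 */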
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_decl_reg:
   case nir_intrinsic_load_reg:
   case nir_intrinsic_store_reg:
      /* Nothing to do for these */
      break;

   case nir_intrinsic_load_input:
      load_input(ctx, &intr->def, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->def);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_terminate_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->def);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->def);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->def, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

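/* Record a fragment shader input in ctx->f (vertex shader inputs are handled
 * when they are loaded). gl_FragCoord additionally needs the param register.
 */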
static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(glsl_type_is_vector_or_scalar(in->type) ||
          glsl_type_is_unsized_array(in->type));

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(ctx, nir_op_mov, &undef->def);
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_undef:
      emit_undef(ctx, nir_instr_as_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen; unneeded instrs are patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

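/* Emit a NIR block, returning true if an explicit CF jump to the successor
 * block was emitted.
 */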
static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* ideally we would always jump and let the backend clean up,
    * but we don't, so there are only two cases where a jump is needed:
    *  loops (successor index is lower than the current block's)
    *  jumps (a jump instruction was seen in the block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

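/* Emit an if/else using the predicate bit: PRED_SETNE sets the predicate
 * from the condition (PRED_SETNE_PUSH when nested), PRED_SET_INV flips it
 * for the else branch, and PRED_SET_POP restores the outer predicate.
 */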
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: the blob seems to always use the same register for the condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   assert(!nir_loop_has_continue_construct(nloop));
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

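/* Filter for nir_lower_alu_to_scalar: only the transcendental ops need to be
 * scalarized, since they only exist as scalar instructions on a2xx.
 */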
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

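/* Main entry point: run the final NIR lowering passes (int/bool to float,
 * scalarizing transcendentals, out-of-SSA for the legacy backend), then
 * translate the resulting NIR into ir2 instructions.
 */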
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float, true);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest, false);
   OPT_V(ctx->nir, nir_lower_vec_to_regs, NULL, NULL);

   OPT_V(ctx->nir, nir_legacy_trivialize, true);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_reg_decl (decl, fxn) {
      assert(decl->def.index < ARRAY_SIZE(ctx->reg));
      ctx->reg[decl->def.index].ncomp = nir_intrinsic_num_components(decl);
      ctx->reg_count = MAX2(ctx->reg_count, decl->def.index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}