1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28
29 #include "freedreno_util.h"
30 #include "fd2_program.h"
31
32 static const nir_shader_compiler_options options = {
33 .lower_fpow = true,
34 .lower_flrp32 = true,
35 .lower_fmod = true,
36 .lower_fdiv = true,
37 .lower_fceil = true,
38 .fuse_ffma16 = true,
39 .fuse_ffma32 = true,
40 .fuse_ffma64 = true,
41 /* .fdot_replicates = true, it is replicated, but it makes things worse */
42 .lower_all_io_to_temps = true,
43 .vertex_id_zero_based = true, /* its not implemented anyway */
44 .lower_bitops = true,
45 .lower_rotate = true,
46 .lower_vector_cmp = true,
47 .lower_fdph = true,
48 };
49
50 const nir_shader_compiler_options *
ir2_get_compiler_options(void)51 ir2_get_compiler_options(void)
52 {
53 return &options;
54 }
55
/* OPT(): run a NIR pass through NIR_PASS and evaluate to its progress flag. */
#define OPT(nir, pass, ...) ({                                \
	bool made_progress = false;                           \
	NIR_PASS(made_progress, nir, pass, ##__VA_ARGS__);    \
	made_progress;                                        \
})
/* OPT_V(): run a NIR pass through NIR_PASS_V; no progress value is produced. */
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
62
63 static void
ir2_optimize_loop(nir_shader * s)64 ir2_optimize_loop(nir_shader *s)
65 {
66 bool progress;
67 do {
68 progress = false;
69
70 OPT_V(s, nir_lower_vars_to_ssa);
71 progress |= OPT(s, nir_opt_copy_prop_vars);
72 progress |= OPT(s, nir_copy_prop);
73 progress |= OPT(s, nir_opt_dce);
74 progress |= OPT(s, nir_opt_cse);
75 /* progress |= OPT(s, nir_opt_gcm, true); */
76 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
77 progress |= OPT(s, nir_opt_intrinsics);
78 progress |= OPT(s, nir_opt_algebraic);
79 progress |= OPT(s, nir_opt_constant_folding);
80 progress |= OPT(s, nir_opt_dead_cf);
81 if (OPT(s, nir_opt_trivial_continues)) {
82 progress |= true;
83 /* If nir_opt_trivial_continues makes progress, then we need to clean
84 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
85 * to make progress.
86 */
87 OPT(s, nir_copy_prop);
88 OPT(s, nir_opt_dce);
89 }
90 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
91 progress |= OPT(s, nir_opt_if, false);
92 progress |= OPT(s, nir_opt_remove_phis);
93 progress |= OPT(s, nir_opt_undef);
94
95 }
96 while (progress);
97 }
98
99 /* trig workarounds is the same as ir3.. but we don't want to include ir3 */
100 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
101
102 int
ir2_optimize_nir(nir_shader * s,bool lower)103 ir2_optimize_nir(nir_shader *s, bool lower)
104 {
105 struct nir_lower_tex_options tex_options = {
106 .lower_txp = ~0u,
107 .lower_rect = 0,
108 };
109
110 if (fd_mesa_debug & FD_DBG_DISASM) {
111 debug_printf("----------------------\n");
112 nir_print_shader(s, stdout);
113 debug_printf("----------------------\n");
114 }
115
116 OPT_V(s, nir_lower_regs_to_ssa);
117 OPT_V(s, nir_lower_vars_to_ssa);
118 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
119
120 if (lower) {
121 OPT_V(s, ir3_nir_apply_trig_workarounds);
122 OPT_V(s, nir_lower_tex, &tex_options);
123 }
124
125 ir2_optimize_loop(s);
126
127 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
128 OPT_V(s, nir_opt_sink, nir_move_const_undef);
129
130 /* TODO we dont want to get shaders writing to depth for depth textures */
131 if (s->info.stage == MESA_SHADER_FRAGMENT) {
132 nir_foreach_shader_out_variable(var, s) {
133 if (var->data.location == FRAG_RESULT_DEPTH)
134 return -1;
135 }
136 }
137
138 return 0;
139 }
140
141 static struct ir2_src
load_const(struct ir2_context * ctx,float * value_f,unsigned ncomp)142 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
143 {
144 struct fd2_shader_stateobj *so = ctx->so;
145 unsigned imm_ncomp, swiz, idx, i, j;
146 uint32_t *value = (uint32_t*) value_f;
147
148 /* try to merge with existing immediate (TODO: try with neg) */
149 for (idx = 0; idx < so->num_immediates; idx++) {
150 swiz = 0;
151 imm_ncomp = so->immediates[idx].ncomp;
152 for (i = 0; i < ncomp; i++) {
153 for (j = 0; j < imm_ncomp; j++) {
154 if (value[i] == so->immediates[idx].val[j])
155 break;
156 }
157 if (j == imm_ncomp) {
158 if (j == 4)
159 break;
160 so->immediates[idx].val[imm_ncomp++] = value[i];
161 }
162 swiz |= swiz_set(j, i);
163 }
164 /* matched all components */
165 if (i == ncomp)
166 break;
167 }
168
169 /* need to allocate new immediate */
170 if (idx == so->num_immediates) {
171 swiz = 0;
172 imm_ncomp = 0;
173 for (i = 0; i < ncomp; i++) {
174 for (j = 0; j < imm_ncomp; j++) {
175 if (value[i] == ctx->so->immediates[idx].val[j])
176 break;
177 }
178 if (j == imm_ncomp) {
179 so->immediates[idx].val[imm_ncomp++] = value[i];
180 }
181 swiz |= swiz_set(j, i);
182 }
183 so->num_immediates++;
184 }
185 so->immediates[idx].ncomp = imm_ncomp;
186
187 if (ncomp == 1)
188 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
189
190 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
191 }
192
193 struct ir2_src
ir2_zero(struct ir2_context * ctx)194 ir2_zero(struct ir2_context *ctx)
195 {
196 return load_const(ctx, (float[]) {0.0f}, 1);
197 }
198
199 static void
update_range(struct ir2_context * ctx,struct ir2_reg * reg)200 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
201 {
202 if (!reg->initialized) {
203 reg->initialized = true;
204 reg->loop_depth = ctx->loop_depth;
205 }
206
207 if (ctx->loop_depth > reg->loop_depth) {
208 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
209 } else {
210 reg->loop_depth = ctx->loop_depth;
211 reg->block_idx_free = -1;
212 }
213
214 /* for regs we want to free at the end of the loop in any case
215 * XXX dont do this for ssa
216 */
217 if (reg->loop_depth)
218 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
219 }
220
221 static struct ir2_src
make_src(struct ir2_context * ctx,nir_src src)222 make_src(struct ir2_context *ctx, nir_src src)
223 {
224 struct ir2_src res = {};
225 struct ir2_reg *reg;
226
227 nir_const_value *const_value = nir_src_as_const_value(src);
228
229 if (const_value) {
230 assert(src.is_ssa);
231 float c[src.ssa->num_components];
232 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
233 return load_const(ctx, c, src.ssa->num_components);
234 }
235
236 if (!src.is_ssa) {
237 res.num = src.reg.reg->index;
238 res.type = IR2_SRC_REG;
239 reg = &ctx->reg[res.num];
240 } else {
241 assert(ctx->ssa_map[src.ssa->index] >= 0);
242 res.num = ctx->ssa_map[src.ssa->index];
243 res.type = IR2_SRC_SSA;
244 reg = &ctx->instr[res.num].ssa;
245 }
246
247 update_range(ctx, reg);
248 return res;
249 }
250
251 static void
set_index(struct ir2_context * ctx,nir_dest * dst,struct ir2_instr * instr)252 set_index(struct ir2_context *ctx, nir_dest * dst,
253 struct ir2_instr *instr)
254 {
255 struct ir2_reg *reg = &instr->ssa;
256
257 if (dst->is_ssa) {
258 ctx->ssa_map[dst->ssa.index] = instr->idx;
259 } else {
260 assert(instr->is_ssa);
261 reg = &ctx->reg[dst->reg.reg->index];
262
263 instr->is_ssa = false;
264 instr->reg = reg;
265 }
266 update_range(ctx, reg);
267 }
268
269 static struct ir2_instr *
ir2_instr_create(struct ir2_context * ctx,int type)270 ir2_instr_create(struct ir2_context *ctx, int type)
271 {
272 struct ir2_instr *instr;
273
274 instr = &ctx->instr[ctx->instr_count++];
275 instr->idx = ctx->instr_count - 1;
276 instr->type = type;
277 instr->block_idx = ctx->block_idx;
278 instr->pred = ctx->pred;
279 instr->is_ssa = true;
280 return instr;
281 }
282
283 static struct ir2_instr *
instr_create_alu(struct ir2_context * ctx,nir_op opcode,unsigned ncomp)284 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
285 {
286 /* emit_alu will fixup instrs that don't map directly */
287 static const struct ir2_opc {
288 int8_t scalar, vector;
289 } nir_ir2_opc[nir_num_opcodes+1] = {
290 [0 ... nir_num_opcodes - 1] = {-1, -1},
291
292 [nir_op_mov] = {MAXs, MAXv},
293 [nir_op_fneg] = {MAXs, MAXv},
294 [nir_op_fabs] = {MAXs, MAXv},
295 [nir_op_fsat] = {MAXs, MAXv},
296 [nir_op_fsign] = {-1, CNDGTEv},
297 [nir_op_fadd] = {ADDs, ADDv},
298 [nir_op_fsub] = {ADDs, ADDv},
299 [nir_op_fmul] = {MULs, MULv},
300 [nir_op_ffma] = {-1, MULADDv},
301 [nir_op_fmax] = {MAXs, MAXv},
302 [nir_op_fmin] = {MINs, MINv},
303 [nir_op_ffloor] = {FLOORs, FLOORv},
304 [nir_op_ffract] = {FRACs, FRACv},
305 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
306 [nir_op_fdot2] = {-1, DOT2ADDv},
307 [nir_op_fdot3] = {-1, DOT3v},
308 [nir_op_fdot4] = {-1, DOT4v},
309 [nir_op_sge] = {-1, SETGTEv},
310 [nir_op_slt] = {-1, SETGTv},
311 [nir_op_sne] = {-1, SETNEv},
312 [nir_op_seq] = {-1, SETEv},
313 [nir_op_fcsel] = {-1, CNDEv},
314 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
315 [nir_op_frcp] = {RECIP_IEEE, -1},
316 [nir_op_flog2] = {LOG_IEEE, -1},
317 [nir_op_fexp2] = {EXP_IEEE, -1},
318 [nir_op_fsqrt] = {SQRT_IEEE, -1},
319 [nir_op_fcos] = {COS, -1},
320 [nir_op_fsin] = {SIN, -1},
321 /* no fsat, fneg, fabs since source mods deal with those */
322
323 /* so we can use this function with non-nir op */
324 #define ir2_op_cube nir_num_opcodes
325 [ir2_op_cube] = {-1, CUBEv},
326 };
327
328 struct ir2_opc op = nir_ir2_opc[opcode];
329 assert(op.vector >= 0 || op.scalar >= 0);
330
331 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
332 instr->alu.vector_opc = op.vector;
333 instr->alu.scalar_opc = op.scalar;
334 instr->alu.export = -1;
335 instr->alu.write_mask = (1 << ncomp) - 1;
336 instr->src_count = opcode == ir2_op_cube ? 2 :
337 nir_op_infos[opcode].num_inputs;
338 instr->ssa.ncomp = ncomp;
339 return instr;
340 }
341
342 static struct ir2_instr *
instr_create_alu_reg(struct ir2_context * ctx,nir_op opcode,uint8_t write_mask,struct ir2_instr * share_reg)343 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
344 uint8_t write_mask, struct ir2_instr *share_reg)
345 {
346 struct ir2_instr *instr;
347 struct ir2_reg *reg;
348
349 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
350 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
351
352 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
353 instr->alu.write_mask = write_mask;
354 instr->reg = reg;
355 instr->is_ssa = false;
356 return instr;
357 }
358
359
360 static struct ir2_instr *
instr_create_alu_dest(struct ir2_context * ctx,nir_op opcode,nir_dest * dst)361 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
362 {
363 struct ir2_instr *instr;
364 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
365 set_index(ctx, dst, instr);
366 return instr;
367 }
368
369 static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context * ctx,nir_dest * dst,instr_fetch_opc_t opc)370 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
371 instr_fetch_opc_t opc)
372 {
373 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
374 instr->fetch.opc = opc;
375 instr->src_count = 1;
376 instr->ssa.ncomp = nir_dest_num_components(*dst);
377 set_index(ctx, dst, instr);
378 return instr;
379 }
380
381 static struct ir2_src
make_src_noconst(struct ir2_context * ctx,nir_src src)382 make_src_noconst(struct ir2_context *ctx, nir_src src)
383 {
384 struct ir2_instr *instr;
385
386 if (nir_src_as_const_value(src)) {
387 assert(src.is_ssa);
388 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
389 instr->src[0] = make_src(ctx, src);
390 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
391 }
392
393 return make_src(ctx, src);
394 }
395
396 static void
emit_alu(struct ir2_context * ctx,nir_alu_instr * alu)397 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
398 {
399 const nir_op_info *info = &nir_op_infos[alu->op];
400 nir_dest *dst = &alu->dest.dest;
401 struct ir2_instr *instr;
402 struct ir2_src tmp;
403 unsigned ncomp;
404
405 /* get the number of dst components */
406 if (dst->is_ssa) {
407 ncomp = dst->ssa.num_components;
408 } else {
409 ncomp = 0;
410 for (int i = 0; i < 4; i++)
411 ncomp += !!(alu->dest.write_mask & 1 << i);
412 }
413
414 instr = instr_create_alu(ctx, alu->op, ncomp);
415 set_index(ctx, dst, instr);
416 instr->alu.saturate = alu->dest.saturate;
417 instr->alu.write_mask = alu->dest.write_mask;
418
419 for (int i = 0; i < info->num_inputs; i++) {
420 nir_alu_src *src = &alu->src[i];
421
422 /* compress swizzle with writemask when applicable */
423 unsigned swiz = 0, j = 0;
424 for (int i = 0; i < 4; i++) {
425 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
426 continue;
427 swiz |= swiz_set(src->swizzle[i], j++);
428 }
429
430 instr->src[i] = make_src(ctx, src->src);
431 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
432 instr->src[i].negate = src->negate;
433 instr->src[i].abs = src->abs;
434 }
435
436 /* workarounds for NIR ops that don't map directly to a2xx ops */
437 switch (alu->op) {
438 case nir_op_fneg:
439 instr->src[0].negate = 1;
440 break;
441 case nir_op_fabs:
442 instr->src[0].abs = 1;
443 break;
444 case nir_op_fsat:
445 instr->alu.saturate = 1;
446 break;
447 case nir_op_slt:
448 tmp = instr->src[0];
449 instr->src[0] = instr->src[1];
450 instr->src[1] = tmp;
451 break;
452 case nir_op_fcsel:
453 tmp = instr->src[1];
454 instr->src[1] = instr->src[2];
455 instr->src[2] = tmp;
456 break;
457 case nir_op_fsub:
458 instr->src[1].negate = !instr->src[1].negate;
459 break;
460 case nir_op_fdot2:
461 instr->src_count = 3;
462 instr->src[2] = ir2_zero(ctx);
463 break;
464 case nir_op_fsign: {
465 /* we need an extra instruction to deal with the zero case */
466 struct ir2_instr *tmp;
467
468 /* tmp = x == 0 ? 0 : 1 */
469 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
470 tmp->src[0] = instr->src[0];
471 tmp->src[1] = ir2_zero(ctx);
472 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
473
474 /* result = x >= 0 ? tmp : -tmp */
475 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
476 instr->src[2] = instr->src[1];
477 instr->src[2].negate = true;
478 instr->src_count = 3;
479 } break;
480 default:
481 break;
482 }
483 }
484
485 static void
load_input(struct ir2_context * ctx,nir_dest * dst,unsigned idx)486 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
487 {
488 struct ir2_instr *instr;
489 int slot = -1;
490
491 if (ctx->so->type == MESA_SHADER_VERTEX) {
492 instr = ir2_instr_create_fetch(ctx, dst, 0);
493 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
494 instr->fetch.vtx.const_idx = 20 + (idx / 3);
495 instr->fetch.vtx.const_idx_sel = idx % 3;
496 return;
497 }
498
499 /* get slot from idx */
500 nir_foreach_shader_in_variable(var, ctx->nir) {
501 if (var->data.driver_location == idx) {
502 slot = var->data.location;
503 break;
504 }
505 }
506 assert(slot >= 0);
507
508 switch (slot) {
509 case VARYING_SLOT_POS:
510 /* need to extract xy with abs and add tile offset on a20x
511 * zw from fragcoord input (w inverted in fragment shader)
512 * TODO: only components that are required by fragment shader
513 */
514 instr = instr_create_alu_reg(ctx,
515 ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
516 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
517 instr->src[0].abs = true;
518 /* on a20x, C64 contains the tile offset */
519 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
520
521 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
522 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
523
524 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
525 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
526
527 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
528 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
529 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
530 break;
531 default:
532 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
533 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
534 break;
535 }
536 }
537
538 static unsigned
output_slot(struct ir2_context * ctx,nir_intrinsic_instr * intr)539 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
540 {
541 int slot = -1;
542 unsigned idx = nir_intrinsic_base(intr);
543 nir_foreach_shader_out_variable(var, ctx->nir) {
544 if (var->data.driver_location == idx) {
545 slot = var->data.location;
546 break;
547 }
548 }
549 assert(slot != -1);
550 return slot;
551 }
552
553 static void
store_output(struct ir2_context * ctx,nir_src src,unsigned slot,unsigned ncomp)554 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
555 {
556 struct ir2_instr *instr;
557 unsigned idx = 0;
558
559 if (ctx->so->type == MESA_SHADER_VERTEX) {
560 switch (slot) {
561 case VARYING_SLOT_POS:
562 ctx->position = make_src(ctx, src);
563 idx = 62;
564 break;
565 case VARYING_SLOT_PSIZ:
566 ctx->so->writes_psize = true;
567 idx = 63;
568 break;
569 default:
570 /* find matching slot from fragment shader input */
571 for (idx = 0; idx < ctx->f->inputs_count; idx++)
572 if (ctx->f->inputs[idx].slot == slot)
573 break;
574 if (idx == ctx->f->inputs_count)
575 return;
576 }
577 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
578 /* only color output is implemented */
579 return;
580 }
581
582 instr = instr_create_alu(ctx, nir_op_mov, ncomp);
583 instr->src[0] = make_src(ctx, src);
584 instr->alu.export = idx;
585 }
586
587 static void
emit_intrinsic(struct ir2_context * ctx,nir_intrinsic_instr * intr)588 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
589 {
590 struct ir2_instr *instr;
591 nir_const_value *const_offset;
592 unsigned idx;
593
594 switch (intr->intrinsic) {
595 case nir_intrinsic_load_input:
596 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
597 break;
598 case nir_intrinsic_store_output:
599 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
600 break;
601 case nir_intrinsic_load_uniform:
602 const_offset = nir_src_as_const_value(intr->src[0]);
603 assert(const_offset); /* TODO can be false in ES2? */
604 idx = nir_intrinsic_base(intr);
605 idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
606 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
607 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
608 break;
609 case nir_intrinsic_discard:
610 case nir_intrinsic_discard_if:
611 instr = ir2_instr_create(ctx, IR2_ALU);
612 instr->alu.vector_opc = VECTOR_NONE;
613 if (intr->intrinsic == nir_intrinsic_discard_if) {
614 instr->alu.scalar_opc = KILLNEs;
615 instr->src[0] = make_src(ctx, intr->src[0]);
616 } else {
617 instr->alu.scalar_opc = KILLEs;
618 instr->src[0] = ir2_zero(ctx);
619 }
620 instr->alu.export = -1;
621 instr->src_count = 1;
622 ctx->so->has_kill = true;
623 break;
624 case nir_intrinsic_load_front_face:
625 /* gl_FrontFacing is in the sign of param.x
626 * rcp required because otherwise we can't differentiate -0.0 and +0.0
627 */
628 ctx->so->need_param = true;
629
630 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
631 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
632
633 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
634 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
635 instr->src[1] = ir2_zero(ctx);
636 break;
637 case nir_intrinsic_load_point_coord:
638 /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
639 ctx->so->need_param = true;
640
641 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
642 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
643 break;
644 default:
645 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
646 break;
647 }
648 }
649
650 static void
emit_tex(struct ir2_context * ctx,nir_tex_instr * tex)651 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
652 {
653 bool is_rect = false, is_cube = false;
654 struct ir2_instr *instr;
655 nir_src *coord, *lod_bias;
656
657 coord = lod_bias = NULL;
658
659 for (unsigned i = 0; i < tex->num_srcs; i++) {
660 switch (tex->src[i].src_type) {
661 case nir_tex_src_coord:
662 coord = &tex->src[i].src;
663 break;
664 case nir_tex_src_bias:
665 case nir_tex_src_lod:
666 assert(!lod_bias);
667 lod_bias = &tex->src[i].src;
668 break;
669 default:
670 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
671 tex->src[i].src_type);
672 return;
673 }
674 }
675
676 switch (tex->op) {
677 case nir_texop_tex:
678 case nir_texop_txb:
679 case nir_texop_txl:
680 break;
681 default:
682 compile_error(ctx, "unimplemented texop %d\n", tex->op);
683 return;
684 }
685
686 switch (tex->sampler_dim) {
687 case GLSL_SAMPLER_DIM_2D:
688 break;
689 case GLSL_SAMPLER_DIM_RECT:
690 is_rect = true;
691 break;
692 case GLSL_SAMPLER_DIM_CUBE:
693 is_cube = true;
694 break;
695 default:
696 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
697 return;
698 }
699
700 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
701
702 /* for cube maps
703 * tmp = cube(coord)
704 * tmp.xy = tmp.xy / |tmp.z| + 1.5
705 * coord = tmp.xyw
706 */
707 if (is_cube) {
708 struct ir2_instr *rcp, *coord_xy;
709 unsigned reg_idx;
710
711 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
712 instr->src[0] = src_coord;
713 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
714 instr->src[1] = src_coord;
715 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
716
717 reg_idx = instr->reg - ctx->reg; /* hacky */
718
719 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
720 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
721 rcp->src[0].abs = true;
722
723 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
724 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
725 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
726 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
727
728 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
729 /* TODO: lod/bias transformed by src_coord.z ? */
730 }
731
732 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
733 instr->src[0] = src_coord;
734 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
735 instr->fetch.tex.is_cube = is_cube;
736 instr->fetch.tex.is_rect = is_rect;
737 instr->fetch.tex.samp_id = tex->sampler_index;
738
739 /* for lod/bias, we insert an extra src for the backend to deal with */
740 if (lod_bias) {
741 instr->src[1] = make_src_noconst(ctx, *lod_bias);
742 /* backend will use 2-3 components so apply swizzle */
743 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
744 instr->src_count = 2;
745 }
746 }
747
748 static void
setup_input(struct ir2_context * ctx,nir_variable * in)749 setup_input(struct ir2_context *ctx, nir_variable * in)
750 {
751 struct fd2_shader_stateobj *so = ctx->so;
752 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
753 unsigned n = in->data.driver_location;
754 unsigned slot = in->data.location;
755
756 assert(array_len == 1);
757
758 /* handle later */
759 if (ctx->so->type == MESA_SHADER_VERTEX)
760 return;
761
762 if (ctx->so->type != MESA_SHADER_FRAGMENT)
763 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
764
765 n = ctx->f->inputs_count++;
766
767 /* half of fragcoord from param reg, half from a varying */
768 if (slot == VARYING_SLOT_POS) {
769 ctx->f->fragcoord = n;
770 so->need_param = true;
771 }
772
773 ctx->f->inputs[n].slot = slot;
774 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
775
776 /* in->data.interpolation?
777 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
778 */
779 }
780
781 static void
emit_undef(struct ir2_context * ctx,nir_ssa_undef_instr * undef)782 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
783 {
784 /* TODO we don't want to emit anything for undefs */
785
786 struct ir2_instr *instr;
787
788 instr = instr_create_alu_dest(ctx, nir_op_mov,
789 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
790 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
791 }
792
793 static void
emit_instr(struct ir2_context * ctx,nir_instr * instr)794 emit_instr(struct ir2_context *ctx, nir_instr * instr)
795 {
796 switch (instr->type) {
797 case nir_instr_type_alu:
798 emit_alu(ctx, nir_instr_as_alu(instr));
799 break;
800 case nir_instr_type_deref:
801 /* ignored, handled as part of the intrinsic they are src to */
802 break;
803 case nir_instr_type_intrinsic:
804 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
805 break;
806 case nir_instr_type_load_const:
807 /* dealt with when using nir_src */
808 break;
809 case nir_instr_type_tex:
810 emit_tex(ctx, nir_instr_as_tex(instr));
811 break;
812 case nir_instr_type_jump:
813 ctx->block_has_jump[ctx->block_idx] = true;
814 break;
815 case nir_instr_type_ssa_undef:
816 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
817 break;
818 default:
819 break;
820 }
821 }
822
823 /* fragcoord.zw and a20x hw binning outputs */
824 static void
extra_position_exports(struct ir2_context * ctx,bool binning)825 extra_position_exports(struct ir2_context *ctx, bool binning)
826 {
827 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
828
829 if (ctx->f->fragcoord < 0 && !binning)
830 return;
831
832 instr = instr_create_alu(ctx, nir_op_fmax, 1);
833 instr->src[0] = ctx->position;
834 instr->src[0].swizzle = IR2_SWIZZLE_W;
835 instr->src[1] = ir2_zero(ctx);
836
837 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
838 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
839
840 sc = instr_create_alu(ctx, nir_op_fmul, 4);
841 sc->src[0] = ctx->position;
842 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
843
844 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
845 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
846 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
847 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
848
849 /* fragcoord z/w */
850 if (ctx->f->fragcoord >= 0 && !binning) {
851 instr = instr_create_alu(ctx, nir_op_mov, 1);
852 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
853 instr->alu.export = ctx->f->fragcoord;
854
855 instr = instr_create_alu(ctx, nir_op_mov, 1);
856 instr->src[0] = ctx->position;
857 instr->src[0].swizzle = IR2_SWIZZLE_W;
858 instr->alu.export = ctx->f->fragcoord;
859 instr->alu.write_mask = 2;
860 }
861
862 if (!binning)
863 return;
864
865 off = instr_create_alu(ctx, nir_op_fadd, 1);
866 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
867 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
868
869 /* 8 max set in freedreno_screen.. unneeded instrs patched out */
870 for (int i = 0; i < 8; i++) {
871 instr = instr_create_alu(ctx, nir_op_ffma, 4);
872 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
873 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
874 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
875 instr->alu.export = 32;
876
877 instr = instr_create_alu(ctx, nir_op_ffma, 4);
878 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
879 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
880 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
881 instr->alu.export = 33;
882 }
883 }
884
/* forward declaration: emit_if and emit_loop call back into emit_cf_list */
static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
886
887 static bool
emit_block(struct ir2_context * ctx,nir_block * block)888 emit_block(struct ir2_context *ctx, nir_block * block)
889 {
890 struct ir2_instr *instr;
891 nir_block *succs = block->successors[0];
892
893 ctx->block_idx = block->index;
894
895 nir_foreach_instr(instr, block)
896 emit_instr(ctx, instr);
897
898 if (!succs || !succs->index)
899 return false;
900
901 /* we want to be smart and always jump and have the backend cleanup
902 * but we are not, so there are two cases where jump is needed:
903 * loops (succs index lower)
904 * jumps (jump instruction seen in block)
905 */
906 if (succs->index > block->index && !ctx->block_has_jump[block->index])
907 return false;
908
909 assert(block->successors[1] == NULL);
910
911 instr = ir2_instr_create(ctx, IR2_CF);
912 instr->cf.block_idx = succs->index;
913 /* XXX can't jump to a block with different predicate */
914 return true;
915 }
916
917 static void
emit_if(struct ir2_context * ctx,nir_if * nif)918 emit_if(struct ir2_context *ctx, nir_if * nif)
919 {
920 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
921 struct ir2_instr *instr;
922
923 /* XXX: blob seems to always use same register for condition */
924
925 instr = ir2_instr_create(ctx, IR2_ALU);
926 instr->src[0] = make_src(ctx, nif->condition);
927 instr->src_count = 1;
928 instr->ssa.ncomp = 1;
929 instr->alu.vector_opc = VECTOR_NONE;
930 instr->alu.scalar_opc = SCALAR_NONE;
931 instr->alu.export = -1;
932 instr->alu.write_mask = 1;
933 instr->pred = 0;
934
935 /* if nested, use PRED_SETNE_PUSHv */
936 if (pred) {
937 instr->alu.vector_opc = PRED_SETNE_PUSHv;
938 instr->src[1] = instr->src[0];
939 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
940 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
941 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
942 instr->src_count = 2;
943 } else {
944 instr->alu.scalar_opc = PRED_SETNEs;
945 }
946
947 ctx->pred_idx = instr->idx;
948 ctx->pred = 3;
949
950 emit_cf_list(ctx, &nif->then_list);
951
952 /* TODO: if these is no else branch we don't need this
953 * and if the else branch is simple, can just flip ctx->pred instead
954 */
955 instr = ir2_instr_create(ctx, IR2_ALU);
956 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
957 instr->src_count = 1;
958 instr->ssa.ncomp = 1;
959 instr->alu.vector_opc = VECTOR_NONE;
960 instr->alu.scalar_opc = PRED_SET_INVs;
961 instr->alu.export = -1;
962 instr->alu.write_mask = 1;
963 instr->pred = 0;
964 ctx->pred_idx = instr->idx;
965
966 emit_cf_list(ctx, &nif->else_list);
967
968 /* restore predicate for nested predicates */
969 if (pred) {
970 instr = ir2_instr_create(ctx, IR2_ALU);
971 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
972 instr->src_count = 1;
973 instr->ssa.ncomp = 1;
974 instr->alu.vector_opc = VECTOR_NONE;
975 instr->alu.scalar_opc = PRED_SET_POPs;
976 instr->alu.export = -1;
977 instr->alu.write_mask = 1;
978 instr->pred = 0;
979 ctx->pred_idx = instr->idx;
980 }
981
982 /* restore ctx->pred */
983 ctx->pred = pred;
984 }
985
986 /* get the highest block idx in the loop, so we know when
987 * we can free registers that are allocated outside the loop
988 */
989 static unsigned
loop_last_block(struct exec_list * list)990 loop_last_block(struct exec_list *list)
991 {
992 nir_cf_node *node =
993 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
994 switch (node->type) {
995 case nir_cf_node_block:
996 return nir_cf_node_as_block(node)->index;
997 case nir_cf_node_if:
998 assert(0); /* XXX could this ever happen? */
999 return 0;
1000 case nir_cf_node_loop:
1001 return loop_last_block(&nir_cf_node_as_loop(node)->body);
1002 default:
1003 compile_error(ctx, "Not supported\n");
1004 return 0;
1005 }
1006 }
1007
1008 static void
emit_loop(struct ir2_context * ctx,nir_loop * nloop)1009 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1010 {
1011 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1012 emit_cf_list(ctx, &nloop->body);
1013 ctx->loop_depth--;
1014 }
1015
1016 static bool
emit_cf_list(struct ir2_context * ctx,struct exec_list * list)1017 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1018 {
1019 bool ret = false;
1020 foreach_list_typed(nir_cf_node, node, node, list) {
1021 ret = false;
1022 switch (node->type) {
1023 case nir_cf_node_block:
1024 ret = emit_block(ctx, nir_cf_node_as_block(node));
1025 break;
1026 case nir_cf_node_if:
1027 emit_if(ctx, nir_cf_node_as_if(node));
1028 break;
1029 case nir_cf_node_loop:
1030 emit_loop(ctx, nir_cf_node_as_loop(node));
1031 break;
1032 case nir_cf_node_function:
1033 compile_error(ctx, "Not supported\n");
1034 break;
1035 }
1036 }
1037 return ret;
1038 }
1039
cleanup_binning(struct ir2_context * ctx)1040 static void cleanup_binning(struct ir2_context *ctx)
1041 {
1042 assert(ctx->so->type == MESA_SHADER_VERTEX);
1043
1044 /* kill non-position outputs for binning variant */
1045 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1046 nir_foreach_instr_safe(instr, block) {
1047 if (instr->type != nir_instr_type_intrinsic)
1048 continue;
1049
1050 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1051 if (intr->intrinsic != nir_intrinsic_store_output)
1052 continue;
1053
1054 if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1055 nir_instr_remove(instr);
1056 }
1057 }
1058
1059 ir2_optimize_nir(ctx->nir, false);
1060 }
1061
1062 static bool
ir2_alu_to_scalar_filter_cb(const nir_instr * instr,const void * data)1063 ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
1064 {
1065 if (instr->type != nir_instr_type_alu)
1066 return false;
1067
1068 nir_alu_instr *alu = nir_instr_as_alu(instr);
1069 switch (alu->op) {
1070 case nir_op_frsq:
1071 case nir_op_frcp:
1072 case nir_op_flog2:
1073 case nir_op_fexp2:
1074 case nir_op_fsqrt:
1075 case nir_op_fcos:
1076 case nir_op_fsin:
1077 return true;
1078 default:
1079 break;
1080 }
1081
1082 return false;
1083 }
1084
1085 void
ir2_nir_compile(struct ir2_context * ctx,bool binning)1086 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1087 {
1088 struct fd2_shader_stateobj *so = ctx->so;
1089
1090 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
1091
1092 ctx->nir = nir_shader_clone(NULL, so->nir);
1093
1094 if (binning)
1095 cleanup_binning(ctx);
1096
1097 OPT_V(ctx->nir, nir_copy_prop);
1098 OPT_V(ctx->nir, nir_opt_dce);
1099 OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);
1100
1101 OPT_V(ctx->nir, nir_lower_int_to_float);
1102 OPT_V(ctx->nir, nir_lower_bool_to_float);
1103 while(OPT(ctx->nir, nir_opt_algebraic));
1104 OPT_V(ctx->nir, nir_opt_algebraic_late);
1105 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1106
1107 OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);
1108
1109 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1110
1111 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1112
1113 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1114 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1115
1116 OPT_V(ctx->nir, nir_opt_dce);
1117
1118 nir_sweep(ctx->nir);
1119
1120 if (fd_mesa_debug & FD_DBG_DISASM) {
1121 debug_printf("----------------------\n");
1122 nir_print_shader(ctx->nir, stdout);
1123 debug_printf("----------------------\n");
1124 }
1125
1126 /* fd2_shader_stateobj init */
1127 if (so->type == MESA_SHADER_FRAGMENT) {
1128 ctx->f->fragcoord = -1;
1129 ctx->f->inputs_count = 0;
1130 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1131 }
1132
1133 /* Setup inputs: */
1134 nir_foreach_shader_in_variable(in, ctx->nir)
1135 setup_input(ctx, in);
1136
1137 if (so->type == MESA_SHADER_FRAGMENT) {
1138 unsigned idx;
1139 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1140 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1141 update_range(ctx, &ctx->input[idx]);
1142 }
1143 /* assume we have param input and kill it later if not */
1144 ctx->input[idx].ncomp = 4;
1145 update_range(ctx, &ctx->input[idx]);
1146 } else {
1147 ctx->input[0].ncomp = 1;
1148 ctx->input[2].ncomp = 1;
1149 update_range(ctx, &ctx->input[0]);
1150 update_range(ctx, &ctx->input[2]);
1151 }
1152
1153 /* And emit the body: */
1154 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1155
1156 nir_foreach_register(reg, &fxn->registers) {
1157 ctx->reg[reg->index].ncomp = reg->num_components;
1158 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1159 }
1160
1161 nir_metadata_require(fxn, nir_metadata_block_index);
1162 emit_cf_list(ctx, &fxn->body);
1163 /* TODO emit_block(ctx, fxn->end_block); */
1164
1165 if (so->type == MESA_SHADER_VERTEX)
1166 extra_position_exports(ctx, binning);
1167
1168 ralloc_free(ctx->nir);
1169
1170 /* kill unused param input */
1171 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1172 ctx->input[ctx->f->inputs_count].initialized = false;
1173 }
1174