/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

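/* Run the usual NIR cleanup/optimization passes repeatedly until none of
 * them makes further progress.
 */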
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* The trig workarounds are the same as ir3's, but we don't want to include
 * ir3 here, so just declare the pass.
 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

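/* Optimize and lower the NIR for ir2: lowers indirect input/output derefs,
 * optionally applies trig workarounds and texture lowering, runs the
 * optimization loop, and rejects fragment shaders that write depth
 * (returns -1 in that case, 0 on success).
 */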
int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

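/* Load a float immediate: pack the value into the shader state's immediate
 * constant vec4s, reusing components of an existing immediate when possible,
 * and return a const-file source with the matching swizzle.
 */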
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

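/* Track when a register first becomes live and, based on the current loop
 * depth, the last block after which it can be freed (block_idx_free).
 */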
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

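/* Translate a nir_src into an ir2_src: constant values become immediates,
 * SSA values are looked up through ssa_map, and NIR registers reference
 * ctx->reg directly.
 */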
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

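/* Create an ALU instruction from a NIR opcode, using the table below to pick
 * the a2xx scalar and/or vector opcode (-1 means no direct mapping).
 */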
static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
      /* no fsat, fneg, fabs since source mods deal with those */

      /* so we can use this function with a non-NIR op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}

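/* Like make_src(), but inserts a mov when the source is a constant; used for
 * fetch sources, which (presumably) can't reference the constant file
 * directly.
 */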
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

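/* Emit a NIR ALU instruction: compress source swizzles against the write
 * mask and fix up the NIR ops that don't map 1:1 to an a2xx op (fneg/fabs/
 * fsat become source/output modifiers, fsub/slt/fcsel swap or negate
 * operands, fdot2 and fsign need extra sources or instructions).
 */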
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

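/* Load a shader input: vertex shaders emit a vertex fetch (the fetch
 * constants start at index 20), fragment shaders mov from the interpolated
 * input, with extra instructions to reconstruct gl_FragCoord from the
 * varying and the param register.
 */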
static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

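/* Store a shader output by emitting a mov with alu.export set: vertex
 * position and point size use the fixed exports 62/63, other vertex outputs
 * are matched against the fragment shader's inputs, and fragment color uses
 * export 0.
 */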
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

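/* Emit a texture fetch: cube maps first transform the coordinate with the
 * CUBE instruction (see the comment below), and lod/bias is passed as an
 * extra source for the backend to handle.
 */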
static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(
      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* we want to be smart and always jump and have the backend clean up,
    * but we are not, so there are two cases where a jump is needed:
    *  loops (succs index lower)
    *  jumps (jump instruction seen in block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

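/* Emit an if: a2xx uses predication rather than jumps here, so the condition
 * sets the predicate (PRED_SETNEs, or PRED_SETNE_PUSHv when nested inside
 * another if), the then/else instructions are predicated, the predicate is
 * inverted between the two branches, and popped again for nested ifs.
 */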
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

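/* For the binning variant of a vertex shader, strip all non-position output
 * stores from the NIR and re-run the optimization loop so the now-dead code
 * is removed before emission.
 */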
static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

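/* Filter for nir_lower_alu_to_scalar: scalarize only the transcendental ops,
 * which only have scalar opcodes in the nir_ir2_opc table.
 */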
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

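/* Main entry point: clone the shader's NIR, finish lowering it (int/bool to
 * float, source mods, out of SSA, vecN to movs), set up the input tables,
 * then walk the control flow list and emit ir2 instructions.
 */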
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}