/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true: fdot does replicate the result, but enabling
    * it makes things worse
    */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
   .force_indirect_unrolling_sampler = true,
   .max_unroll_iterations = 32,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

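/* Run the common NIR cleanup/optimization passes repeatedly until none of
 * them makes any more progress.
 */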
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* The trig workarounds are the same as ir3's, but we don't want to include
 * ir3 here.
 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
      .lower_invalid_implicit_lod = true,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

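/* Allocate (or reuse) space in the shader's immediate constants for a float
 * vector and return an ir2_src pointing at it.  Components are merged with
 * an existing immediate when the values match; for example, a request for
 * 1.0f can be satisfied by swizzling into an existing {0.0, 1.0} immediate
 * instead of allocating a new one.
 */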
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

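/* Track how long a register/SSA value stays live with respect to loop
 * nesting, so the register allocator knows at which block it can be freed.
 */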
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for SSA
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

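/* Convert a nir_src into an ir2_src: constants are turned into immediates,
 * SSA values are looked up through ssa_map, and NIR registers map to
 * IR2_SRC_REG.
 */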
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

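/* Bind an instruction to its NIR destination: either record it in ssa_map
 * for an SSA destination, or point it at the corresponding ir2 register.
 */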
static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
      /* no fsat, fneg, fabs since source mods deal with those */

/* so we can use this function with a non-NIR op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

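/* Like instr_create_alu, but the result goes to a register (optionally the
 * same register as a previous instruction) instead of an SSA destination.
 * Used for multi-instruction sequences that assemble a value piecewise.
 */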
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}

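/* Same as make_src, but constants are copied through a mov first, so the
 * result is never a constant-file source (for uses such as fetch sources
 * that can't take one directly).
 */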
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

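/* Translate a NIR ALU instruction, then patch up the cases where a2xx has
 * no direct equivalent (source modifiers, swapped operands, fsign, etc.).
 */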
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

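/* Load a shader input: a vertex fetch for the vertex shader, or a varying
 * (with special handling for gl_FragCoord) for the fragment shader.
 */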
static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

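/* Emit an export for a shader output.  Vertex position/psize go to the
 * dedicated export indices (62/63), other vertex outputs are matched
 * against the fragment shader's inputs, and fragment outputs only support
 * color.
 */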
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

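/* Translate the subset of NIR intrinsics that a2xx supports; anything else
 * is a compile error.
 */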
static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

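/* Emit a texture fetch.  Cube maps need the coordinate transformed with
 * the CUBE instruction first (see the comment below); lod/bias is passed
 * as an extra source for the backend to deal with.
 */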
static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(
      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 is the max set in freedreno_screen; unneeded instructions are
    * patched out
    */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* Ideally we would always emit the jump and let the backend clean up,
    * but we don't, so there are two cases where a jump is needed:
    *  - loops (successor index is lower than the current block's)
    *  - jumps (a jump instruction was seen in the block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

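/* Emit an if/else using predication: a PRED_SET* instruction sets the
 * predicate from the condition (PRED_SETNE_PUSHv when nested), the then
 * branch is emitted under that predicate, the predicate is inverted for the
 * else branch, and popped again when leaving a nested if.
 */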
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

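/* Filter for nir_lower_alu_to_scalar: these ops only have scalar
 * instructions on a2xx (see the opcode table above), so only they need to
 * be scalarized.
 */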
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

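/* Main entry point: clone the NIR shader, run the lowering/optimization
 * passes needed for a2xx, set up inputs, and emit the instruction list for
 * either the normal or the binning variant.
 */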
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}