1 /*
2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3 * Copyright (C) 2019-2020 Collabora, Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <sys/mman.h>
28 #include <fcntl.h>
29 #include <stdint.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <err.h>
33
34 #include "main/mtypes.h"
35 #include "compiler/glsl/glsl_to_nir.h"
36 #include "compiler/nir_types.h"
37 #include "compiler/nir/nir_builder.h"
38 #include "util/half_float.h"
39 #include "util/u_math.h"
40 #include "util/u_debug.h"
41 #include "util/u_dynarray.h"
42 #include "util/list.h"
44
45 #include "midgard.h"
46 #include "midgard_nir.h"
47 #include "midgard_compile.h"
48 #include "midgard_ops.h"
49 #include "helpers.h"
50 #include "compiler.h"
51 #include "midgard_quirks.h"
52 #include "panfrost-quirks.h"
53 #include "panfrost/util/pan_lower_framebuffer.h"
54
55 #include "disassemble.h"
56
57 static const struct debug_named_value midgard_debug_options[] = {
58 {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"},
59 {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"},
60 {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"},
61 {"inorder", MIDGARD_DBG_INORDER, "Disables out-of-order scheduling"},
62 {"verbose", MIDGARD_DBG_VERBOSE, "Dump shaders verbosely"},
63 {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"},
64 DEBUG_NAMED_VALUE_END
65 };
66
67 DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)
68
69 int midgard_debug = 0;
70
71 #define DBG(fmt, ...) \
72 do { if (midgard_debug & MIDGARD_DBG_MSGS) \
73 fprintf(stderr, "%s:%d: "fmt, \
74 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
75 static midgard_block *
76 create_empty_block(compiler_context *ctx)
77 {
78 midgard_block *blk = rzalloc(ctx, midgard_block);
79
80 blk->base.predecessors = _mesa_set_create(blk,
81 _mesa_hash_pointer,
82 _mesa_key_pointer_equal);
83
84 blk->base.name = ctx->block_source_count++;
85
86 return blk;
87 }
88
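/* Ends the current block and starts a fresh one, so later instructions cannot
 * be scheduled across this point (used around writeout and discard). */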
89 static void
90 schedule_barrier(compiler_context *ctx)
91 {
92 midgard_block *temp = ctx->after_block;
93 ctx->after_block = create_empty_block(ctx);
94 ctx->block_count++;
95 list_addtail(&ctx->after_block->base.link, &ctx->blocks);
96 list_inithead(&ctx->after_block->base.instructions);
97 pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base);
98 ctx->current_block = ctx->after_block;
99 ctx->after_block = temp;
100 }
101
102 /* Helpers to generate midgard_instructions using macro magic, since every
103 * driver seems to do it that way */
104
105 #define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
106
107 #define M_LOAD_STORE(name, store, T) \
108 static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
109 midgard_instruction i = { \
110 .type = TAG_LOAD_STORE_4, \
111 .mask = 0xF, \
112 .dest = ~0, \
113 .src = { ~0, ~0, ~0, ~0 }, \
114 .swizzle = SWIZZLE_IDENTITY_4, \
115 .op = midgard_op_##name, \
116 .load_store = { \
117 .signed_offset = address \
118 } \
119 }; \
120 \
121 if (store) { \
122 i.src[0] = ssa; \
123 i.src_types[0] = T; \
124 i.dest_type = T; \
125 } else { \
126 i.dest = ssa; \
127 i.dest_type = T; \
128 } \
129 return i; \
130 }
131
132 #define M_LOAD(name, T) M_LOAD_STORE(name, false, T)
133 #define M_STORE(name, T) M_LOAD_STORE(name, true, T)
134
135 M_LOAD(ld_attr_32, nir_type_uint32);
136 M_LOAD(ld_vary_32, nir_type_uint32);
137 M_LOAD(ld_ubo_32, nir_type_uint32);
138 M_LOAD(ld_ubo_64, nir_type_uint32);
139 M_LOAD(ld_ubo_128, nir_type_uint32);
140 M_LOAD(ld_32, nir_type_uint32);
141 M_LOAD(ld_64, nir_type_uint32);
142 M_LOAD(ld_128, nir_type_uint32);
143 M_STORE(st_32, nir_type_uint32);
144 M_STORE(st_64, nir_type_uint32);
145 M_STORE(st_128, nir_type_uint32);
146 M_LOAD(ld_tilebuffer_raw, nir_type_uint32);
147 M_LOAD(ld_tilebuffer_16f, nir_type_float16);
148 M_LOAD(ld_tilebuffer_32f, nir_type_float32);
149 M_STORE(st_vary_32, nir_type_uint32);
150 M_LOAD(ld_cubemap_coords, nir_type_uint32);
151 M_LOAD(ldst_mov, nir_type_uint32);
152 M_LOAD(ld_image_32f, nir_type_float32);
153 M_LOAD(ld_image_16f, nir_type_float16);
154 M_LOAD(ld_image_32u, nir_type_uint32);
155 M_LOAD(ld_image_32i, nir_type_int32);
156 M_STORE(st_image_32f, nir_type_float32);
157 M_STORE(st_image_16f, nir_type_float16);
158 M_STORE(st_image_32u, nir_type_uint32);
159 M_STORE(st_image_32i, nir_type_int32);
160 M_LOAD(lea_image, nir_type_uint64);
161
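/* Selects the correctly-typed image load/store constructor generated above. */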
162 #define M_IMAGE(op) \
163 static midgard_instruction \
164 op ## _image(nir_alu_type type, unsigned val, unsigned address) \
165 { \
166 switch (type) { \
167 case nir_type_float32: \
168 return m_ ## op ## _image_32f(val, address); \
169 case nir_type_float16: \
170 return m_ ## op ## _image_16f(val, address); \
171 case nir_type_uint32: \
172 return m_ ## op ## _image_32u(val, address); \
173 case nir_type_int32: \
174 return m_ ## op ## _image_32i(val, address); \
175 default: \
176 unreachable("Invalid image type"); \
177 } \
178 }
179
180 M_IMAGE(ld);
181 M_IMAGE(st);
182
183 static midgard_instruction
184 v_branch(bool conditional, bool invert)
185 {
186 midgard_instruction ins = {
187 .type = TAG_ALU_4,
188 .unit = ALU_ENAB_BRANCH,
189 .compact_branch = true,
190 .branch = {
191 .conditional = conditional,
192 .invert_conditional = invert
193 },
194 .dest = ~0,
195 .src = { ~0, ~0, ~0, ~0 },
196 };
197
198 return ins;
199 }
200
201 static void
202 attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
203 {
204 ins->has_constants = true;
205 memcpy(&ins->constants, constants, 16);
206 }
207
208 static int
209 glsl_type_size(const struct glsl_type *type, bool bindless)
210 {
211 return glsl_count_attribute_slots(type, false);
212 }
213
214 /* Lower fdot2 to a vector multiplication followed by channel addition */
215 static bool
216 midgard_nir_lower_fdot2_instr(nir_builder *b, nir_instr *instr, void *data)
217 {
218 if (instr->type != nir_instr_type_alu)
219 return false;
220
221 nir_alu_instr *alu = nir_instr_as_alu(instr);
222 if (alu->op != nir_op_fdot2)
223 return false;
224
225 b->cursor = nir_before_instr(&alu->instr);
226
227 nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
228 nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
229
230 nir_ssa_def *product = nir_fmul(b, src0, src1);
231
232 nir_ssa_def *sum = nir_fadd(b,
233 nir_channel(b, product, 0),
234 nir_channel(b, product, 1));
235
236 /* Replace the fdot2 with this sum */
237 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
238
239 return true;
240 }
241
242 static bool
243 midgard_nir_lower_fdot2(nir_shader *shader)
244 {
245 return nir_shader_instructions_pass(shader,
246 midgard_nir_lower_fdot2_instr,
247 nir_metadata_block_index | nir_metadata_dominance,
248 NULL);
249 }
250
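/* Filter for nir_lower_alu_to_scalar: scalarize 64-bit ALU ops and the
 * high-multiply ops. */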
251 static bool
252 mdg_is_64(const nir_instr *instr, const void *_unused)
253 {
254 const nir_alu_instr *alu = nir_instr_as_alu(instr);
255
256 if (nir_dest_bit_size(alu->dest.dest) == 64)
257 return true;
258
259 switch (alu->op) {
260 case nir_op_umul_high:
261 case nir_op_imul_high:
262 return true;
263 default:
264 return false;
265 }
266 }
267
268 /* Only vectorize int64 up to vec2 */
269 static bool
270 midgard_vectorize_filter(const nir_instr *instr, void *data)
271 {
272 if (instr->type != nir_instr_type_alu)
273 return true;
274
275 const nir_alu_instr *alu = nir_instr_as_alu(instr);
276
277 unsigned num_components = alu->dest.dest.ssa.num_components;
278
279 int src_bit_size = nir_src_bit_size(alu->src[0].src);
280 int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
281
282 if (src_bit_size == 64 || dst_bit_size == 64) {
283 if (num_components > 1)
284 return false;
285 }
286
287 return true;
288 }
289
290
291 /* Flushes undefined values to zero */
292
293 static void
294 optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
295 {
296 bool progress;
297 unsigned lower_flrp =
298 (nir->options->lower_flrp16 ? 16 : 0) |
299 (nir->options->lower_flrp32 ? 32 : 0) |
300 (nir->options->lower_flrp64 ? 64 : 0);
301
302 NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
303 nir_lower_idiv_options idiv_options = {
304 .imprecise_32bit_lowering = true,
305 .allow_fp16 = true,
306 };
307 NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
308
309 nir_lower_tex_options lower_tex_options = {
310 .lower_txs_lod = true,
311 .lower_txp = ~0,
312 .lower_tg4_broadcom_swizzle = true,
313 /* TODO: we have native gradient.. */
314 .lower_txd = true,
315 };
316
317 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
318
319 /* Must lower fdot2 after tex is lowered */
320 NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
321
322 /* T720's texture LOD handling is broken; apply the errata workaround. */
323
324 if (quirks & MIDGARD_BROKEN_LOD)
325 NIR_PASS_V(nir, midgard_nir_lod_errata);
326
327 /* Midgard image ops coordinates are 16-bit instead of 32-bit */
328 NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize);
329 NIR_PASS(progress, nir, midgard_nir_lower_helper_writes);
330 NIR_PASS(progress, nir, pan_lower_helper_invocation);
331 NIR_PASS(progress, nir, pan_lower_sample_pos);
332
333 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
334
335 do {
336 progress = false;
337
338 NIR_PASS(progress, nir, nir_lower_var_copies);
339 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
340
341 NIR_PASS(progress, nir, nir_copy_prop);
342 NIR_PASS(progress, nir, nir_opt_remove_phis);
343 NIR_PASS(progress, nir, nir_opt_dce);
344 NIR_PASS(progress, nir, nir_opt_dead_cf);
345 NIR_PASS(progress, nir, nir_opt_cse);
346 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
347 NIR_PASS(progress, nir, nir_opt_algebraic);
348 NIR_PASS(progress, nir, nir_opt_constant_folding);
349
350 if (lower_flrp != 0) {
351 bool lower_flrp_progress = false;
352 NIR_PASS(lower_flrp_progress,
353 nir,
354 nir_lower_flrp,
355 lower_flrp,
356 false /* always_precise */);
357 if (lower_flrp_progress) {
358 NIR_PASS(progress, nir,
359 nir_opt_constant_folding);
360 progress = true;
361 }
362
363 /* Nothing should rematerialize any flrps, so we only
364 * need to do this lowering once.
365 */
366 lower_flrp = 0;
367 }
368
369 NIR_PASS(progress, nir, nir_opt_undef);
370 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
371
372 NIR_PASS(progress, nir, nir_opt_loop_unroll);
373
374 NIR_PASS(progress, nir, nir_opt_vectorize,
375 midgard_vectorize_filter, NULL);
376 } while (progress);
377
378 NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL);
379
380 /* Run after opts so it can hit more */
381 if (!is_blend)
382 NIR_PASS(progress, nir, nir_fuse_io_16);
383
384 /* Must be run at the end to prevent creation of fsin/fcos ops */
385 NIR_PASS(progress, nir, midgard_nir_scale_trig);
386
387 do {
388 progress = false;
389
390 NIR_PASS(progress, nir, nir_opt_dce);
391 NIR_PASS(progress, nir, nir_opt_algebraic);
392 NIR_PASS(progress, nir, nir_opt_constant_folding);
393 NIR_PASS(progress, nir, nir_copy_prop);
394 } while (progress);
395
396 NIR_PASS(progress, nir, nir_opt_algebraic_late);
397 NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods);
398
399 /* We implement booleans as 32-bit 0/~0 */
400 NIR_PASS(progress, nir, nir_lower_bool_to_int32);
401
402 /* Now that booleans are lowered, we can run out late opts */
403 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
404 NIR_PASS(progress, nir, midgard_nir_cancel_inot);
405
406 NIR_PASS(progress, nir, nir_copy_prop);
407 NIR_PASS(progress, nir, nir_opt_dce);
408
409 /* Backend scheduler is purely local, so do some global optimizations
410 * to reduce register pressure. */
411 nir_move_options move_all =
412 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
413 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
414
415 NIR_PASS_V(nir, nir_opt_sink, move_all);
416 NIR_PASS_V(nir, nir_opt_move, move_all);
417
418 /* Take us out of SSA */
419 NIR_PASS(progress, nir, nir_lower_locals_to_regs);
420 NIR_PASS(progress, nir, nir_convert_from_ssa, true);
421
422 /* We are a vector architecture; write combine where possible */
423 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
424 NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
425
426 NIR_PASS(progress, nir, nir_opt_dce);
427 }
428
429 /* Do not actually emit a load; instead, cache the constant for inlining */
430
431 static void
432 emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
433 {
434 nir_ssa_def def = instr->def;
435
436 midgard_constants *consts = rzalloc(ctx, midgard_constants);
437
438 assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8);
439
440 #define RAW_CONST_COPY(bits) \
441 nir_const_value_to_array(consts->u##bits, instr->value, \
442 instr->def.num_components, u##bits)
443
444 switch (instr->def.bit_size) {
445 case 64:
446 RAW_CONST_COPY(64);
447 break;
448 case 32:
449 RAW_CONST_COPY(32);
450 break;
451 case 16:
452 RAW_CONST_COPY(16);
453 break;
454 case 8:
455 RAW_CONST_COPY(8);
456 break;
457 default:
458 unreachable("Invalid bit_size for load_const instruction\n");
459 }
460
461 /* Shifted for SSA, +1 for off-by-one */
462 _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts);
463 }
464
465 /* Normally constants are embedded implicitly, but for I/O and such we have to
466 * explicitly emit a move with the constant source */
467
468 static void
469 emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to)
470 {
471 void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1);
472
473 if (constant_value) {
474 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to);
475 attach_constants(ctx, &ins, constant_value, node + 1);
476 emit_mir_instruction(ctx, ins);
477 }
478 }
479
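/* True unless the swizzle replicates a single component, i.e. the source is
 * effectively scalar. */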
480 static bool
481 nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
482 {
483 unsigned comp = src->swizzle[0];
484
485 for (unsigned c = 1; c < nr_components; ++c) {
486 if (src->swizzle[c] != comp)
487 return true;
488 }
489
490 return false;
491 }
492
493 #define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \
494 case nir_intrinsic_##nir: \
495 emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \
496 break;
497
498 #define ATOMIC_CASE(ctx, instr, nir, op) \
499 ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \
500 ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false);
501
502 #define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \
503 case nir_intrinsic_image_atomic_##nir: { \
504 midgard_instruction ins = emit_image_op(ctx, instr, true); \
505 emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \
506 break; \
507 }
508
509 #define ALU_CASE(nir, _op) \
510 case nir_op_##nir: \
511 op = midgard_alu_op_##_op; \
512 assert(src_bitsize == dst_bitsize); \
513 break;
514
515 #define ALU_CASE_RTZ(nir, _op) \
516 case nir_op_##nir: \
517 op = midgard_alu_op_##_op; \
518 roundmode = MIDGARD_RTZ; \
519 break;
520
521 #define ALU_CHECK_CMP() \
522 assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \
523 assert(dst_bitsize == 16 || dst_bitsize == 32); \
524
525 #define ALU_CASE_BCAST(nir, _op, count) \
526 case nir_op_##nir: \
527 op = midgard_alu_op_##_op; \
528 broadcast_swizzle = count; \
529 ALU_CHECK_CMP(); \
530 break;
531
532 #define ALU_CASE_CMP(nir, _op) \
533 case nir_op_##nir: \
534 op = midgard_alu_op_##_op; \
535 ALU_CHECK_CMP(); \
536 break;
537
538 /* Compare mir_lower_invert */
539 static bool
540 nir_accepts_inot(nir_op op, unsigned src)
541 {
542 switch (op) {
543 case nir_op_ior:
544 case nir_op_iand: /* TODO: b2f16 */
545 case nir_op_ixor:
546 return true;
547 case nir_op_b32csel:
548 /* Only the condition */
549 return (src == 0);
550 default:
551 return false;
552 }
553 }
554
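/* Tries to fold a destination modifier into this instruction; if it fits, the
 * modifier's own instruction is marked already-emitted so it is skipped later. */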
555 static bool
556 mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
557 {
558 if (pan_has_dest_mod(dest, op)) {
559 assert((*dest)->is_ssa);
560 BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
561 return true;
562 }
563
564 return false;
565 }
566
567 /* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,
568 * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):
569 *
570 * clamp_0_1(clamp_0_inf(x)) = clamp_0_1(x)
571 * clamp_0_1(clamp_m1_1(x)) = clamp_0_1(x)
572 * clamp_0_inf(clamp_0_1(x)) = clamp_0_1(x)
573 * clamp_0_inf(clamp_m1_1(x)) = clamp_0_1(x)
574 * clamp_m1_1(clamp_0_1(x)) = clamp_0_1(x)
575 * clamp_m1_1(clamp_0_inf(x)) = clamp_0_1(x)
576 *
577 * So by cases any composition of output modifiers is equivalent to
578 * clamp_0_1 alone.
579 */
580 static unsigned
581 mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)
582 {
583 bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);
584 bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);
585 bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);
586 bool prior = (prior_outmod != midgard_outmod_none);
587 int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;
588
589 return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 :
590 clamp_0_inf ? midgard_outmod_clamp_0_inf :
591 clamp_m1_1 ? midgard_outmod_clamp_m1_1 :
592 prior_outmod;
593 }
594
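/* Copies NIR ALU source i into MIR source slot `to`, folding abs/neg/not and
 * rounding-mode source modifiers and applying the broadcast swizzle. */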
595 static void
596 mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)
597 {
598 nir_alu_src src = instr->src[i];
599
600 if (!is_int) {
601 if (pan_has_source_mod(&src, nir_op_fneg))
602 *neg = !(*neg);
603
604 if (pan_has_source_mod(&src, nir_op_fabs))
605 *abs = true;
606 }
607
608 if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
609 *not = true;
610
611 if (roundmode) {
612 if (pan_has_source_mod(&src, nir_op_fround_even))
613 *roundmode = MIDGARD_RTE;
614
615 if (pan_has_source_mod(&src, nir_op_ftrunc))
616 *roundmode = MIDGARD_RTZ;
617
618 if (pan_has_source_mod(&src, nir_op_ffloor))
619 *roundmode = MIDGARD_RTN;
620
621 if (pan_has_source_mod(&src, nir_op_fceil))
622 *roundmode = MIDGARD_RTP;
623 }
624
625 unsigned bits = nir_src_bit_size(src.src);
626
627 ins->src[to] = nir_src_index(NULL, &src.src);
628 ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;
629
630 for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
631 ins->swizzle[to][c] = src.swizzle[
632 (!bcast_count || c < bcast_count) ? c :
633 (bcast_count - 1)];
634 }
635 }
636
637 /* Midgard features both fcsel and icsel, depending on whether you want int or
638 * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if
639 * we should emit an int or float csel depending on what modifiers could be
640 * placed. In the absence of modifiers, this is probably arbitrary. */
641
642 static bool
643 mir_is_bcsel_float(nir_alu_instr *instr)
644 {
645 nir_op intmods[] = {
646 nir_op_i2i8, nir_op_i2i16,
647 nir_op_i2i32, nir_op_i2i64
648 };
649
650 nir_op floatmods[] = {
651 nir_op_fabs, nir_op_fneg,
652 nir_op_f2f16, nir_op_f2f32,
653 nir_op_f2f64
654 };
655
656 nir_op floatdestmods[] = {
657 nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,
658 nir_op_f2f16, nir_op_f2f32
659 };
660
661 signed score = 0;
662
663 for (unsigned i = 1; i < 3; ++i) {
664 nir_alu_src s = instr->src[i];
665 for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {
666 if (pan_has_source_mod(&s, intmods[q]))
667 score--;
668 }
669 }
670
671 for (unsigned i = 1; i < 3; ++i) {
672 nir_alu_src s = instr->src[i];
673 for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {
674 if (pan_has_source_mod(&s, floatmods[q]))
675 score++;
676 }
677 }
678
679 for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {
680 nir_dest *dest = &instr->dest.dest;
681 if (pan_has_dest_mod(&dest, floatdestmods[q]))
682 score++;
683 }
684
685 return (score > 0);
686 }
687
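/* Translates a NIR ALU instruction into MIR, folding modifiers and fixing up
 * emulated ops afterwards. */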
688 static void
689 emit_alu(compiler_context *ctx, nir_alu_instr *instr)
690 {
691 nir_dest *dest = &instr->dest.dest;
692
693 if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))
694 return;
695
696 /* Derivatives end up emitted on the texture pipe, not the ALUs. This
697 * is handled elsewhere */
698
699 if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {
700 midgard_emit_derivatives(ctx, instr);
701 return;
702 }
703
704 bool is_ssa = dest->is_ssa;
705
706 unsigned nr_components = nir_dest_num_components(*dest);
707 unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
708 unsigned op = 0;
709
710 /* Number of components valid to check for the instruction (the rest
711 * will be forced to the last), or 0 to use as-is. Relevant as
712 * ball-type instructions have a channel count in NIR but are all vec4
713 * in Midgard */
714
715 unsigned broadcast_swizzle = 0;
716
717 /* Should we swap arguments? */
718 bool flip_src12 = false;
719
720 ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
721 ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest);
722
723 enum midgard_roundmode roundmode = MIDGARD_RTE;
724
725 switch (instr->op) {
726 ALU_CASE(fadd, fadd);
727 ALU_CASE(fmul, fmul);
728 ALU_CASE(fmin, fmin);
729 ALU_CASE(fmax, fmax);
730 ALU_CASE(imin, imin);
731 ALU_CASE(imax, imax);
732 ALU_CASE(umin, umin);
733 ALU_CASE(umax, umax);
734 ALU_CASE(ffloor, ffloor);
735 ALU_CASE(fround_even, froundeven);
736 ALU_CASE(ftrunc, ftrunc);
737 ALU_CASE(fceil, fceil);
738 ALU_CASE(fdot3, fdot3);
739 ALU_CASE(fdot4, fdot4);
740 ALU_CASE(iadd, iadd);
741 ALU_CASE(isub, isub);
742 ALU_CASE(iadd_sat, iaddsat);
743 ALU_CASE(isub_sat, isubsat);
744 ALU_CASE(uadd_sat, uaddsat);
745 ALU_CASE(usub_sat, usubsat);
746 ALU_CASE(imul, imul);
747 ALU_CASE(imul_high, imul);
748 ALU_CASE(umul_high, imul);
749 ALU_CASE(uclz, iclz);
750
751 /* Zero shoved as second-arg */
752 ALU_CASE(iabs, iabsdiff);
753
754 ALU_CASE(uabs_isub, iabsdiff);
755 ALU_CASE(uabs_usub, uabsdiff);
756
757 ALU_CASE(mov, imov);
758
759 ALU_CASE_CMP(feq32, feq);
760 ALU_CASE_CMP(fneu32, fne);
761 ALU_CASE_CMP(flt32, flt);
762 ALU_CASE_CMP(ieq32, ieq);
763 ALU_CASE_CMP(ine32, ine);
764 ALU_CASE_CMP(ilt32, ilt);
765 ALU_CASE_CMP(ult32, ult);
766
767 /* We don't have a native b2f32 instruction. Instead, like many
768 * GPUs, we exploit booleans as 0/~0 for false/true, and
769 * correspondingly AND
770 * by 1.0 to do the type conversion. For the moment, prime us
771 * to emit:
772 *
773 * iand [whatever], #0
774 *
775 * At the end of emit_alu (as MIR), we'll fix-up the constant
776 */
777
778 ALU_CASE_CMP(b2f32, iand);
779 ALU_CASE_CMP(b2f16, iand);
780 ALU_CASE_CMP(b2i32, iand);
781
782 /* Likewise, we don't have a dedicated f2b32 instruction, but
783 * we can do a "not equal to 0.0" test. */
784
785 ALU_CASE_CMP(f2b32, fne);
786 ALU_CASE_CMP(i2b32, ine);
787
788 ALU_CASE(frcp, frcp);
789 ALU_CASE(frsq, frsqrt);
790 ALU_CASE(fsqrt, fsqrt);
791 ALU_CASE(fexp2, fexp2);
792 ALU_CASE(flog2, flog2);
793
794 ALU_CASE_RTZ(f2i64, f2i_rte);
795 ALU_CASE_RTZ(f2u64, f2u_rte);
796 ALU_CASE_RTZ(i2f64, i2f_rte);
797 ALU_CASE_RTZ(u2f64, u2f_rte);
798
799 ALU_CASE_RTZ(f2i32, f2i_rte);
800 ALU_CASE_RTZ(f2u32, f2u_rte);
801 ALU_CASE_RTZ(i2f32, i2f_rte);
802 ALU_CASE_RTZ(u2f32, u2f_rte);
803
804 ALU_CASE_RTZ(f2i8, f2i_rte);
805 ALU_CASE_RTZ(f2u8, f2u_rte);
806
807 ALU_CASE_RTZ(f2i16, f2i_rte);
808 ALU_CASE_RTZ(f2u16, f2u_rte);
809 ALU_CASE_RTZ(i2f16, i2f_rte);
810 ALU_CASE_RTZ(u2f16, u2f_rte);
811
812 ALU_CASE(fsin, fsinpi);
813 ALU_CASE(fcos, fcospi);
814
815 /* We'll get 0 in the second arg, so:
816 * ~a = ~(a | 0) = nor(a, 0) */
817 ALU_CASE(inot, inor);
818 ALU_CASE(iand, iand);
819 ALU_CASE(ior, ior);
820 ALU_CASE(ixor, ixor);
821 ALU_CASE(ishl, ishl);
822 ALU_CASE(ishr, iasr);
823 ALU_CASE(ushr, ilsr);
824
825 ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
826 ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
827 ALU_CASE_CMP(b32all_fequal4, fball_eq);
828
829 ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
830 ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
831 ALU_CASE_CMP(b32any_fnequal4, fbany_neq);
832
833 ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
834 ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
835 ALU_CASE_CMP(b32all_iequal4, iball_eq);
836
837 ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
838 ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
839 ALU_CASE_CMP(b32any_inequal4, ibany_neq);
840
841 /* Source mods will be shoved in later */
842 ALU_CASE(fabs, fmov);
843 ALU_CASE(fneg, fmov);
844 ALU_CASE(fsat, fmov);
845 ALU_CASE(fsat_signed_mali, fmov);
846 ALU_CASE(fclamp_pos_mali, fmov);
847
848 /* For size conversion, we use a move. Ideally though we would squash
849 * these ops together; maybe that has to happen after in NIR as part of
850 * propagation...? An earlier algebraic pass ensured we step down by
851 * only / exactly one size. If stepping down, we use a dest override to
852 * reduce the size; if stepping up, we use a larger-sized move with a
853 * half source and a sign/zero-extension modifier */
854
855 case nir_op_i2i8:
856 case nir_op_i2i16:
857 case nir_op_i2i32:
858 case nir_op_i2i64:
859 case nir_op_u2u8:
860 case nir_op_u2u16:
861 case nir_op_u2u32:
862 case nir_op_u2u64:
863 case nir_op_f2f16:
864 case nir_op_f2f32:
865 case nir_op_f2f64: {
866 if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 ||
867 instr->op == nir_op_f2f64)
868 op = midgard_alu_op_fmov;
869 else
870 op = midgard_alu_op_imov;
871
872 break;
873 }
874
875 /* For greater-or-equal, we lower to less-or-equal and flip the
876 * arguments */
877
878 case nir_op_fge:
879 case nir_op_fge32:
880 case nir_op_ige32:
881 case nir_op_uge32: {
882 op =
883 instr->op == nir_op_fge ? midgard_alu_op_fle :
884 instr->op == nir_op_fge32 ? midgard_alu_op_fle :
885 instr->op == nir_op_ige32 ? midgard_alu_op_ile :
886 instr->op == nir_op_uge32 ? midgard_alu_op_ule :
887 0;
888
889 flip_src12 = true;
890 ALU_CHECK_CMP();
891 break;
892 }
893
894 case nir_op_b32csel: {
895 bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
896 bool is_float = mir_is_bcsel_float(instr);
897 op = is_float ?
898 (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) :
899 (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel);
900
901 break;
902 }
903
904 case nir_op_unpack_32_2x16:
905 case nir_op_unpack_32_4x8:
906 case nir_op_pack_32_2x16:
907 case nir_op_pack_32_4x8: {
908 op = midgard_alu_op_imov;
909 break;
910 }
911
912 default:
913 DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
914 assert(0);
915 return;
916 }
917
918 /* Promote imov to fmov if it might help inline a constant */
919 if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src)
920 && nir_src_bit_size(instr->src[0].src) == 32
921 && nir_is_same_comp_swizzle(instr->src[0].swizzle,
922 nir_src_num_components(instr->src[0].src))) {
923 op = midgard_alu_op_fmov;
924 }
925
926 /* Midgard can perform certain modifiers on output of an ALU op */
927
928 unsigned outmod = 0;
929 bool is_int = midgard_is_integer_op(op);
930
931 if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) {
932 outmod = midgard_outmod_keephi;
933 } else if (midgard_is_integer_out_op(op)) {
934 outmod = midgard_outmod_keeplo;
935 } else if (instr->op == nir_op_fsat) {
936 outmod = midgard_outmod_clamp_0_1;
937 } else if (instr->op == nir_op_fsat_signed_mali) {
938 outmod = midgard_outmod_clamp_m1_1;
939 } else if (instr->op == nir_op_fclamp_pos_mali) {
940 outmod = midgard_outmod_clamp_0_inf;
941 }
942
943 /* Fetch unit, quirks, etc information */
944 unsigned opcode_props = alu_opcode_props[op].props;
945 bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
946
947 if (!midgard_is_integer_out_op(op)) {
948 outmod = mir_determine_float_outmod(ctx, &dest, outmod);
949 }
950
951 midgard_instruction ins = {
952 .type = TAG_ALU_4,
953 .dest = nir_dest_index(dest),
954 .dest_type = nir_op_infos[instr->op].output_type
955 | nir_dest_bit_size(*dest),
956 .roundmode = roundmode,
957 };
958
959 enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ?
960 &ins.roundmode : NULL;
961
962 for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)
963 ins.src[i] = ~0;
964
965 if (quirk_flipped_r24) {
966 ins.src[0] = ~0;
967 mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle);
968 } else {
969 for (unsigned i = 0; i < nr_inputs; ++i) {
970 unsigned to = i;
971
972 if (instr->op == nir_op_b32csel) {
973 /* The condition is the first argument; move
974 * the other arguments up one to be a binary
975 * instruction for Midgard with the condition
976 * last */
977
978 if (i == 0)
979 to = 2;
980 else if (flip_src12)
981 to = 2 - i;
982 else
983 to = i - 1;
984 } else if (flip_src12) {
985 to = 1 - to;
986 }
987
988 mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle);
989
990 /* (!c) ? a : b = c ? b : a */
991 if (instr->op == nir_op_b32csel && ins.src_invert[2]) {
992 ins.src_invert[2] = false;
993 flip_src12 ^= true;
994 }
995 }
996 }
997
998 if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
999 /* Lowered to move */
1000 if (instr->op == nir_op_fneg)
1001 ins.src_neg[1] ^= true;
1002
1003 if (instr->op == nir_op_fabs)
1004 ins.src_abs[1] = true;
1005 }
1006
1007 ins.mask = mask_of(nr_components);
1008
1009 /* Apply writemask if non-SSA, keeping in mind that we can't write to
1010 * components that don't exist. Note modifier => SSA => !reg => no
1011 * writemask, so we don't have to worry about writemasks here. */
1012
1013 if (!is_ssa)
1014 ins.mask &= instr->dest.write_mask;
1015
1016 ins.op = op;
1017 ins.outmod = outmod;
1018
1019 /* Late fixup for emulated instructions */
1020
1021 if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
1022 /* Presently, our second argument is an inline #0 constant.
1023 * Switch over to an embedded 1.0 constant (that can't fit
1024 * inline, since we're 32-bit, not 16-bit like the inline
1025 * constants) */
1026
1027 ins.has_inline_constant = false;
1028 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1029 ins.src_types[1] = nir_type_float32;
1030 ins.has_constants = true;
1031
1032 if (instr->op == nir_op_b2f32)
1033 ins.constants.f32[0] = 1.0f;
1034 else
1035 ins.constants.i32[0] = 1;
1036
1037 for (unsigned c = 0; c < 16; ++c)
1038 ins.swizzle[1][c] = 0;
1039 } else if (instr->op == nir_op_b2f16) {
1040 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1041 ins.src_types[1] = nir_type_float16;
1042 ins.has_constants = true;
1043 ins.constants.i16[0] = _mesa_float_to_half(1.0);
1044
1045 for (unsigned c = 0; c < 16; ++c)
1046 ins.swizzle[1][c] = 0;
1047 } else if (nr_inputs == 1 && !quirk_flipped_r24) {
1048 /* Lots of instructions need a 0 plonked in */
1049 ins.has_inline_constant = false;
1050 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1051 ins.src_types[1] = ins.src_types[0];
1052 ins.has_constants = true;
1053 ins.constants.u32[0] = 0;
1054
1055 for (unsigned c = 0; c < 16; ++c)
1056 ins.swizzle[1][c] = 0;
1057 } else if (instr->op == nir_op_pack_32_2x16) {
1058 ins.dest_type = nir_type_uint16;
1059 ins.mask = mask_of(nr_components * 2);
1060 ins.is_pack = true;
1061 } else if (instr->op == nir_op_pack_32_4x8) {
1062 ins.dest_type = nir_type_uint8;
1063 ins.mask = mask_of(nr_components * 4);
1064 ins.is_pack = true;
1065 } else if (instr->op == nir_op_unpack_32_2x16) {
1066 ins.dest_type = nir_type_uint32;
1067 ins.mask = mask_of(nr_components >> 1);
1068 ins.is_pack = true;
1069 } else if (instr->op == nir_op_unpack_32_4x8) {
1070 ins.dest_type = nir_type_uint32;
1071 ins.mask = mask_of(nr_components >> 2);
1072 ins.is_pack = true;
1073 }
1074
1075 if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
1076 /* To avoid duplicating the lookup tables (probably), true LUT
1077 * instructions can only operate as if they were scalars. Lower
1078 * them here by changing the component. */
1079
1080 unsigned orig_mask = ins.mask;
1081
1082 unsigned swizzle_back[MIR_VEC_COMPONENTS];
1083 memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back));
1084
1085 midgard_instruction ins_split[MIR_VEC_COMPONENTS];
1086 unsigned ins_count = 0;
1087
1088 for (int i = 0; i < nr_components; ++i) {
1089 /* Mask the associated component, dropping the
1090 * instruction if needed */
1091
1092 ins.mask = 1 << i;
1093 ins.mask &= orig_mask;
1094
1095 for (unsigned j = 0; j < ins_count; ++j) {
1096 if (swizzle_back[i] == ins_split[j].swizzle[0][0]) {
1097 ins_split[j].mask |= ins.mask;
1098 ins.mask = 0;
1099 break;
1100 }
1101 }
1102
1103 if (!ins.mask)
1104 continue;
1105
1106 for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j)
1107 ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */
1108
1109 ins_split[ins_count] = ins;
1110
1111 ++ins_count;
1112 }
1113
1114 for (unsigned i = 0; i < ins_count; ++i) {
1115 emit_mir_instruction(ctx, ins_split[i]);
1116 }
1117 } else {
1118 emit_mir_instruction(ctx, ins);
1119 }
1120 }
1121
1122 #undef ALU_CASE
1123
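/* Derives the MIR mask and destination type from the NIR intrinsic's
 * destination components (reads) or write mask (writes). */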
1124 static void
1125 mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
1126 {
1127 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1128 unsigned nir_mask = 0;
1129 unsigned dsize = 0;
1130
1131 if (is_read) {
1132 nir_mask = mask_of(nir_intrinsic_dest_components(intr));
1133 dsize = nir_dest_bit_size(intr->dest);
1134 } else {
1135 nir_mask = nir_intrinsic_write_mask(intr);
1136 dsize = 32;
1137 }
1138
1139 /* Once we have the NIR mask, we need to normalize to work in 32-bit space */
1140 unsigned bytemask = pan_to_bytemask(dsize, nir_mask);
1141 ins->dest_type = nir_type_uint | dsize;
1142 mir_set_bytemask(ins, bytemask);
1143 }
1144
1145 /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
1146 * optimized) versions of UBO #0 */
1147
1148 static midgard_instruction *
1149 emit_ubo_read(
1150 compiler_context *ctx,
1151 nir_instr *instr,
1152 unsigned dest,
1153 unsigned offset,
1154 nir_src *indirect_offset,
1155 unsigned indirect_shift,
1156 unsigned index,
1157 unsigned nr_comps)
1158 {
1159 midgard_instruction ins;
1160
1161 unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ?
1162 nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32;
1163
1164 unsigned bitsize = dest_size * nr_comps;
1165
1166 /* Pick the smallest intrinsic to avoid out-of-bounds reads */
1167 if (bitsize <= 32)
1168 ins = m_ld_ubo_32(dest, 0);
1169 else if (bitsize <= 64)
1170 ins = m_ld_ubo_64(dest, 0);
1171 else if (bitsize <= 128)
1172 ins = m_ld_ubo_128(dest, 0);
1173 else
1174 unreachable("Invalid UBO read size");
1175
1176 ins.constants.u32[0] = offset;
1177
1178 if (instr->type == nir_instr_type_intrinsic)
1179 mir_set_intr_mask(instr, &ins, true);
1180
1181 if (indirect_offset) {
1182 ins.src[2] = nir_src_index(ctx, indirect_offset);
1183 ins.src_types[2] = nir_type_uint32;
1184 ins.load_store.index_shift = indirect_shift;
1185
1186 /* X component for the whole swizzle to prevent register
1187 * pressure from ballooning from the extra components */
1188 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i)
1189 ins.swizzle[2][i] = 0;
1190 } else {
1191 ins.load_store.index_reg = REGISTER_LDST_ZERO;
1192 }
1193
1194 if (indirect_offset && indirect_offset->is_ssa && !indirect_shift)
1195 mir_set_ubo_offset(&ins, indirect_offset, offset);
1196
1197 midgard_pack_ubo_index_imm(&ins.load_store, index);
1198
1199 return emit_mir_instruction(ctx, ins);
1200 }
1201
1202 /* Globals are like UBOs if you squint. And shared memory is like globals if
1203 * you squint even harder */
1204
1205 static void
1206 emit_global(
1207 compiler_context *ctx,
1208 nir_instr *instr,
1209 bool is_read,
1210 unsigned srcdest,
1211 nir_src *offset,
1212 unsigned seg)
1213 {
1214 midgard_instruction ins;
1215
1216 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1217 if (is_read) {
1218 unsigned bitsize = nir_dest_bit_size(intr->dest) *
1219 nir_dest_num_components(intr->dest);
1220
1221 if (bitsize <= 32)
1222 ins = m_ld_32(srcdest, 0);
1223 else if (bitsize <= 64)
1224 ins = m_ld_64(srcdest, 0);
1225 else if (bitsize <= 128)
1226 ins = m_ld_128(srcdest, 0);
1227 else
1228 unreachable("Invalid global read size");
1229 } else {
1230 unsigned bitsize = nir_src_bit_size(intr->src[0]) *
1231 nir_src_num_components(intr->src[0]);
1232
1233 if (bitsize <= 32)
1234 ins = m_st_32(srcdest, 0);
1235 else if (bitsize <= 64)
1236 ins = m_st_64(srcdest, 0);
1237 else if (bitsize <= 128)
1238 ins = m_st_128(srcdest, 0);
1239 else
1240 unreachable("Invalid global store size");
1241 }
1242
1243 mir_set_offset(ctx, &ins, offset, seg);
1244 mir_set_intr_mask(instr, &ins, is_read);
1245
1246 /* Set a valid swizzle for masked out components */
1247 assert(ins.mask);
1248 unsigned first_component = __builtin_ffs(ins.mask) - 1;
1249
1250 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) {
1251 if (!(ins.mask & (1 << i)))
1252 ins.swizzle[0][i] = first_component;
1253 }
1254
1255 emit_mir_instruction(ctx, ins);
1256 }
1257
1258 /* If is_shared is off, the only other possibility is global memory, since
1259 * SSBOs are lowered to globals through a NIR pass.
1260 * `image_direct_address` should be ~0 when instr is not an image_atomic
1261 * and the destination register of a lea_image op when it is an image_atomic. */
1262 static void
1263 emit_atomic(
1264 compiler_context *ctx,
1265 nir_intrinsic_instr *instr,
1266 bool is_shared,
1267 midgard_load_store_op op,
1268 unsigned image_direct_address)
1269 {
1270 nir_alu_type type =
1271 (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ?
1272 nir_type_int : nir_type_uint;
1273
1274 bool is_image = image_direct_address != ~0;
1275
1276 unsigned dest = nir_dest_index(&instr->dest);
1277 unsigned val_src = is_image ? 3 : 1;
1278 unsigned val = nir_src_index(ctx, &instr->src[val_src]);
1279 unsigned bitsize = nir_src_bit_size(instr->src[val_src]);
1280 emit_explicit_constant(ctx, val, val);
1281
1282 midgard_instruction ins = {
1283 .type = TAG_LOAD_STORE_4,
1284 .mask = 0xF,
1285 .dest = dest,
1286 .src = { ~0, ~0, ~0, val },
1287 .src_types = { 0, 0, 0, type | bitsize },
1288 .op = op
1289 };
1290
1291 nir_src *src_offset = nir_get_io_offset_src(instr);
1292
1293 if (op == midgard_op_atomic_cmpxchg) {
1294 unsigned xchg_val_src = is_image ? 4 : 2;
1295 unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]);
1296 emit_explicit_constant(ctx, xchg_val, xchg_val);
1297
1298 ins.src[2] = val;
1299 ins.src_types[2] = type | bitsize;
1300 ins.src[3] = xchg_val;
1301
1302 if (is_shared) {
1303 ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR;
1304 ins.load_store.arg_comp = COMPONENT_Z;
1305 ins.load_store.bitsize_toggle = true;
1306 } else {
1307 for(unsigned i = 0; i < 2; ++i)
1308 ins.swizzle[1][i] = i;
1309
1310 ins.src[1] = is_image ? image_direct_address :
1311 nir_src_index(ctx, src_offset);
1312 ins.src_types[1] = nir_type_uint64;
1313 }
1314 } else if (is_image) {
1315 for(unsigned i = 0; i < 2; ++i)
1316 ins.swizzle[2][i] = i;
1317
1318 ins.src[2] = image_direct_address;
1319 ins.src_types[2] = nir_type_uint64;
1320
1321 ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1322 ins.load_store.bitsize_toggle = true;
1323 ins.load_store.index_format = midgard_index_address_u64;
1324 } else
1325 mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL);
1326
1327 mir_set_intr_mask(&instr->instr, &ins, true);
1328
1329 emit_mir_instruction(ctx, ins);
1330 }
1331
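/* Emits a varying load, picking the typed ld_vary opcode and packing the
 * interpolation parameters. */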
1332 static void
1333 emit_varying_read(
1334 compiler_context *ctx,
1335 unsigned dest, unsigned offset,
1336 unsigned nr_comp, unsigned component,
1337 nir_src *indirect_offset, nir_alu_type type, bool flat)
1338 {
1339 /* XXX: Half-floats? */
1340 /* TODO: swizzle, mask */
1341
1342 midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1343 ins.mask = mask_of(nr_comp);
1344 ins.dest_type = type;
1345
1346 if (type == nir_type_float16) {
1347 /* Ensure we are aligned so we can pack it later */
1348 ins.mask = mask_of(ALIGN_POT(nr_comp, 2));
1349 }
1350
1351 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)
1352 ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);
1353
1354
1355 midgard_varying_params p = {
1356 .flat_shading = flat,
1357 .perspective_correction = 1,
1358 .interpolate_sample = true,
1359 };
1360 midgard_pack_varying_params(&ins.load_store, p);
1361
1362 if (indirect_offset) {
1363 ins.src[2] = nir_src_index(ctx, indirect_offset);
1364 ins.src_types[2] = nir_type_uint32;
1365 } else
1366 ins.load_store.index_reg = REGISTER_LDST_ZERO;
1367
1368 ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1369 ins.load_store.index_format = midgard_index_address_u32;
1370
1371 /* Use the type appropriate load */
1372 switch (type) {
1373 case nir_type_uint32:
1374 case nir_type_bool32:
1375 ins.op = midgard_op_ld_vary_32u;
1376 break;
1377 case nir_type_int32:
1378 ins.op = midgard_op_ld_vary_32i;
1379 break;
1380 case nir_type_float32:
1381 ins.op = midgard_op_ld_vary_32;
1382 break;
1383 case nir_type_float16:
1384 ins.op = midgard_op_ld_vary_16;
1385 break;
1386 default:
1387 unreachable("Attempted to load unknown type");
1388 break;
1389 }
1390
1391 emit_mir_instruction(ctx, ins);
1392 }
1393
1394
1395 /* If `is_atomic` is true, we emit a `lea_image` since Midgard doesn't have dedicated
1396 * image_atomic opcodes. The caller can then use that address to emit a normal atomic opcode. */
1397 static midgard_instruction
1398 emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)
1399 {
1400 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1401 unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?
1402 util_bitcount64(ctx->nir->info.inputs_read) : 0;
1403 unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1404 bool is_array = nir_intrinsic_image_array(instr);
1405 bool is_store = instr->intrinsic == nir_intrinsic_image_store;
1406
1407 /* TODO: MSAA */
1408 assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1409
1410 unsigned coord_reg = nir_src_index(ctx, &instr->src[1]);
1411 emit_explicit_constant(ctx, coord_reg, coord_reg);
1412
1413 nir_src *index = &instr->src[0];
1414 bool is_direct = nir_src_is_const(*index);
1415
1416 /* For image opcodes, address is used as an index into the attribute descriptor */
1417 unsigned address = nr_attr;
1418 if (is_direct)
1419 address += nir_src_as_uint(*index);
1420
1421 midgard_instruction ins;
1422 if (is_store) { /* emit st_image_* */
1423 unsigned val = nir_src_index(ctx, &instr->src[3]);
1424 emit_explicit_constant(ctx, val, val);
1425
1426 nir_alu_type type = nir_intrinsic_src_type(instr);
1427 ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address));
1428 nir_alu_type base_type = nir_alu_type_get_base_type(type);
1429 ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]);
1430 } else if (is_atomic) { /* emit lea_image */
1431 unsigned dest = make_compiler_temp_reg(ctx);
1432 ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address));
1433 ins.mask = mask_of(2); /* 64-bit memory address */
1434 } else { /* emit ld_image_* */
1435 nir_alu_type type = nir_intrinsic_dest_type(instr);
1436 ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address));
1437 ins.mask = mask_of(nir_intrinsic_dest_components(instr));
1438 ins.dest_type = type;
1439 }
1440
1441 /* Coord reg */
1442 ins.src[1] = coord_reg;
1443 ins.src_types[1] = nir_type_uint16;
1444 if (nr_dim == 3 || is_array) {
1445 ins.load_store.bitsize_toggle = true;
1446 }
1447
1448 /* Image index reg */
1449 if (!is_direct) {
1450 ins.src[2] = nir_src_index(ctx, index);
1451 ins.src_types[2] = nir_type_uint32;
1452 } else
1453 ins.load_store.index_reg = REGISTER_LDST_ZERO;
1454
1455 emit_mir_instruction(ctx, ins);
1456
1457 return ins;
1458 }
1459
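/* Emits an attribute load with the type-appropriate ld_attr opcode. */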
1460 static void
1461 emit_attr_read(
1462 compiler_context *ctx,
1463 unsigned dest, unsigned offset,
1464 unsigned nr_comp, nir_alu_type t)
1465 {
1466 midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1467 ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1468 ins.load_store.index_reg = REGISTER_LDST_ZERO;
1469 ins.mask = mask_of(nr_comp);
1470
1471 /* Use the type appropriate load */
1472 switch (t) {
1473 case nir_type_uint:
1474 case nir_type_bool:
1475 ins.op = midgard_op_ld_attr_32u;
1476 break;
1477 case nir_type_int:
1478 ins.op = midgard_op_ld_attr_32i;
1479 break;
1480 case nir_type_float:
1481 ins.op = midgard_op_ld_attr_32;
1482 break;
1483 default:
1484 unreachable("Attempted to load unknown type");
1485 break;
1486 }
1487
1488 emit_mir_instruction(ctx, ins);
1489 }
1490
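/* Sysvals live in a dedicated UBO; look up the sysval's slot and emit an
 * (always direct) UBO read. */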
1491 static void
1492 emit_sysval_read(compiler_context *ctx, nir_instr *instr,
1493 unsigned nr_components, unsigned offset)
1494 {
1495 nir_dest nir_dest;
1496
1497 /* Figure out which uniform this is */
1498 unsigned sysval_ubo =
1499 MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos);
1500 int sysval = panfrost_sysval_for_instr(instr, &nir_dest);
1501 unsigned dest = nir_dest_index(&nir_dest);
1502 unsigned uniform =
1503 pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval);
1504
1505 /* Emit the read itself -- this is never indirect */
1506 midgard_instruction *ins =
1507 emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
1508 sysval_ubo, nr_components);
1509
1510 ins->mask = mask_of(nr_components);
1511 }
1512
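/* Maps a compute builtin intrinsic to the load/store argument register that
 * holds it. */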
1513 static unsigned
1514 compute_builtin_arg(nir_intrinsic_op op)
1515 {
1516 switch (op) {
1517 case nir_intrinsic_load_workgroup_id:
1518 return REGISTER_LDST_GROUP_ID;
1519 case nir_intrinsic_load_local_invocation_id:
1520 return REGISTER_LDST_LOCAL_THREAD_ID;
1521 case nir_intrinsic_load_global_invocation_id:
1522 case nir_intrinsic_load_global_invocation_id_zero_base:
1523 return REGISTER_LDST_GLOBAL_THREAD_ID;
1524 default:
1525 unreachable("Invalid compute paramater loaded");
1526 }
1527 }
1528
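/* Emits the writeout branch for a store to render target `rt`, recording it in
 * ctx->writeout_branch[rt][sample_iter]. */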
1529 static void
1530 emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,
1531 enum midgard_rt_id rt, unsigned sample_iter)
1532 {
1533 assert(rt < ARRAY_SIZE(ctx->writeout_branch));
1534 assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));
1535
1536 midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
1537
1538 assert(!br);
1539
1540 emit_explicit_constant(ctx, src, src);
1541
1542 struct midgard_instruction ins =
1543 v_branch(false, false);
1544
1545 bool depth_only = (rt == MIDGARD_ZS_RT);
1546
1547 ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;
1548
1549 /* Add dependencies */
1550 ins.src[0] = src;
1551 ins.src_types[0] = nir_type_uint32;
1552
1553 if (depth_only)
1554 ins.constants.u32[0] = 0xFF;
1555 else
1556 ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;
1557
1558 for (int i = 0; i < 4; ++i)
1559 ins.swizzle[0][i] = i;
1560
1561 if (~src_z) {
1562 emit_explicit_constant(ctx, src_z, src_z);
1563 ins.src[2] = src_z;
1564 ins.src_types[2] = nir_type_uint32;
1565 ins.writeout |= PAN_WRITEOUT_Z;
1566 }
1567 if (~src_s) {
1568 emit_explicit_constant(ctx, src_s, src_s);
1569 ins.src[3] = src_s;
1570 ins.src_types[3] = nir_type_uint32;
1571 ins.writeout |= PAN_WRITEOUT_S;
1572 }
1573
1574 /* Emit the branch */
1575 br = emit_mir_instruction(ctx, ins);
1576 schedule_barrier(ctx);
1577 ctx->writeout_branch[rt][sample_iter] = br;
1578
1579 /* Push our current location = current block count - 1 = where we'll
1580 * jump to. Maybe a bit too clever for my own good */
1581
1582 br->branch.target_block = ctx->block_count - 1;
1583 }
1584
1585 static void
1586 emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1587 {
1588 unsigned reg = nir_dest_index(&instr->dest);
1589 midgard_instruction ins = m_ldst_mov(reg, 0);
1590 ins.mask = mask_of(3);
1591 ins.swizzle[0][3] = COMPONENT_X; /* xyzx */
1592 ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic);
1593 emit_mir_instruction(ctx, ins);
1594 }
1595
1596 static unsigned
1597 vertex_builtin_arg(nir_intrinsic_op op)
1598 {
1599 switch (op) {
1600 case nir_intrinsic_load_vertex_id_zero_base:
1601 return PAN_VERTEX_ID;
1602 case nir_intrinsic_load_instance_id:
1603 return PAN_INSTANCE_ID;
1604 default:
1605 unreachable("Invalid vertex builtin");
1606 }
1607 }
1608
1609 static void
1610 emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1611 {
1612 unsigned reg = nir_dest_index(&instr->dest);
1613 emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int);
1614 }
1615
1616 static void
1617 emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx)
1618 {
1619 unsigned reg = nir_dest_index(&instr->dest);
1620
1621 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1622 ld.op = midgard_op_ld_special_32u;
1623 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx);
1624 ld.load_store.index_reg = REGISTER_LDST_ZERO;
1625
1626 for (int i = 0; i < 4; ++i)
1627 ld.swizzle[0][i] = COMPONENT_X;
1628
1629 emit_mir_instruction(ctx, ld);
1630 }
1631
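/* Control barriers are encoded as a texture-pipeline barrier op with no
 * sources or destination. */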
1632 static void
1633 emit_control_barrier(compiler_context *ctx)
1634 {
1635 midgard_instruction ins = {
1636 .type = TAG_TEXTURE_4,
1637 .dest = ~0,
1638 .src = { ~0, ~0, ~0, ~0 },
1639 .op = midgard_tex_op_barrier,
1640 };
1641
1642 emit_mir_instruction(ctx, ins);
1643 }
1644
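/* Fetches a branch condition, folding an inot source modifier into the
 * branch's invert flag. */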
1645 static unsigned
1646 mir_get_branch_cond(nir_src *src, bool *invert)
1647 {
1648 /* Wrap it. No swizzle since it's a scalar */
1649
1650 nir_alu_src alu = {
1651 .src = *src
1652 };
1653
1654 *invert = pan_has_source_mod(&alu, nir_op_inot);
1655 return nir_src_index(NULL, &alu.src);
1656 }
1657
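/* Determines which render target (or depth/stencil) an output load reads back
 * from. */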
1658 static uint8_t
1659 output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr)
1660 {
1661 if (ctx->inputs->is_blend)
1662 return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt;
1663
1664 const nir_variable *var;
1665 var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr));
1666 assert(var);
1667
1668 unsigned loc = var->data.location;
1669
1670 if (loc >= FRAG_RESULT_DATA0)
1671 return loc - FRAG_RESULT_DATA0;
1672
1673 if (loc == FRAG_RESULT_DEPTH)
1674 return 0x1F;
1675 if (loc == FRAG_RESULT_STENCIL)
1676 return 0x1E;
1677
1678 unreachable("Invalid RT to load from");
1679 }
1680
1681 static void
1682 emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
1683 {
1684 unsigned offset = 0, reg;
1685
1686 switch (instr->intrinsic) {
1687 case nir_intrinsic_discard_if:
1688 case nir_intrinsic_discard: {
1689 bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
1690 struct midgard_instruction discard = v_branch(conditional, false);
1691 discard.branch.target_type = TARGET_DISCARD;
1692
1693 if (conditional) {
1694 discard.src[0] = mir_get_branch_cond(&instr->src[0],
1695 &discard.branch.invert_conditional);
1696 discard.src_types[0] = nir_type_uint32;
1697 }
1698
1699 emit_mir_instruction(ctx, discard);
1700 schedule_barrier(ctx);
1701
1702 break;
1703 }
1704
1705 case nir_intrinsic_image_load:
1706 case nir_intrinsic_image_store:
1707 emit_image_op(ctx, instr, false);
1708 break;
1709
1710 case nir_intrinsic_image_size: {
1711 unsigned nr_comp = nir_intrinsic_dest_components(instr);
1712 emit_sysval_read(ctx, &instr->instr, nr_comp, 0);
1713 break;
1714 }
1715
1716 case nir_intrinsic_load_ubo:
1717 case nir_intrinsic_load_global:
1718 case nir_intrinsic_load_global_constant:
1719 case nir_intrinsic_load_shared:
1720 case nir_intrinsic_load_scratch:
1721 case nir_intrinsic_load_input:
1722 case nir_intrinsic_load_kernel_input:
1723 case nir_intrinsic_load_interpolated_input: {
1724 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
1725 bool is_global = instr->intrinsic == nir_intrinsic_load_global ||
1726 instr->intrinsic == nir_intrinsic_load_global_constant;
1727 bool is_shared = instr->intrinsic == nir_intrinsic_load_shared;
1728 bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch;
1729 bool is_flat = instr->intrinsic == nir_intrinsic_load_input;
1730 bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input;
1731 bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input;
1732
1733 /* Get the base type of the intrinsic */
1734 /* TODO: Infer type? Does it matter? */
1735 nir_alu_type t =
1736 (is_interp) ? nir_type_float :
1737 (is_flat) ? nir_intrinsic_dest_type(instr) :
1738 nir_type_uint;
1739
1740 t = nir_alu_type_get_base_type(t);
1741
1742 if (!(is_ubo || is_global || is_scratch)) {
1743 offset = nir_intrinsic_base(instr);
1744 }
1745
1746 unsigned nr_comp = nir_intrinsic_dest_components(instr);
1747
1748 nir_src *src_offset = nir_get_io_offset_src(instr);
1749
1750 bool direct = nir_src_is_const(*src_offset);
1751 nir_src *indirect_offset = direct ? NULL : src_offset;
1752
1753 if (direct)
1754 offset += nir_src_as_uint(*src_offset);
1755
1756 /* We may need to apply a fractional offset */
1757 int component = (is_flat || is_interp) ?
1758 nir_intrinsic_component(instr) : 0;
1759 reg = nir_dest_index(&instr->dest);
1760
1761 if (is_kernel) {
1762 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp);
1763 } else if (is_ubo) {
1764 nir_src index = instr->src[0];
1765
1766 /* TODO: Is indirect block number possible? */
1767 assert(nir_src_is_const(index));
1768
1769 uint32_t uindex = nir_src_as_uint(index);
1770 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp);
1771 } else if (is_global || is_shared || is_scratch) {
1772 unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
1773 emit_global(ctx, &instr->instr, true, reg, src_offset, seg);
1774 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) {
1775 emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat);
1776 } else if (ctx->inputs->is_blend) {
1777 /* ctx->blend_input will be precoloured to r0/r2, where
1778 * the input is preloaded */
1779
1780 unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input;
1781
1782 if (*input == ~0)
1783 *input = reg;
1784 else
1785 emit_mir_instruction(ctx, v_mov(*input, reg));
1786 } else if (ctx->stage == MESA_SHADER_VERTEX) {
1787 emit_attr_read(ctx, reg, offset, nr_comp, t);
1788 } else {
1789 DBG("Unknown load\n");
1790 assert(0);
1791 }
1792
1793 break;
1794 }
1795
1796 /* Handled together with load_interpolated_input */
1797 case nir_intrinsic_load_barycentric_pixel:
1798 case nir_intrinsic_load_barycentric_centroid:
1799 case nir_intrinsic_load_barycentric_sample:
1800 break;
1801
1802 /* Reads a 128-bit value raw off the tilebuffer during blending, tasty */
1803
1804 case nir_intrinsic_load_raw_output_pan: {
1805 reg = nir_dest_index(&instr->dest);
1806
1807 /* T720 and below use different blend opcodes with slightly
1808 * different semantics than T760 and up */
1809
1810 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1811
1812 unsigned target = output_load_rt_addr(ctx, instr);
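/* The RT address is split between the index argument's 2-bit
 * component select and its register field */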
1813 ld.load_store.index_comp = target & 0x3;
1814 ld.load_store.index_reg = target >> 2;
1815
1816 if (nir_src_is_const(instr->src[0])) {
1817 unsigned sample = nir_src_as_uint(instr->src[0]);
1818 ld.load_store.arg_comp = sample & 0x3;
1819 ld.load_store.arg_reg = sample >> 2;
1820 } else {
1821 /* Enable sample index via register. */
1822 ld.load_store.signed_offset |= 1;
1823 ld.src[1] = nir_src_index(ctx, &instr->src[0]);
1824 ld.src_types[1] = nir_type_int32;
1825 }
1826
1827 if (ctx->quirks & MIDGARD_OLD_BLEND) {
1828 ld.op = midgard_op_ld_special_32u;
1829 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16);
1830 ld.load_store.index_reg = REGISTER_LDST_ZERO;
1831 }
1832
1833 emit_mir_instruction(ctx, ld);
1834 break;
1835 }
1836
1837 case nir_intrinsic_load_output: {
1838 reg = nir_dest_index(&instr->dest);
1839
1840 unsigned bits = nir_dest_bit_size(instr->dest);
1841
1842 midgard_instruction ld;
1843 if (bits == 16)
1844 ld = m_ld_tilebuffer_16f(reg, 0);
1845 else
1846 ld = m_ld_tilebuffer_32f(reg, 0);
1847
1848 unsigned index = output_load_rt_addr(ctx, instr);
1849 ld.load_store.index_comp = index & 0x3;
1850 ld.load_store.index_reg = index >> 2;
1851
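/* Only a vec4 comes back from the tilebuffer; pin the unused upper
 * swizzle lanes to the first component */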
1852 for (unsigned c = 4; c < 16; ++c)
1853 ld.swizzle[0][c] = 0;
1854
1855 if (ctx->quirks & MIDGARD_OLD_BLEND) {
1856 if (bits == 16)
1857 ld.op = midgard_op_ld_special_16f;
1858 else
1859 ld.op = midgard_op_ld_special_32f;
1860 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1);
1861 ld.load_store.index_reg = REGISTER_LDST_ZERO;
1862 }
1863
1864 emit_mir_instruction(ctx, ld);
1865 break;
1866 }
1867
1868 case nir_intrinsic_store_output:
1869 case nir_intrinsic_store_combined_output_pan:
1870 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
1871
1872 offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
1873
1874 reg = nir_src_index(ctx, &instr->src[0]);
1875
1876 if (ctx->stage == MESA_SHADER_FRAGMENT) {
1877 bool combined = instr->intrinsic ==
1878 nir_intrinsic_store_combined_output_pan;
1879
1880 const nir_variable *var;
1881 var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,
1882 nir_intrinsic_base(instr));
1883 assert(var);
1884
1885 /* Dual-source blend writeout is done by leaving the
1886 * value in r2 for the blend shader to use. */
1887 if (var->data.index) {
1888 if (instr->src[0].is_ssa) {
1889 emit_explicit_constant(ctx, reg, reg);
1890
1891 unsigned out = make_compiler_temp(ctx);
1892
1893 midgard_instruction ins = v_mov(reg, out);
1894 emit_mir_instruction(ctx, ins);
1895
1896 ctx->blend_src1 = out;
1897 } else {
1898 ctx->blend_src1 = reg;
1899 }
1900
1901 break;
1902 }
1903
1904 enum midgard_rt_id rt;
1905 if (var->data.location >= FRAG_RESULT_DATA0)
1906 rt = MIDGARD_COLOR_RT0 + var->data.location -
1907 FRAG_RESULT_DATA0;
1908 else if (combined)
1909 rt = MIDGARD_ZS_RT;
1910 else
1911 unreachable("bad rt");
1912
1913 unsigned reg_z = ~0, reg_s = ~0;
1914 if (combined) {
1915 unsigned writeout = nir_intrinsic_component(instr);
1916 if (writeout & PAN_WRITEOUT_Z)
1917 reg_z = nir_src_index(ctx, &instr->src[2]);
1918 if (writeout & PAN_WRITEOUT_S)
1919 reg_s = nir_src_index(ctx, &instr->src[3]);
1920 }
1921
1922 emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0);
1923 } else if (ctx->stage == MESA_SHADER_VERTEX) {
1924 assert(instr->intrinsic == nir_intrinsic_store_output);
1925
1926 /* We should have been vectorized, though we don't
1927 * currently check that st_vary is emitted only once
1928 * per slot (this is relevant, since there's not a mask
1929 * parameter available on the store [set to 0 by the
1930 * blob]). We do respect the component by adjusting the
1931 * swizzle. If this is a constant source, we'll need to
1932 * emit that explicitly. */
1933
1934 emit_explicit_constant(ctx, reg, reg);
1935
1936 unsigned dst_component = nir_intrinsic_component(instr);
1937 unsigned nr_comp = nir_src_num_components(instr->src[0]);
1938
1939 midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset));
1940 st.load_store.arg_reg = REGISTER_LDST_ZERO;
1941 st.load_store.index_format = midgard_index_address_u32;
1942 st.load_store.index_reg = REGISTER_LDST_ZERO;
1943
1944 switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) {
1945 case nir_type_uint:
1946 case nir_type_bool:
1947 st.op = midgard_op_st_vary_32u;
1948 break;
1949 case nir_type_int:
1950 st.op = midgard_op_st_vary_32i;
1951 break;
1952 case nir_type_float:
1953 st.op = midgard_op_st_vary_32;
1954 break;
1955 default:
1956 unreachable("Attempted to store unknown type");
1957 break;
1958 }
1959
1960 /* nir_intrinsic_component(store_intr) encodes the
1961 * destination component start. Source component offset
1962 * adjustment is taken care of in
1963 * install_registers_instr(), when offset_swizzle() is
1964 * called.
1965 */
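/* For example, with dst_component = 1 and nr_comp = 2 the loop
 * below yields the swizzle (x, x, y, y), so source .xy lands in
 * destination .yz */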
1966 unsigned src_component = COMPONENT_X;
1967
1968 assert(nr_comp > 0);
1969 for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {
1970 st.swizzle[0][i] = src_component;
1971 if (i >= dst_component && i < dst_component + nr_comp - 1)
1972 src_component++;
1973 }
1974
1975 emit_mir_instruction(ctx, st);
1976 } else {
1977 DBG("Unknown store\n");
1978 assert(0);
1979 }
1980
1981 break;
1982
1983 /* Special case of store_output for lowered blend shaders */
1984 case nir_intrinsic_store_raw_output_pan:
1985 assert (ctx->stage == MESA_SHADER_FRAGMENT);
1986 reg = nir_src_index(ctx, &instr->src[0]);
1987 for (unsigned s = 0; s < ctx->blend_sample_iterations; s++)
1988 emit_fragment_store(ctx, reg, ~0, ~0,
1989 ctx->inputs->blend.rt + MIDGARD_COLOR_RT0,
1990 s);
1991 break;
1992
1993 case nir_intrinsic_store_global:
1994 case nir_intrinsic_store_shared:
1995 case nir_intrinsic_store_scratch:
1996 reg = nir_src_index(ctx, &instr->src[0]);
1997 emit_explicit_constant(ctx, reg, reg);
1998
1999 unsigned seg;
2000 if (instr->intrinsic == nir_intrinsic_store_global)
2001 seg = LDST_GLOBAL;
2002 else if (instr->intrinsic == nir_intrinsic_store_shared)
2003 seg = LDST_SHARED;
2004 else
2005 seg = LDST_SCRATCH;
2006
2007 emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
2008 break;
2009
2010 case nir_intrinsic_load_first_vertex:
2011 case nir_intrinsic_load_ssbo_address:
2012 case nir_intrinsic_load_work_dim:
2013 emit_sysval_read(ctx, &instr->instr, 1, 0);
2014 break;
2015
2016 case nir_intrinsic_load_base_vertex:
2017 emit_sysval_read(ctx, &instr->instr, 1, 4);
2018 break;
2019
2020 case nir_intrinsic_load_base_instance:
2021 emit_sysval_read(ctx, &instr->instr, 1, 8);
2022 break;
2023
2024 case nir_intrinsic_load_sample_positions_pan:
2025 emit_sysval_read(ctx, &instr->instr, 2, 0);
2026 break;
2027
2028 case nir_intrinsic_get_ssbo_size:
2029 emit_sysval_read(ctx, &instr->instr, 1, 8);
2030 break;
2031
2032 case nir_intrinsic_load_viewport_scale:
2033 case nir_intrinsic_load_viewport_offset:
2034 case nir_intrinsic_load_num_workgroups:
2035 case nir_intrinsic_load_sampler_lod_parameters_pan:
2036 case nir_intrinsic_load_workgroup_size:
2037 emit_sysval_read(ctx, &instr->instr, 3, 0);
2038 break;
2039
2040 case nir_intrinsic_load_blend_const_color_rgba:
2041 emit_sysval_read(ctx, &instr->instr, 4, 0);
2042 break;
2043
2044 case nir_intrinsic_load_workgroup_id:
2045 case nir_intrinsic_load_local_invocation_id:
2046 case nir_intrinsic_load_global_invocation_id:
2047 case nir_intrinsic_load_global_invocation_id_zero_base:
2048 emit_compute_builtin(ctx, instr);
2049 break;
2050
2051 case nir_intrinsic_load_vertex_id_zero_base:
2052 case nir_intrinsic_load_instance_id:
2053 emit_vertex_builtin(ctx, instr);
2054 break;
2055
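/* Magic indices passed to emit_special() below: 96 selects the
 * sample mask, 97 the sample ID */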
2056 case nir_intrinsic_load_sample_mask_in:
2057 emit_special(ctx, instr, 96);
2058 break;
2059
2060 case nir_intrinsic_load_sample_id:
2061 emit_special(ctx, instr, 97);
2062 break;
2063
2064 /* Midgard doesn't seem to want special handling */
2065 case nir_intrinsic_memory_barrier:
2066 case nir_intrinsic_memory_barrier_buffer:
2067 case nir_intrinsic_memory_barrier_image:
2068 case nir_intrinsic_memory_barrier_shared:
2069 case nir_intrinsic_group_memory_barrier:
2070 break;
2071
2072 case nir_intrinsic_control_barrier:
2073 schedule_barrier(ctx);
2074 emit_control_barrier(ctx);
2075 schedule_barrier(ctx);
2076 break;
2077
2078 ATOMIC_CASE(ctx, instr, add, add);
2079 ATOMIC_CASE(ctx, instr, and, and);
2080 ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2081 ATOMIC_CASE(ctx, instr, exchange, xchg);
2082 ATOMIC_CASE(ctx, instr, imax, imax);
2083 ATOMIC_CASE(ctx, instr, imin, imin);
2084 ATOMIC_CASE(ctx, instr, or, or);
2085 ATOMIC_CASE(ctx, instr, umax, umax);
2086 ATOMIC_CASE(ctx, instr, umin, umin);
2087 ATOMIC_CASE(ctx, instr, xor, xor);
2088
2089 IMAGE_ATOMIC_CASE(ctx, instr, add, add);
2090 IMAGE_ATOMIC_CASE(ctx, instr, and, and);
2091 IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2092 IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg);
2093 IMAGE_ATOMIC_CASE(ctx, instr, imax, imax);
2094 IMAGE_ATOMIC_CASE(ctx, instr, imin, imin);
2095 IMAGE_ATOMIC_CASE(ctx, instr, or, or);
2096 IMAGE_ATOMIC_CASE(ctx, instr, umax, umax);
2097 IMAGE_ATOMIC_CASE(ctx, instr, umin, umin);
2098 IMAGE_ATOMIC_CASE(ctx, instr, xor, xor);
2099
2100 default:
2101 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
2102 assert(0);
2103 break;
2104 }
2105 }
2106
2107 /* Returns dimension with 0 special casing cubemaps */
2108 static unsigned
2109 midgard_tex_format(enum glsl_sampler_dim dim)
2110 {
2111 switch (dim) {
2112 case GLSL_SAMPLER_DIM_1D:
2113 case GLSL_SAMPLER_DIM_BUF:
2114 return 1;
2115
2116 case GLSL_SAMPLER_DIM_2D:
2117 case GLSL_SAMPLER_DIM_MS:
2118 case GLSL_SAMPLER_DIM_EXTERNAL:
2119 case GLSL_SAMPLER_DIM_RECT:
2120 return 2;
2121
2122 case GLSL_SAMPLER_DIM_3D:
2123 return 3;
2124
2125 case GLSL_SAMPLER_DIM_CUBE:
2126 return 0;
2127
2128 default:
2129 DBG("Unknown sampler dim type\n");
2130 assert(0);
2131 return 0;
2132 }
2133 }
2134
2135 /* Tries to attach an explicit LOD or bias as a constant. Returns whether this
2136 * was successful */
2137
2138 static bool
2139 pan_attach_constant_bias(
2140 compiler_context *ctx,
2141 nir_src lod,
2142 midgard_texture_word *word)
2143 {
2144 /* To attach as constant, it has to *be* constant */
2145
2146 if (!nir_src_is_const(lod))
2147 return false;
2148
2149 float f = nir_src_as_float(lod);
2150
2151 /* Break into fixed-point */
2152 signed lod_int = f;
2153 float lod_frac = f - lod_int;
2154
2155 /* Carry over negative fractions */
2156 if (lod_frac < 0.0) {
2157 lod_int--;
2158 lod_frac += 1.0;
2159 }
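/* e.g. an LOD of -0.25 becomes lod_int = -1, lod_frac = 0.75 */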
2160
2161 /* Encode */
2162 word->bias = float_to_ubyte(lod_frac);
2163 word->bias_int = lod_int;
2164
2165 return true;
2166 }
2167
2168 static enum mali_texture_mode
2169 mdg_texture_mode(nir_tex_instr *instr)
2170 {
2171 if (instr->op == nir_texop_tg4 && instr->is_shadow)
2172 return TEXTURE_GATHER_SHADOW;
2173 else if (instr->op == nir_texop_tg4)
2174 return TEXTURE_GATHER_X + instr->component;
2175 else if (instr->is_shadow)
2176 return TEXTURE_SHADOW;
2177 else
2178 return TEXTURE_NORMAL;
2179 }
2180
2181 static void
2182 set_tex_coord(compiler_context *ctx, nir_tex_instr *instr,
2183 midgard_instruction *ins)
2184 {
2185 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
2186
2187 assert(coord_idx >= 0);
2188
2189 int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator);
2190 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
2191 assert(comparator_idx < 0 || ms_idx < 0);
2192 int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx;
2193
2194 unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src);
2195
2196 emit_explicit_constant(ctx, coords, coords);
2197
2198 ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) |
2199 nir_src_bit_size(instr->src[coord_idx].src);
2200
2201 unsigned nr_comps = instr->coord_components;
2202 unsigned written_mask = 0, write_mask = 0;
2203
2204 /* Initialize all components to coord.x which is expected to always be
2205 * present. Swizzle is updated below based on the texture dimension
2206 * and extra attributes that are packed in the coordinate argument.
2207 */
2208 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2209 ins->swizzle[1][c] = COMPONENT_X;
2210
2211 /* Shadow ref value is part of the coordinates if there's no comparator
2212 * source, in that case it's always placed in the last component.
2213 * Midgard wants the ref value in coord.z.
2214 */
2215 if (instr->is_shadow && comparator_idx < 0) {
2216 ins->swizzle[1][COMPONENT_Z] = --nr_comps;
2217 write_mask |= 1 << COMPONENT_Z;
2218 }
2219
2220 /* The array index is the last component if there's no shadow ref value
2221 * or second last if there's one. We already decremented the number of
2222 * components to account for the shadow ref value above.
2223 * Midgard wants the array index in coord.w.
2224 */
2225 if (instr->is_array) {
2226 ins->swizzle[1][COMPONENT_W] = --nr_comps;
2227 write_mask |= 1 << COMPONENT_W;
2228 }
2229
2230 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2231 /* texelFetch is undefined on samplerCube */
2232 assert(ins->op != midgard_tex_op_fetch);
2233
2234 ins->src[1] = make_compiler_temp_reg(ctx);
2235
2236 /* For cubemaps, we use a special ld/st op to select the face
2237 * and copy the xy into the texture register
2238 */
2239 midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0);
2240 ld.src[1] = coords;
2241 ld.src_types[1] = ins->src_types[1];
2242 ld.mask = 0x3; /* xy */
2243 ld.load_store.bitsize_toggle = true;
2244 ld.swizzle[1][3] = COMPONENT_X;
2245 emit_mir_instruction(ctx, ld);
2246
2247 /* We packed cube coordinates (X,Y,Z) into (X,Y), update the
2248 * written mask accordingly and decrement the number of
2249 * components
2250 */
2251 nr_comps--;
2252 written_mask |= 3;
2253 }
2254
2255 /* Now flag tex coord components that have not been written yet */
2256 write_mask |= mask_of(nr_comps) & ~written_mask;
2257 for (unsigned c = 0; c < nr_comps; c++)
2258 ins->swizzle[1][c] = c;
2259
2260 /* Sample index and shadow ref are expected in coord.z */
2261 if (ms_or_comparator_idx >= 0) {
2262 assert(!((write_mask | written_mask) & (1 << COMPONENT_Z)));
2263
2264 unsigned sample_or_ref =
2265 nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src);
2266
2267 emit_explicit_constant(ctx, sample_or_ref, sample_or_ref);
2268
2269 if (ins->src[1] == ~0)
2270 ins->src[1] = make_compiler_temp_reg(ctx);
2271
2272 midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]);
2273
2274 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2275 mov.swizzle[1][c] = COMPONENT_X;
2276
2277 mov.mask = 1 << COMPONENT_Z;
2278 written_mask |= 1 << COMPONENT_Z;
2279 ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z;
2280 emit_mir_instruction(ctx, mov);
2281 }
2282
2283 /* texelFetch coordinates use all four elements (xyz/index) regardless
2284 * of texture dimensionality, which means it's necessary to zero the
2285 * unused components to keep everything happy.
2286 */
2287 if (ins->op == midgard_tex_op_fetch &&
2288 (written_mask | write_mask) != 0xF) {
2289 if (ins->src[1] == ~0)
2290 ins->src[1] = make_compiler_temp_reg(ctx);
2291
2292 /* mov index.zw, #0, or generalized */
2293 midgard_instruction mov =
2294 v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]);
2295 mov.has_constants = true;
2296 mov.mask = (written_mask | write_mask) ^ 0xF;
2297 emit_mir_instruction(ctx, mov);
2298 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2299 if (mov.mask & (1 << c))
2300 ins->swizzle[1][c] = c;
2301 }
2302 }
2303
2304 if (ins->src[1] == ~0) {
2305 /* No temporary reg created, use the src coords directly */
2306 ins->src[1] = coords;
2307 } else if (write_mask) {
2308 /* Move the remaining coordinates to the temporary reg */
2309 midgard_instruction mov = v_mov(coords, ins->src[1]);
2310
2311 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2312 if ((1 << c) & write_mask) {
2313 mov.swizzle[1][c] = ins->swizzle[1][c];
2314 ins->swizzle[1][c] = c;
2315 } else {
2316 mov.swizzle[1][c] = COMPONENT_X;
2317 }
2318 }
2319
2320 mov.mask = write_mask;
2321 emit_mir_instruction(ctx, mov);
2322 }
2323 }
2324
2325 static void
2326 emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
2327 unsigned midgard_texop)
2328 {
2329 /* TODO */
2330 //assert (!instr->sampler);
2331
2332 nir_dest *dest = &instr->dest;
2333
2334 int texture_index = instr->texture_index;
2335 int sampler_index = instr->sampler_index;
2336
2337 nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type);
2338
2339 /* texture instructions support float outmods */
2340 unsigned outmod = midgard_outmod_none;
2341 if (dest_base == nir_type_float) {
2342 outmod = mir_determine_float_outmod(ctx, &dest, 0);
2343 }
2344
2345 midgard_instruction ins = {
2346 .type = TAG_TEXTURE_4,
2347 .mask = 0xF,
2348 .dest = nir_dest_index(dest),
2349 .src = { ~0, ~0, ~0, ~0 },
2350 .dest_type = instr->dest_type,
2351 .swizzle = SWIZZLE_IDENTITY_4,
2352 .outmod = outmod,
2353 .op = midgard_texop,
2354 .texture = {
2355 .format = midgard_tex_format(instr->sampler_dim),
2356 .texture_handle = texture_index,
2357 .sampler_handle = sampler_index,
2358 .mode = mdg_texture_mode(instr)
2359 }
2360 };
2361
2362 if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4)
2363 for (int i = 0; i < 4; ++i)
2364 ins.swizzle[0][i] = COMPONENT_X;
2365
2366 for (unsigned i = 0; i < instr->num_srcs; ++i) {
2367 int index = nir_src_index(ctx, &instr->src[i].src);
2368 unsigned sz = nir_src_bit_size(instr->src[i].src);
2369 nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz;
2370
2371 switch (instr->src[i].src_type) {
2372 case nir_tex_src_coord:
2373 set_tex_coord(ctx, instr, &ins);
2374 break;
2375
2376 case nir_tex_src_bias:
2377 case nir_tex_src_lod: {
2378 /* Try as a constant if we can */
2379
2380 bool is_txf = midgard_texop == midgard_tex_op_fetch;
2381 if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
2382 break;
2383
2384 ins.texture.lod_register = true;
2385 ins.src[2] = index;
2386 ins.src_types[2] = T;
2387
2388 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2389 ins.swizzle[2][c] = COMPONENT_X;
2390
2391 emit_explicit_constant(ctx, index, index);
2392
2393 break;
2394 };
2395
2396 case nir_tex_src_offset: {
2397 ins.texture.offset_register = true;
2398 ins.src[3] = index;
2399 ins.src_types[3] = T;
2400
2401 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2402 ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c;
2403
2404 emit_explicit_constant(ctx, index, index);
2405 break;
2406 };
2407
2408 case nir_tex_src_comparator:
2409 case nir_tex_src_ms_index:
2410 /* Nothing to do, handled in set_tex_coord() */
2411 break;
2412
2413 default: {
2414 fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type);
2415 assert(0);
2416 }
2417 }
2418 }
2419
2420 emit_mir_instruction(ctx, ins);
2421 }
2422
2423 static void
2424 emit_tex(compiler_context *ctx, nir_tex_instr *instr)
2425 {
2426 switch (instr->op) {
2427 case nir_texop_tex:
2428 case nir_texop_txb:
2429 emit_texop_native(ctx, instr, midgard_tex_op_normal);
2430 break;
2431 case nir_texop_txl:
2432 case nir_texop_tg4:
2433 emit_texop_native(ctx, instr, midgard_tex_op_gradient);
2434 break;
2435 case nir_texop_txf:
2436 case nir_texop_txf_ms:
2437 emit_texop_native(ctx, instr, midgard_tex_op_fetch);
2438 break;
2439 case nir_texop_txs:
2440 emit_sysval_read(ctx, &instr->instr, 4, 0);
2441 break;
2442 default: {
2443 fprintf(stderr, "Unhandled texture op: %d\n", instr->op);
2444 assert(0);
2445 }
2446 }
2447 }
2448
2449 static void
2450 emit_jump(compiler_context *ctx, nir_jump_instr *instr)
2451 {
2452 switch (instr->type) {
2453 case nir_jump_break: {
2454 /* Emit a branch out of the loop */
2455 struct midgard_instruction br = v_branch(false, false);
2456 br.branch.target_type = TARGET_BREAK;
2457 br.branch.target_break = ctx->current_loop_depth;
2458 emit_mir_instruction(ctx, br);
2459 break;
2460 }
2461
2462 default:
2463 DBG("Unknown jump type %d\n", instr->type);
2464 break;
2465 }
2466 }
2467
2468 static void
2469 emit_instr(compiler_context *ctx, struct nir_instr *instr)
2470 {
2471 switch (instr->type) {
2472 case nir_instr_type_load_const:
2473 emit_load_const(ctx, nir_instr_as_load_const(instr));
2474 break;
2475
2476 case nir_instr_type_intrinsic:
2477 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2478 break;
2479
2480 case nir_instr_type_alu:
2481 emit_alu(ctx, nir_instr_as_alu(instr));
2482 break;
2483
2484 case nir_instr_type_tex:
2485 emit_tex(ctx, nir_instr_as_tex(instr));
2486 break;
2487
2488 case nir_instr_type_jump:
2489 emit_jump(ctx, nir_instr_as_jump(instr));
2490 break;
2491
2492 case nir_instr_type_ssa_undef:
2493 /* Spurious */
2494 break;
2495
2496 default:
2497 DBG("Unhandled instruction type\n");
2498 break;
2499 }
2500 }
2501
2502
2503 /* ALU instructions can inline or embed constants, which decreases register
2504 * pressure and saves space. */
2505
2506 #define CONDITIONAL_ATTACH(idx) { \
2507 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \
2508 \
2509 if (entry) { \
2510 attach_constants(ctx, alu, entry, alu->src[idx] + 1); \
2511 alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
2512 } \
2513 }
2514
2515 static void
2516 inline_alu_constants(compiler_context *ctx, midgard_block *block)
2517 {
2518 mir_foreach_instr_in_block(block, alu) {
2519 /* Other instructions cannot inline constants */
2520 if (alu->type != TAG_ALU_4) continue;
2521 if (alu->compact_branch) continue;
2522
2523 /* If there is already a constant here, we can do nothing */
2524 if (alu->has_constants) continue;
2525
2526 CONDITIONAL_ATTACH(0);
2527
2528 if (!alu->has_constants) {
2529 CONDITIONAL_ATTACH(1)
2530 } else if (!alu->inline_constant) {
2531 /* Corner case: _two_ vec4 constants, for instance with a
2532 * csel. For this case, we can only use a constant
2533 * register for one, we'll have to emit a move for the
2534 * other. */
2535
2536 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1);
2537 unsigned scratch = make_compiler_temp(ctx);
2538
2539 if (entry) {
2540 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch);
2541 attach_constants(ctx, &ins, entry, alu->src[1] + 1);
2542
2543 /* Set the source */
2544 alu->src[1] = scratch;
2545
2546 /* Inject us -before- the last instruction which set r31 */
2547 mir_insert_instruction_before(ctx, mir_prev_op(alu), ins);
2548 }
2549 }
2550 }
2551 }
2552
2553 unsigned
2554 max_bitsize_for_alu(midgard_instruction *ins)
2555 {
2556 unsigned max_bitsize = 0;
2557 for (int i = 0; i < MIR_SRC_COUNT; i++) {
2558 if (ins->src[i] == ~0) continue;
2559 unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]);
2560 max_bitsize = MAX2(src_bitsize, max_bitsize);
2561 }
2562 unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type);
2563 max_bitsize = MAX2(dst_bitsize, max_bitsize);
2564
2565 /* We don't have fp16 LUTs, so we'll want to emit code like:
2566 *
2567 * vlut.fsinr hr0, hr0
2568 *
2569 * where both input and output are 16-bit but the operation is carried
2570 * out in 32-bit
2571 */
2572
2573 switch (ins->op) {
2574 case midgard_alu_op_fsqrt:
2575 case midgard_alu_op_frcp:
2576 case midgard_alu_op_frsqrt:
2577 case midgard_alu_op_fsinpi:
2578 case midgard_alu_op_fcospi:
2579 case midgard_alu_op_fexp2:
2580 case midgard_alu_op_flog2:
2581 max_bitsize = MAX2(max_bitsize, 32);
2582 break;
2583
2584 default:
2585 break;
2586 }
2587
2588 /* High implies computing at a higher bitsize, e.g. umul_high of 32-bit
2589 * requires computing at 64-bit */
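/* e.g. a 32-bit umul_high keeps only the upper 32 bits of the full
 * 64-bit product, so the multiply itself runs at 64-bit */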
2590 if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) {
2591 max_bitsize *= 2;
2592 assert(max_bitsize <= 64);
2593 }
2594
2595 return max_bitsize;
2596 }
2597
2598 midgard_reg_mode
2599 reg_mode_for_bitsize(unsigned bitsize)
2600 {
2601 switch (bitsize) {
2602 /* use the 16-bit pipe for 8-bit since we don't support vec16 yet */
2603 case 8:
2604 case 16:
2605 return midgard_reg_mode_16;
2606 case 32:
2607 return midgard_reg_mode_32;
2608 case 64:
2609 return midgard_reg_mode_64;
2610 default:
2611 unreachable("invalid bit size");
2612 }
2613 }
2614
2615 /* Midgard supports two types of constants, embedded constants (128-bit) and
2616 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
2617 * constants can be demoted to inline constants, for space savings and
2618 * sometimes a performance boost */
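/* For instance, a constant of 2.0 survives the fp16 round-trip check
 * below and can be inlined, while 0.1 is not exactly representable in
 * fp16 and must stay embedded */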
2619
2620 static void
2621 embedded_to_inline_constant(compiler_context *ctx, midgard_block *block)
2622 {
2623 mir_foreach_instr_in_block(block, ins) {
2624 if (!ins->has_constants) continue;
2625 if (ins->has_inline_constant) continue;
2626
2627 unsigned max_bitsize = max_bitsize_for_alu(ins);
2628
2629 /* We can inline 32-bit (sometimes) or 16-bit (usually) */
2630 bool is_16 = max_bitsize == 16;
2631 bool is_32 = max_bitsize == 32;
2632
2633 if (!(is_16 || is_32))
2634 continue;
2635
2636 /* src1 cannot be an inline constant due to encoding
2637 * restrictions. So, if possible we try to flip the arguments
2638 * in that case */
2639
2640 int op = ins->op;
2641
2642 if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) &&
2643 alu_opcode_props[op].props & OP_COMMUTES) {
2644 mir_flip(ins);
2645 }
2646
2647 if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2648 /* Component is from the swizzle. Take a nonzero component */
2649 assert(ins->mask);
2650 unsigned first_comp = ffs(ins->mask) - 1;
2651 unsigned component = ins->swizzle[1][first_comp];
2652
2653 /* Scale constant appropriately, if we can legally */
2654 int16_t scaled_constant = 0;
2655
2656 if (is_16) {
2657 scaled_constant = ins->constants.u16[component];
2658 } else if (midgard_is_integer_op(op)) {
2659 scaled_constant = ins->constants.u32[component];
2660
2661 /* Constant overflow after resize */
2662 if (scaled_constant != ins->constants.u32[component])
2663 continue;
2664 } else {
2665 float original = ins->constants.f32[component];
2666 scaled_constant = _mesa_float_to_half(original);
2667
2668 /* Check for loss of precision. If this is
2669 * mediump, we don't care, but for a highp
2670 * shader, we need to pay attention. NIR
2671 * doesn't yet tell us which mode we're in!
2672 * Practically this prevents most constants
2673 * from being inlined, sadly. */
2674
2675 float fp32 = _mesa_half_to_float(scaled_constant);
2676
2677 if (fp32 != original)
2678 continue;
2679 }
2680
2681 /* Should've been const folded */
2682 if (ins->src_abs[1] || ins->src_neg[1])
2683 continue;
2684
2685 /* Make sure that the constant is not itself a vector
2686 * by checking if all accessed values are the same. */
2687
2688 const midgard_constants *cons = &ins->constants;
2689 uint32_t value = is_16 ? cons->u16[component] : cons->u32[component];
2690
2691 bool is_vector = false;
2692 unsigned mask = effective_writemask(ins->op, ins->mask);
2693
2694 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) {
2695 /* We only care if this component is actually used */
2696 if (!(mask & (1 << c)))
2697 continue;
2698
2699 uint32_t test = is_16 ?
2700 cons->u16[ins->swizzle[1][c]] :
2701 cons->u32[ins->swizzle[1][c]];
2702
2703 if (test != value) {
2704 is_vector = true;
2705 break;
2706 }
2707 }
2708
2709 if (is_vector)
2710 continue;
2711
2712 /* Get rid of the embedded constant */
2713 ins->has_constants = false;
2714 ins->src[1] = ~0;
2715 ins->has_inline_constant = true;
2716 ins->inline_constant = scaled_constant;
2717 }
2718 }
2719 }
2720
2721 /* Dead code elimination for branches at the end of a block - only one branch
2722 * per block is legal semantically */
2723
2724 static void
2725 midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block)
2726 {
2727 bool branched = false;
2728
2729 mir_foreach_instr_in_block_safe(block, ins) {
2730 if (!midgard_is_branch_unit(ins->unit)) continue;
2731
2732 if (branched)
2733 mir_remove_instruction(ins);
2734
2735 branched = true;
2736 }
2737 }
2738
2739 /* We want to force the invert on AND/OR to the second slot to legalize into
2740 * iandnot/iornot. The relevant patterns are for AND (and OR respectively)
2741 *
2742 * ~a & #b = ~a & ~(#~b)
2743 * ~a & b = b & ~a
2744 */
2745
2746 static void
2747 midgard_legalize_invert(compiler_context *ctx, midgard_block *block)
2748 {
2749 mir_foreach_instr_in_block(block, ins) {
2750 if (ins->type != TAG_ALU_4) continue;
2751
2752 if (ins->op != midgard_alu_op_iand &&
2753 ins->op != midgard_alu_op_ior) continue;
2754
2755 if (ins->src_invert[1] || !ins->src_invert[0]) continue;
2756
2757 if (ins->has_inline_constant) {
2758 /* ~(#~a) = ~(~#a) = a, so valid, and forces both
2759 * inverts on */
2760 ins->inline_constant = ~ins->inline_constant;
2761 ins->src_invert[1] = true;
2762 } else {
2763 /* Flip to the right invert order. Note
2764 * has_inline_constant false by assumption on the
2765 * branch, so flipping makes sense. */
2766 mir_flip(ins);
2767 }
2768 }
2769 }
2770
2771 static unsigned
2772 emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter)
2773 {
2774 /* Loop to ourselves */
2775 midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
2776 struct midgard_instruction ins = v_branch(false, false);
2777 ins.writeout = br->writeout;
2778 ins.branch.target_block = ctx->block_count - 1;
2779 ins.constants.u32[0] = br->constants.u32[0];
2780 memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types));
2781 emit_mir_instruction(ctx, ins);
2782
2783 ctx->current_block->epilogue = true;
2784 schedule_barrier(ctx);
2785 return ins.branch.target_block;
2786 }
2787
2788 static midgard_block *
2789 emit_block_init(compiler_context *ctx)
2790 {
2791 midgard_block *this_block = ctx->after_block;
2792 ctx->after_block = NULL;
2793
2794 if (!this_block)
2795 this_block = create_empty_block(ctx);
2796
2797 list_addtail(&this_block->base.link, &ctx->blocks);
2798
2799 this_block->scheduled = false;
2800 ++ctx->block_count;
2801
2802 /* Set up current block */
2803 list_inithead(&this_block->base.instructions);
2804 ctx->current_block = this_block;
2805
2806 return this_block;
2807 }
2808
2809 static midgard_block *
2810 emit_block(compiler_context *ctx, nir_block *block)
2811 {
2812 midgard_block *this_block = emit_block_init(ctx);
2813
2814 nir_foreach_instr(instr, block) {
2815 emit_instr(ctx, instr);
2816 ++ctx->instruction_count;
2817 }
2818
2819 return this_block;
2820 }
2821
2822 static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
2823
2824 static void
2825 emit_if(struct compiler_context *ctx, nir_if *nif)
2826 {
2827 midgard_block *before_block = ctx->current_block;
2828
2829 /* Speculatively emit the branch, but we can't fill it in until later */
2830 bool inv = false;
2831 EMIT(branch, true, true);
2832 midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
2833 then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv);
2834 then_branch->src_types[0] = nir_type_uint32;
2835 then_branch->branch.invert_conditional = !inv;
2836
2837 /* Emit the two subblocks. */
2838 midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
2839 midgard_block *end_then_block = ctx->current_block;
2840
2841 /* Emit a jump from the end of the then block to the end of the else */
2842 EMIT(branch, false, false);
2843 midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
2844
2845 /* Emit second block, and check if it's empty */
2846
2847 int else_idx = ctx->block_count;
2848 int count_in = ctx->instruction_count;
2849 midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
2850 midgard_block *end_else_block = ctx->current_block;
2851 int after_else_idx = ctx->block_count;
2852
2853 /* Now that we have the subblocks emitted, fix up the branches */
2854
2855 assert(then_block);
2856 assert(else_block);
2857
2858 if (ctx->instruction_count == count_in) {
2859 /* The else block is empty, so don't emit an exit jump */
2860 mir_remove_instruction(then_exit);
2861 then_branch->branch.target_block = after_else_idx;
2862 } else {
2863 then_branch->branch.target_block = else_idx;
2864 then_exit->branch.target_block = after_else_idx;
2865 }
2866
2867 /* Wire up the successors */
2868
2869 ctx->after_block = create_empty_block(ctx);
2870
2871 pan_block_add_successor(&before_block->base, &then_block->base);
2872 pan_block_add_successor(&before_block->base, &else_block->base);
2873
2874 pan_block_add_successor(&end_then_block->base, &ctx->after_block->base);
2875 pan_block_add_successor(&end_else_block->base, &ctx->after_block->base);
2876 }
2877
2878 static void
2879 emit_loop(struct compiler_context *ctx, nir_loop *nloop)
2880 {
2881 /* Remember where we are */
2882 midgard_block *start_block = ctx->current_block;
2883
2884 /* Allocate a loop number, growing the current inner loop depth */
2885 int loop_idx = ++ctx->current_loop_depth;
2886
2887 /* Get index from before the body so we can loop back later */
2888 int start_idx = ctx->block_count;
2889
2890 /* Emit the body itself */
2891 midgard_block *loop_block = emit_cf_list(ctx, &nloop->body);
2892
2893 /* Branch back to the top of the loop */
2894 struct midgard_instruction br_back = v_branch(false, false);
2895 br_back.branch.target_block = start_idx;
2896 emit_mir_instruction(ctx, br_back);
2897
2898 /* Mark down that branch in the graph. */
2899 pan_block_add_successor(&start_block->base, &loop_block->base);
2900 pan_block_add_successor(&ctx->current_block->base, &loop_block->base);
2901
2902 /* Find the index of the block about to follow us (note: we don't add
2903 * one; blocks are 0-indexed so we get a fencepost problem) */
2904 int break_block_idx = ctx->block_count;
2905
2906 /* Fix up the break statements we emitted to point to the right place,
2907 * now that we can allocate a block number for them */
2908 ctx->after_block = create_empty_block(ctx);
2909
2910 mir_foreach_block_from(ctx, start_block, _block) {
2911 mir_foreach_instr_in_block(((midgard_block *) _block), ins) {
2912 if (ins->type != TAG_ALU_4) continue;
2913 if (!ins->compact_branch) continue;
2914
2915 /* We found a branch -- check the type to see if we need to do anything */
2916 if (ins->branch.target_type != TARGET_BREAK) continue;
2917
2918 /* It's a break! Check if it's our break */
2919 if (ins->branch.target_break != loop_idx) continue;
2920
2921 /* Okay, cool, we're breaking out of this loop.
2922 * Rewrite from a break to a goto */
2923
2924 ins->branch.target_type = TARGET_GOTO;
2925 ins->branch.target_block = break_block_idx;
2926
2927 pan_block_add_successor(_block, &ctx->after_block->base);
2928 }
2929 }
2930
2931 /* Now that we've finished emitting the loop, free up the depth again
2932 * so we play nice with recursion amid nested loops */
2933 --ctx->current_loop_depth;
2934
2935 /* Dump loop stats */
2936 ++ctx->loop_count;
2937 }
2938
2939 static midgard_block *
2940 emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
2941 {
2942 midgard_block *start_block = NULL;
2943
2944 foreach_list_typed(nir_cf_node, node, node, list) {
2945 switch (node->type) {
2946 case nir_cf_node_block: {
2947 midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2948
2949 if (!start_block)
2950 start_block = block;
2951
2952 break;
2953 }
2954
2955 case nir_cf_node_if:
2956 emit_if(ctx, nir_cf_node_as_if(node));
2957 break;
2958
2959 case nir_cf_node_loop:
2960 emit_loop(ctx, nir_cf_node_as_loop(node));
2961 break;
2962
2963 case nir_cf_node_function:
2964 assert(0);
2965 break;
2966 }
2967 }
2968
2969 return start_block;
2970 }
2971
2972 /* Due to lookahead, we need to report the first tag executed in the command
2973 * stream and in branch targets. An initial block might be empty, so iterate
2974 * until we find one that 'works' */
2975
2976 unsigned
2977 midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
2978 {
2979 midgard_block *initial_block = mir_get_block(ctx, block_idx);
2980
2981 mir_foreach_block_from(ctx, initial_block, _v) {
2982 midgard_block *v = (midgard_block *) _v;
2983 if (v->quadword_count) {
2984 midgard_bundle *initial_bundle =
2985 util_dynarray_element(&v->bundles, midgard_bundle, 0);
2986
2987 return initial_bundle->tag;
2988 }
2989 }
2990
2991 /* Default to a tag 1 which will break from the shader, in case we jump
2992 * to the exit block (i.e. `return` in a compute shader) */
2993
2994 return 1;
2995 }
2996
2997 /* For each fragment writeout instruction, generate a writeout loop to
2998 * associate with it */
2999
3000 static void
3001 mir_add_writeout_loops(compiler_context *ctx)
3002 {
3003 for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
3004 for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) {
3005 midgard_instruction *br = ctx->writeout_branch[rt][s];
3006 if (!br) continue;
3007
3008 unsigned popped = br->branch.target_block;
3009 pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base),
3010 &ctx->current_block->base);
3011 br->branch.target_block = emit_fragment_epilogue(ctx, rt, s);
3012 br->branch.target_type = TARGET_GOTO;
3013
3014 /* If we have more RTs, we'll need to restore back after our
3015 * loop terminates */
3016 midgard_instruction *next_br = NULL;
3017
3018 if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER)
3019 next_br = ctx->writeout_branch[rt][s + 1];
3020
3021 if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch))
3022 next_br = ctx->writeout_branch[rt + 1][0];
3023
3024 if (next_br) {
3025 midgard_instruction uncond = v_branch(false, false);
3026 uncond.branch.target_block = popped;
3027 uncond.branch.target_type = TARGET_GOTO;
3028 emit_mir_instruction(ctx, uncond);
3029 pan_block_add_successor(&ctx->current_block->base,
3030 &(mir_get_block(ctx, popped)->base));
3031 schedule_barrier(ctx);
3032 } else {
3033 /* We're last, so we can terminate here */
3034 br->last_writeout = true;
3035 }
3036 }
3037 }
3038 }
3039
3040 void
3041 midgard_compile_shader_nir(nir_shader *nir,
3042 const struct panfrost_compile_inputs *inputs,
3043 struct util_dynarray *binary,
3044 struct pan_shader_info *info)
3045 {
3046 midgard_debug = debug_get_option_midgard_debug();
3047
3048 /* TODO: Bound against what? */
3049 compiler_context *ctx = rzalloc(NULL, compiler_context);
3050 ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
3051
3052 ctx->inputs = inputs;
3053 ctx->nir = nir;
3054 ctx->info = info;
3055 ctx->stage = nir->info.stage;
3056
3057 if (inputs->is_blend) {
3058 unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
3059 const struct util_format_description *desc =
3060 util_format_description(inputs->rt_formats[inputs->blend.rt]);
3061
3062 /* We have to split writeout in 128 bit chunks */
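/* e.g. a 64-bit-per-sample RGBA16F target at 4x MSAA needs
 * DIV_ROUND_UP(64 * 4, 128) = 2 iterations */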
3063 ctx->blend_sample_iterations =
3064 DIV_ROUND_UP(desc->block.bits * nr_samples, 128);
3065 }
3066 ctx->blend_input = ~0;
3067 ctx->blend_src1 = ~0;
3068 ctx->quirks = midgard_get_quirks(inputs->gpu_id);
3069
3070 /* Initialize global-level (not per-block) hash tables */
3071
3072 ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
3073
3074 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
3075 * (so we don't accidentally duplicate the epilogue since mesa/st has
3076 * messed with our I/O quite a bit already) */
3077
3078 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3079
3080 if (ctx->stage == MESA_SHADER_VERTEX) {
3081 NIR_PASS_V(nir, nir_lower_viewport_transform);
3082 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
3083 }
3084
3085 NIR_PASS_V(nir, nir_lower_var_copies);
3086 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3087 NIR_PASS_V(nir, nir_split_var_copies);
3088 NIR_PASS_V(nir, nir_lower_var_copies);
3089 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3090 NIR_PASS_V(nir, nir_lower_var_copies);
3091 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3092
3093 unsigned pan_quirks = panfrost_get_quirks(inputs->gpu_id, 0);
3094 NIR_PASS_V(nir, pan_lower_framebuffer,
3095 inputs->rt_formats, inputs->raw_fmt_mask,
3096 inputs->is_blend, pan_quirks);
3097
3098 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3099 glsl_type_size, 0);
3100 NIR_PASS_V(nir, nir_lower_ssbo);
3101 NIR_PASS_V(nir, pan_nir_lower_zs_store);
3102
3103 NIR_PASS_V(nir, pan_nir_lower_64bit_intrin);
3104
3105 /* Optimisation passes */
3106
3107 optimise_nir(nir, ctx->quirks, inputs->is_blend);
3108
3109 NIR_PASS_V(nir, pan_nir_reorder_writeout);
3110
3111 if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
3112 ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
3113 nir_print_shader(nir, stdout);
3114 }
3115
3116 info->tls_size = nir->scratch_size;
3117
3118 nir_foreach_function(func, nir) {
3119 if (!func->impl)
3120 continue;
3121
3122 list_inithead(&ctx->blocks);
3123 ctx->block_count = 0;
3124 ctx->func = func;
3125 ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD));
3126
3127 if (nir->info.outputs_read && !inputs->is_blend) {
3128 emit_block_init(ctx);
3129
3130 struct midgard_instruction wait = v_branch(false, false);
3131 wait.branch.target_type = TARGET_TILEBUF_WAIT;
3132
3133 emit_mir_instruction(ctx, wait);
3134
3135 ++ctx->instruction_count;
3136 }
3137
3138 emit_cf_list(ctx, &func->impl->body);
3139 free(ctx->already_emitted);
3140 break; /* TODO: Multi-function shaders */
3141 }
3142
3143 /* Per-block lowering before opts */
3144
3145 mir_foreach_block(ctx, _block) {
3146 midgard_block *block = (midgard_block *) _block;
3147 inline_alu_constants(ctx, block);
3148 embedded_to_inline_constant(ctx, block);
3149 }
3150 /* MIR-level optimizations */
3151
3152 bool progress = false;
3153
3154 do {
3155 progress = false;
3156 progress |= midgard_opt_dead_code_eliminate(ctx);
3157
3158 mir_foreach_block(ctx, _block) {
3159 midgard_block *block = (midgard_block *) _block;
3160 progress |= midgard_opt_copy_prop(ctx, block);
3161 progress |= midgard_opt_combine_projection(ctx, block);
3162 progress |= midgard_opt_varying_projection(ctx, block);
3163 }
3164 } while (progress);
3165
3166 mir_foreach_block(ctx, _block) {
3167 midgard_block *block = (midgard_block *) _block;
3168 midgard_lower_derivatives(ctx, block);
3169 midgard_legalize_invert(ctx, block);
3170 midgard_cull_dead_branch(ctx, block);
3171 }
3172
3173 if (ctx->stage == MESA_SHADER_FRAGMENT)
3174 mir_add_writeout_loops(ctx);
3175
3176 /* Analyze now that the code is known but before scheduling creates
3177 * pipeline registers which are harder to track */
3178 mir_analyze_helper_requirements(ctx);
3179
3180 /* Schedule! */
3181 midgard_schedule_program(ctx);
3182 mir_ra(ctx);
3183
3184 /* Analyze after scheduling since this is order-dependent */
3185 mir_analyze_helper_terminate(ctx);
3186
3187 /* Emit flat binary from the instruction arrays. Iterate each block in
3188 * sequence. Save instruction boundaries such that lookahead tags can
3189 * be assigned easily */
3190
3191 /* Cache _all_ bundles in source order for lookahead across failed branches */
3192
3193 int bundle_count = 0;
3194 mir_foreach_block(ctx, _block) {
3195 midgard_block *block = (midgard_block *) _block;
3196 bundle_count += block->bundles.size / sizeof(midgard_bundle);
3197 }
3198 midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
3199 int bundle_idx = 0;
3200 mir_foreach_block(ctx, _block) {
3201 midgard_block *block = (midgard_block *) _block;
3202 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3203 source_order_bundles[bundle_idx++] = bundle;
3204 }
3205 }
3206
3207 int current_bundle = 0;
3208
3209 /* Midgard prefetches instruction types, so during emission we
3210 * need to look ahead at the tag of the next bundle. For the
3211 * last bundle (or a final writeout) we fall back to tag 1. */
3212
3213 mir_foreach_block(ctx, _block) {
3214 midgard_block *block = (midgard_block *) _block;
3215 mir_foreach_bundle_in_block(block, bundle) {
3216 int lookahead = 1;
3217
3218 if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
3219 lookahead = source_order_bundles[current_bundle + 1]->tag;
3220
3221 emit_binary_bundle(ctx, block, bundle, binary, lookahead);
3222 ++current_bundle;
3223 }
3224
3225 /* TODO: Free deeper */
3226 //util_dynarray_fini(&block->instructions);
3227 }
3228
3229 free(source_order_bundles);
3230
3231 /* Report the very first tag executed */
3232 info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0);
3233
3234 info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
3235
3236 if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
3237 ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
3238 disassemble_midgard(stdout, binary->data,
3239 binary->size, inputs->gpu_id,
3240 midgard_debug & MIDGARD_DBG_VERBOSE);
3241 fflush(stdout);
3242 }
3243
3244 /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults,
3245 * workaround by adding some padding to the end of the shader. (The
3246 * kernel makes sure shader BOs can't cross 16MB boundaries.) */
3247 if (binary->size)
3248 memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);
3249
3250 if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) &&
3251 !nir->info.internal) {
3252 unsigned nr_bundles = 0, nr_ins = 0;
3253
3254 /* Count instructions and bundles */
3255
3256 mir_foreach_block(ctx, _block) {
3257 midgard_block *block = (midgard_block *) _block;
3258 nr_bundles += util_dynarray_num_elements(
3259 &block->bundles, midgard_bundle);
3260
3261 mir_foreach_bundle_in_block(block, bun)
3262 nr_ins += bun->instruction_count;
3263 }
3264
3265 /* Calculate thread count. There are certain cutoffs by
3266 * register count for thread count */
3267
3268 unsigned nr_registers = info->work_reg_count;
3269
3270 unsigned nr_threads =
3271 (nr_registers <= 4) ? 4 :
3272 (nr_registers <= 8) ? 2 :
3273 1;
3274
3275 /* Dump stats */
3276
3277 fprintf(stderr, "%s - %s shader: "
3278 "%u inst, %u bundles, %u quadwords, "
3279 "%u registers, %u threads, %u loops, "
3280 "%u:%u spills:fills\n",
3281 ctx->nir->info.label ?: "",
3282 ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
3283 gl_shader_stage_name(ctx->stage),
3284 nr_ins, nr_bundles, ctx->quadword_count,
3285 nr_registers, nr_threads,
3286 ctx->loop_count,
3287 ctx->spills, ctx->fills);
3288 }
3289
3290 _mesa_hash_table_u64_destroy(ctx->ssa_constants);
3291 _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
3292
3293 ralloc_free(ctx);
3294 }
3295