1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 #include "main/mtypes.h"
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/nir_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32
33 #include "disassemble.h"
34 #include "bifrost_compile.h"
35 #include "bifrost_nir.h"
36 #include "compiler.h"
37 #include "bi_quirks.h"
38 #include "bi_print.h"
39
40 static const struct debug_named_value debug_options[] = {
41 {"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
42 {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
43 DEBUG_NAMED_VALUE_END
44 };
45
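/* Debug flags are read once from the BIFROST_MESA_DEBUG environment
 * variable, e.g. BIFROST_MESA_DEBUG=msgs,shaders (a comma-separated list of
 * the option names above). */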
46 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", debug_options, 0)
47
48 int bifrost_debug = 0;
49
50 #define DBG(fmt, ...) \
51 do { if (bifrost_debug & BIFROST_DBG_MSGS) \
52 fprintf(stderr, "%s:%d: "fmt, \
53 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
54
55 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
56 static bi_instruction *bi_emit_branch(bi_context *ctx);
57
58 static void
emit_jump(bi_context *ctx, nir_jump_instr *instr)
60 {
61 bi_instruction *branch = bi_emit_branch(ctx);
62
63 switch (instr->type) {
64 case nir_jump_break:
65 branch->branch_target = ctx->break_block;
66 break;
67 case nir_jump_continue:
68 branch->branch_target = ctx->continue_block;
69 break;
70 default:
71 unreachable("Unhandled jump type");
72 }
73
74 pan_block_add_successor(&ctx->current_block->base, &branch->branch_target->base);
75 ctx->current_block->base.unconditional_jumps = true;
76 }
77
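/* Builds a load of class T for an I/O intrinsic: the intrinsic base is
 * stashed in the 64-bit inline constant, and the offset is either folded
 * into that constant (direct) or passed in src[0] (indirect). */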
78 static bi_instruction
bi_load(enum bi_class T, nir_intrinsic_instr *instr)
80 {
81 bi_instruction load = {
82 .type = T,
83 .vector_channels = instr->num_components,
84 .src = { BIR_INDEX_CONSTANT },
85 .src_types = { nir_type_uint32 },
86 .constant = { .u64 = nir_intrinsic_base(instr) },
87 };
88
89 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
90
91 if (info->has_dest)
92 load.dest = pan_dest_index(&instr->dest);
93
94 if (info->has_dest && nir_intrinsic_has_dest_type(instr))
95 load.dest_type = nir_intrinsic_dest_type(instr);
96
97 nir_src *offset = nir_get_io_offset_src(instr);
98
99 if (nir_src_is_const(*offset))
100 load.constant.u64 += nir_src_as_uint(*offset);
101 else
102 load.src[0] = pan_src_index(offset);
103
104 return load;
105 }
106
107 static void
bi_emit_ld_output(bi_context *ctx, nir_intrinsic_instr *instr)
109 {
110 assert(ctx->is_blend);
111
112 bi_instruction ins = {
113 .type = BI_LOAD_TILE,
114 .vector_channels = instr->num_components,
115 .dest = pan_dest_index(&instr->dest),
116 .dest_type = nir_type_float16,
117 .src = {
118 /* PixelIndices */
119 BIR_INDEX_CONSTANT,
120 /* PixelCoverage: we simply pass r60 which contains the cumulative
121 * coverage bitmap
122 */
123 BIR_INDEX_REGISTER | 60,
124 /* InternalConversionDescriptor (see src/panfrost/lib/midgard.xml for more
125 * details)
126 */
127 BIR_INDEX_CONSTANT | 32
128 },
129 .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint32 },
130 };
131
132 /* We want to load the current pixel.
133 * FIXME: The sample to load is currently hardcoded to 0. This should
134 * be addressed for multi-sample FBs.
135 */
136 struct bifrost_pixel_indices pix = {
137 .y = BIFROST_CURRENT_PIXEL,
138 };
139 memcpy(&ins.constant.u64, &pix, sizeof(pix));
140
141 /* Only keep the conversion part of the blend descriptor. */
142 ins.constant.u64 |= ctx->blend_desc & 0xffffffff00000000ULL;
143
144 bi_emit(ctx, ins);
145 }
146
147 static enum bifrost_interp_mode
bi_interp_for_intrinsic(nir_intrinsic_op op)
149 {
150 switch (op) {
151 case nir_intrinsic_load_barycentric_centroid:
152 return BIFROST_INTERP_CENTROID;
153 case nir_intrinsic_load_barycentric_sample:
154 return BIFROST_INTERP_SAMPLE;
155 case nir_intrinsic_load_barycentric_pixel:
156 default:
157 return BIFROST_INTERP_CENTER;
158 }
159 }
160
161 static void
bi_emit_ld_vary(bi_context *ctx, nir_intrinsic_instr *instr)
163 {
164 bi_instruction ins = bi_load(BI_LOAD_VAR, instr);
165 ins.load_vary.interp_mode = BIFROST_INTERP_CENTER; /* TODO */
166 ins.load_vary.reuse = false; /* TODO */
167 ins.load_vary.flat = instr->intrinsic != nir_intrinsic_load_interpolated_input;
168 ins.dest_type = nir_type_float | nir_dest_bit_size(instr->dest);
169 ins.format = ins.dest_type;
170
171 if (instr->intrinsic == nir_intrinsic_load_interpolated_input) {
172 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
173 if (parent) {
174 ins.load_vary.interp_mode =
175 bi_interp_for_intrinsic(parent->intrinsic);
176 }
177 }
178
179 if (nir_src_is_const(*nir_get_io_offset_src(instr))) {
180 /* Zero it out for direct */
181 ins.src[1] = BIR_INDEX_ZERO;
182 } else {
183 /* R61 contains sample mask stuff, TODO RA XXX */
184 ins.src[1] = BIR_INDEX_REGISTER | 61;
185 }
186
187 bi_emit(ctx, ins);
188 }
189
190 static void
bi_emit_ld_blend_input(bi_context *ctx, nir_intrinsic_instr *instr)
192 {
193 ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
194
195 /* We don't support dual-source blending yet. */
196 assert(sem.location == VARYING_SLOT_COL0);
197
198 bi_instruction ins = {
199 .type = BI_COMBINE,
200 .dest_type = nir_type_uint32,
201 .dest = pan_dest_index(&instr->dest),
202 .src_types = {
203 nir_type_uint32, nir_type_uint32,
204 nir_type_uint32, nir_type_uint32,
205 },
206
207 /* Source color is passed through r0-r3.
208 * TODO: We should probably find a way to avoid this
209 * combine/mov and use r0-r3 directly.
210 */
211 .src = {
212 BIR_INDEX_REGISTER | 0,
213 BIR_INDEX_REGISTER | 1,
214 BIR_INDEX_REGISTER | 2,
215 BIR_INDEX_REGISTER | 3,
216 },
217 };
218
219 bi_emit(ctx, ins);
220 }
221
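/* ATEST (alpha test) reads the running coverage mask from r60 and the alpha
 * channel of the color value (hence the { 3, 0 } swizzle on the second
 * source), writing the updated coverage mask back to r60. */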
222 static void
bi_emit_atest(bi_context *ctx, unsigned rgba, nir_alu_type T)
224 {
225 bi_instruction ins = {
226 .type = BI_ATEST,
227 .src = {
228 BIR_INDEX_REGISTER | 60 /* TODO: RA */,
229 rgba,
230 },
231 .src_types = { nir_type_uint32, T },
232 .swizzle = {
233 { 0 },
234 { 3, 0 } /* swizzle out the alpha */
235 },
236 .dest = BIR_INDEX_REGISTER | 60 /* TODO: RA */,
237 .dest_type = nir_type_uint32,
238 };
239
240 bi_emit(ctx, ins);
241 }
242
243 static void
bi_emit_blend(bi_context *ctx, unsigned rgba, nir_alu_type T, unsigned rt)
245 {
246 bi_instruction blend = {
247 .type = BI_BLEND,
248 .blend_location = rt,
249 .src = {
250 rgba,
251 BIR_INDEX_REGISTER | 60 /* TODO: RA */
252 },
253 .src_types = {
254 T,
255 nir_type_uint32,
256 nir_type_uint32,
257 nir_type_uint32,
258 },
259 .swizzle = {
260 { 0, 1, 2, 3 },
261 { 0 }
262 },
263 .dest_type = nir_type_uint32,
264 .vector_channels = 4
265 };
266
267 if (ctx->is_blend) {
268 /* Blend descriptor comes from the compile inputs */
269 blend.src[2] = BIR_INDEX_CONSTANT | 0;
270 blend.src[3] = BIR_INDEX_CONSTANT | 32;
271 blend.constant.u64 = ctx->blend_desc;
272
273 /* Put the result in r0 */
274 blend.dest = BIR_INDEX_REGISTER | 0;
275 } else {
276 /* Blend descriptor comes from the FAU RAM */
277 blend.src[2] = BIR_INDEX_BLEND | BIFROST_SRC_FAU_LO;
278 blend.src[3] = BIR_INDEX_BLEND | BIFROST_SRC_FAU_HI;
279
280 /* By convention, the return address is stored in r48 and will
281 * be used by the blend shader to jump back to the fragment
282 * shader when it's done.
283 */
284 blend.dest = BIR_INDEX_REGISTER | 48;
285 }
286
287 assert(blend.blend_location < 8);
288 assert(ctx->blend_types);
289 assert(blend.src_types[0]);
290 ctx->blend_types[blend.blend_location] = blend.src_types[0];
291
292 bi_emit(ctx, blend);
293 }
294
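/* ZS_EMIT writes depth (f32) and stencil (u8) values, taking the coverage
 * mask in r60 and producing the updated mask, also in r60. */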
295 static void
bi_emit_zs_emit(bi_context *ctx, unsigned z, unsigned stencil)
297 {
298 bi_instruction ins = {
299 .type = BI_ZS_EMIT,
300 .src = {
301 z,
302 stencil,
303 BIR_INDEX_REGISTER | 60 /* TODO: RA */,
304 },
305 .src_types = {
306 nir_type_float32,
307 nir_type_uint8,
308 nir_type_uint32,
309 },
310 .swizzle = { { 0 }, { 0 }, { 0 } },
311 .dest = BIR_INDEX_REGISTER | 60 /* TODO: RA */,
312 .dest_type = nir_type_uint32,
313 };
314
315 bi_emit(ctx, ins);
316 }
317
318 static void
bi_emit_frag_out(bi_context *ctx, nir_intrinsic_instr *instr)
320 {
321 bool combined = instr->intrinsic ==
322 nir_intrinsic_store_combined_output_pan;
323
324 unsigned writeout = combined ? nir_intrinsic_component(instr) :
325 PAN_WRITEOUT_C;
326
327 bool emit_blend = writeout & (PAN_WRITEOUT_C);
328 bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
329
330 const nir_variable *var =
331 nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,
332 nir_intrinsic_base(instr));
333 assert(var);
334
335 if (!ctx->emitted_atest && !ctx->is_blend) {
336 bi_emit_atest(ctx,
337 pan_src_index(&instr->src[0]),
338 nir_intrinsic_src_type(instr));
339
340 ctx->emitted_atest = true;
341 }
342
343 if (emit_zs) {
344 unsigned z = writeout & PAN_WRITEOUT_Z ?
345 pan_src_index(&instr->src[2]) : 0;
346 unsigned s = writeout & PAN_WRITEOUT_S ?
347 pan_src_index(&instr->src[3]) : 0;
348
349 bi_emit_zs_emit(ctx, z, s);
350 }
351
352 if (emit_blend) {
353 unsigned loc = var->data.location;
354 assert(loc == FRAG_RESULT_COLOR || loc >= FRAG_RESULT_DATA0);
355
356 unsigned rt = loc == FRAG_RESULT_COLOR ? 0 :
357 (loc - FRAG_RESULT_DATA0);
358
359 bi_emit_blend(ctx,
360 pan_src_index(&instr->src[0]),
361 nir_intrinsic_src_type(instr),
362 rt);
363 }
364
365 if (ctx->is_blend) {
366 /* Jump back to the fragment shader, return address is stored
367 * in r48 (see above).
368 */
369 bi_instruction *ret = bi_emit_branch(ctx);
370 ret->src[2] = BIR_INDEX_REGISTER | 48;
371 }
372 }
373
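/* Like bi_load, but with r61/r62 passed as extra sources. In vertex shaders
 * these hold the vertex and instance IDs (see bi_emit_vertex_id and
 * bi_emit_instance_id below), which attribute and varying-address loads
 * consume. */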
374 static bi_instruction
bi_load_with_r61(enum bi_class T, nir_intrinsic_instr *instr)
376 {
377 bi_instruction ld = bi_load(T, instr);
378 ld.src[1] = BIR_INDEX_REGISTER | 61; /* TODO: RA */
379 ld.src[2] = BIR_INDEX_REGISTER | 62;
380 ld.src_types[1] = nir_type_uint32;
381 ld.src_types[2] = nir_type_uint32;
382 ld.format = instr->intrinsic == nir_intrinsic_store_output ?
383 nir_intrinsic_src_type(instr) :
384 nir_intrinsic_dest_type(instr);
385 return ld;
386 }
387
388 static void
bi_emit_st_vary(bi_context *ctx, nir_intrinsic_instr *instr)
390 {
391 bi_instruction address = bi_load_with_r61(BI_LOAD_VAR_ADDRESS, instr);
392 address.dest = bi_make_temp(ctx);
393 address.dest_type = nir_type_uint32;
394 address.vector_channels = 3;
395
396 unsigned nr = nir_intrinsic_src_components(instr, 0);
397 assert(nir_intrinsic_write_mask(instr) == ((1 << nr) - 1));
398
399 bi_instruction st = {
400 .type = BI_STORE_VAR,
401 .src = {
402 pan_src_index(&instr->src[0]),
403 address.dest, address.dest, address.dest,
404 },
405 .src_types = {
406 nir_type_uint32,
407 nir_type_uint32, nir_type_uint32, nir_type_uint32,
408 },
409 .swizzle = {
410 { 0 },
411 { 0 }, { 1 }, { 2}
412 },
413 .vector_channels = nr,
414 };
415
416 for (unsigned i = 0; i < nr; ++i)
417 st.swizzle[0][i] = i;
418
419 bi_emit(ctx, address);
420 bi_emit(ctx, st);
421 }
422
423 static void
bi_emit_ld_ubo(bi_context *ctx, nir_intrinsic_instr *instr)
425 {
426 /* nir_lower_uniforms_to_ubo() should have been called, reserving
 * UBO #0 for uniforms even if the shader doesn't have uniforms.
428 */
429 assert(ctx->nir->info.first_ubo_is_default_ubo);
430
431 bool offset_is_const = nir_src_is_const(instr->src[1]);
432 unsigned dyn_offset = pan_src_index(&instr->src[1]);
433 uint32_t const_offset = 0;
434
435 if (nir_src_is_const(instr->src[1]))
436 const_offset = nir_src_as_uint(instr->src[1]);
437
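/* Sysvals are pushed into the beginning of UBO #0 (16 bytes per sysval), so
 * offsets into the default UBO must be shifted past them. */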
438 if (nir_src_is_const(instr->src[0]) &&
439 nir_src_as_uint(instr->src[0]) == 0 &&
440 ctx->sysvals.sysval_count) {
441 if (offset_is_const) {
442 const_offset += 16 * ctx->sysvals.sysval_count;
443 } else {
444 bi_instruction add = {
445 .type = BI_IMATH,
446 .op.imath = BI_IMATH_ADD,
447 .dest = bi_make_temp(ctx),
448 .dest_type = nir_type_uint32,
449 .src = { dyn_offset, BIR_INDEX_CONSTANT | 0, BIR_INDEX_ZERO },
450 .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint32 },
451 .constant.u64 = 16 * ctx->sysvals.sysval_count,
452 };
453
454 bi_emit(ctx, add);
455 dyn_offset = add.dest;
456 }
457 }
458
459 bi_instruction ld = {
460 .type = BI_LOAD_UNIFORM,
461 .segment = BI_SEGMENT_UBO,
462 .vector_channels = instr->num_components,
463 .src_types = { nir_type_uint32, nir_type_uint32 },
464 .dest = pan_dest_index(&instr->dest),
465 .dest_type = nir_type_uint | nir_dest_bit_size(instr->dest),
466 };
467
468 if (offset_is_const) {
469 ld.src[0] = BIR_INDEX_CONSTANT | 0;
470 ld.constant.u64 |= const_offset;
471 } else {
472 ld.src[0] = dyn_offset;
473 }
474
475 if (nir_src_is_const(instr->src[0])) {
476 ld.src[1] = BIR_INDEX_CONSTANT | 32;
477 ld.constant.u64 |= nir_src_as_uint(instr->src[0]) << 32;
478 } else {
479 ld.src[1] = pan_src_index(&instr->src[0]);
480 }
481
482 bi_emit(ctx, ld);
483 }
484
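/* Loads a driver-supplied system value. Sysvals are laid out at the start of
 * UBO #0, 16 bytes each, so this is always a direct load with a constant
 * offset of (uniform index * 16) + offset. */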
485 static void
bi_emit_sysval(bi_context *ctx, nir_instr *instr,
               unsigned nr_components, unsigned offset)
488 {
489 nir_dest nir_dest;
490
491 /* Figure out which uniform this is */
492 int sysval = panfrost_sysval_for_instr(instr, &nir_dest);
493 void *val = _mesa_hash_table_u64_search(ctx->sysvals.sysval_to_id, sysval);
494
495 /* Sysvals are prefix uniforms */
496 unsigned uniform = ((uintptr_t) val) - 1;
497
498 /* Emit the read itself -- this is never indirect */
499
500 bi_instruction load = {
501 .type = BI_LOAD_UNIFORM,
502 .segment = BI_SEGMENT_UBO,
503 .vector_channels = nr_components,
504 .src = { BIR_INDEX_CONSTANT, BIR_INDEX_ZERO },
505 .src_types = { nir_type_uint32, nir_type_uint32 },
506 .constant = { (uniform * 16) + offset },
507 .dest = pan_dest_index(&nir_dest),
508 .dest_type = nir_type_uint32, /* TODO */
509 };
510
511 bi_emit(ctx, load);
512 }
513
514 /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
515 * gl_FragCoord.z = ld_vary(fragz)
516 * gl_FragCoord.w = ld_vary(fragw)
517 */
518
519 static void
bi_emit_ld_frag_coord(bi_context *ctx, nir_intrinsic_instr *instr)
521 {
/* Future-proofing for mediump fragcoord at some point... */
523 nir_alu_type T = nir_type_float32;
524
525 /* First, sketch a combine */
526 bi_instruction combine = {
527 .type = BI_COMBINE,
528 .dest_type = nir_type_uint32,
529 .dest = pan_dest_index(&instr->dest),
530 .src_types = { T, T, T, T },
531 };
532
533 /* Second, handle xy */
534 for (unsigned i = 0; i < 2; ++i) {
535 bi_instruction conv = {
536 .type = BI_CONVERT,
537 .dest_type = T,
538 .dest = bi_make_temp(ctx),
539 .src = {
540 /* TODO: RA XXX */
541 BIR_INDEX_REGISTER | 59
542 },
543 .src_types = { nir_type_uint16 },
544 .swizzle = { { i } }
545 };
546
547 bi_instruction add = {
548 .type = BI_ADD,
549 .dest_type = T,
550 .dest = bi_make_temp(ctx),
551 .src = { conv.dest, BIR_INDEX_CONSTANT },
552 .src_types = { T, T },
553 };
554
555 float half = 0.5;
556 memcpy(&add.constant.u32, &half, sizeof(float));
557
558 bi_emit(ctx, conv);
559 bi_emit(ctx, add);
560
561 combine.src[i] = add.dest;
562 }
563
564 /* Third, zw */
565 for (unsigned i = 0; i < 2; ++i) {
566 bi_instruction load = {
567 .type = BI_LOAD_VAR,
568 .load_vary = {
569 .interp_mode = BIFROST_INTERP_CENTER,
570 .reuse = false,
571 .flat = true
572 },
573 .vector_channels = 1,
574 .dest_type = nir_type_float32,
575 .format = nir_type_float32,
576 .dest = bi_make_temp(ctx),
577 .src = {
578 BIR_INDEX_CONSTANT,
579 BIR_INDEX_PASS | BIFROST_SRC_FAU_LO
580 },
581 .src_types = { nir_type_uint32, nir_type_uint32 },
582 .constant = {
583 .u32 = (i == 0) ? BIFROST_FRAGZ : BIFROST_FRAGW
584 }
585 };
586
587 bi_emit(ctx, load);
588
589 combine.src[i + 2] = load.dest;
590 }
591
592 /* Finally, emit the combine */
593 bi_emit(ctx, combine);
594 }
595
596 static void
bi_emit_discard(bi_context *ctx, nir_intrinsic_instr *instr)
598 {
599 /* Goofy lowering */
600 bi_instruction discard = {
601 .type = BI_DISCARD,
602 .cond = BI_COND_EQ,
603 .src_types = { nir_type_uint32, nir_type_uint32 },
604 .src = { BIR_INDEX_ZERO, BIR_INDEX_ZERO },
605 };
606
607 bi_emit(ctx, discard);
608 }
609
610 static void
611 bi_fuse_cond(bi_instruction *csel, nir_alu_src cond,
612 unsigned *constants_left, unsigned *constant_shift,
613 unsigned comps, bool float_only);
614
615 static void
bi_emit_discard_if(bi_context *ctx, nir_intrinsic_instr *instr)
617 {
618 nir_src cond = instr->src[0];
619 nir_alu_type T = nir_type_uint | nir_src_bit_size(cond);
620
621 bi_instruction discard = {
622 .type = BI_DISCARD,
623 .cond = BI_COND_NE,
624 .src_types = { T, T },
625 .src = {
626 pan_src_index(&cond),
627 BIR_INDEX_ZERO
628 },
629 };
630
631 /* Try to fuse in the condition */
632 unsigned constants_left = 1, constant_shift = 0;
633
634 /* Scalar so no swizzle */
635 nir_alu_src wrap = {
636 .src = instr->src[0]
637 };
638
639 /* May or may not succeed but we're optimistic */
640 bi_fuse_cond(&discard, wrap, &constants_left, &constant_shift, 1, true);
641
642 bi_emit(ctx, discard);
643 }
644
645 static void
bi_emit_blend_const(bi_context *ctx, nir_intrinsic_instr *instr)
647 {
648 assert(ctx->is_blend);
649
650 unsigned comp;
651 switch (instr->intrinsic) {
652 case nir_intrinsic_load_blend_const_color_r_float: comp = 0; break;
653 case nir_intrinsic_load_blend_const_color_g_float: comp = 1; break;
654 case nir_intrinsic_load_blend_const_color_b_float: comp = 2; break;
655 case nir_intrinsic_load_blend_const_color_a_float: comp = 3; break;
656 default: unreachable("Invalid load blend constant intrinsic");
657 }
658
659 bi_instruction move = {
660 .type = BI_MOV,
661 .dest = pan_dest_index(&instr->dest),
662 .dest_type = nir_type_uint32,
663 .src = { BIR_INDEX_CONSTANT },
664 .src_types = { nir_type_uint32 },
665 };
666
667 memcpy(&move.constant.u32, &ctx->blend_constants[comp], sizeof(float));
668
669 bi_emit(ctx, move);
670 }
671
672 static void
bi_emit_sample_id(bi_context *ctx, nir_intrinsic_instr *instr)
674 {
675 bi_instruction ins = {
676 .type = BI_BITWISE,
677 .op.bitwise = BI_BITWISE_AND,
678 .bitwise.rshift = true,
679 .dest = pan_dest_index(&instr->dest),
680 .dest_type = nir_type_uint32,
681 .src = {
682 /* r61[16:23] contains the sampleID */
683 BIR_INDEX_REGISTER | 61,
684 /* mask */
685 BIR_INDEX_CONSTANT | 0,
686 /* shift */
687 BIR_INDEX_CONSTANT | 32,
688 },
689 .src_types = {
690 nir_type_uint32,
691 nir_type_uint32,
692 nir_type_uint8,
693 },
694 .constant.u64 = 0xffull | (0x10ull << 32ull)
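/* With mask = 0xff and shift = 16, this computes (r61 >> 16) & 0xff,
 * extracting the sample ID byte. */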
695 };
696
697 bi_emit(ctx, ins);
698 }
699
700 static void
bi_emit_front_face(bi_context *ctx, nir_intrinsic_instr *instr)
702 {
703 bi_instruction ins = {
704 .type = BI_CMP,
705 .cond = BI_COND_EQ,
706 .dest = pan_dest_index(&instr->dest),
707 .dest_type = nir_type_uint32,
708 .src = {
709 /* r58 == 0 means primitive is front facing */
710 BIR_INDEX_REGISTER | 58,
711 BIR_INDEX_ZERO,
712 },
713 .src_types = {
714 nir_type_uint32,
715 nir_type_uint32,
716 },
717 };
718
719 bi_emit(ctx, ins);
720 }
721
722 static void
bi_emit_point_coord(bi_context *ctx, nir_intrinsic_instr *instr)
724 {
725 bi_instruction ins = {
726 .type = BI_LOAD_VAR,
727 .vector_channels = 2,
728 .dest = pan_dest_index(&instr->dest),
729 .dest_type = nir_type_float32,
730 .format = nir_type_float32,
731 .src = {
732 BIR_INDEX_CONSTANT,
733 BIR_INDEX_ZERO,
734 },
735 .src_types = {
736 nir_type_uint32,
737 },
738 .constant.u64 = 20,
739 };
740
741 bi_emit(ctx, ins);
742 }
743
744 static void
bi_emit_vertex_id(bi_context *ctx, nir_intrinsic_instr *instr)
746 {
747 bi_instruction mov = {
748 .type = BI_MOV,
749 .dest = pan_dest_index(&instr->dest),
750 .dest_type = nir_type_int32,
751 .src = { BIR_INDEX_REGISTER | 61 },
752 .src_types = { nir_type_int32 },
753 };
754
755 bi_emit(ctx, mov);
756 }
757
758 static void
bi_emit_instance_id(bi_context *ctx, nir_intrinsic_instr *instr)
760 {
761 bi_instruction mov = {
762 .type = BI_MOV,
763 .dest = pan_dest_index(&instr->dest),
764 .dest_type = nir_type_int32,
765 .src = { BIR_INDEX_REGISTER | 62 },
766 .src_types = { nir_type_int32 },
767 };
768
769 bi_emit(ctx, mov);
770 }
771
772 static void
emit_intrinsic(bi_context *ctx, nir_intrinsic_instr *instr)
774 {
775
776 switch (instr->intrinsic) {
777 case nir_intrinsic_load_barycentric_pixel:
778 case nir_intrinsic_load_barycentric_centroid:
779 case nir_intrinsic_load_barycentric_sample:
780 /* stub */
781 break;
782 case nir_intrinsic_load_interpolated_input:
783 case nir_intrinsic_load_input:
784 if (ctx->is_blend)
785 bi_emit_ld_blend_input(ctx, instr);
786 else if (ctx->stage == MESA_SHADER_FRAGMENT)
787 bi_emit_ld_vary(ctx, instr);
788 else if (ctx->stage == MESA_SHADER_VERTEX)
789 bi_emit(ctx, bi_load_with_r61(BI_LOAD_ATTR, instr));
790 else {
791 unreachable("Unsupported shader stage");
792 }
793 break;
794
795 case nir_intrinsic_store_output:
796 if (ctx->stage == MESA_SHADER_FRAGMENT)
797 bi_emit_frag_out(ctx, instr);
798 else if (ctx->stage == MESA_SHADER_VERTEX)
799 bi_emit_st_vary(ctx, instr);
800 else
801 unreachable("Unsupported shader stage");
802 break;
803
804 case nir_intrinsic_store_combined_output_pan:
805 assert(ctx->stage == MESA_SHADER_FRAGMENT);
806 bi_emit_frag_out(ctx, instr);
807 break;
808
809 case nir_intrinsic_load_ubo:
810 bi_emit_ld_ubo(ctx, instr);
811 break;
812
813 case nir_intrinsic_load_frag_coord:
814 bi_emit_ld_frag_coord(ctx, instr);
815 break;
816
817 case nir_intrinsic_discard:
818 bi_emit_discard(ctx, instr);
819 break;
820
821 case nir_intrinsic_discard_if:
822 bi_emit_discard_if(ctx, instr);
823 break;
824
825 case nir_intrinsic_load_ssbo_address:
826 bi_emit_sysval(ctx, &instr->instr, 1, 0);
827 break;
828
829 case nir_intrinsic_get_ssbo_size:
830 bi_emit_sysval(ctx, &instr->instr, 1, 8);
831 break;
832
833 case nir_intrinsic_load_output:
834 bi_emit_ld_output(ctx, instr);
835 break;
836
837 case nir_intrinsic_load_viewport_scale:
838 case nir_intrinsic_load_viewport_offset:
839 case nir_intrinsic_load_num_work_groups:
840 case nir_intrinsic_load_sampler_lod_parameters_pan:
841 bi_emit_sysval(ctx, &instr->instr, 3, 0);
842 break;
843
844 case nir_intrinsic_load_blend_const_color_r_float:
845 case nir_intrinsic_load_blend_const_color_g_float:
846 case nir_intrinsic_load_blend_const_color_b_float:
847 case nir_intrinsic_load_blend_const_color_a_float:
848 bi_emit_blend_const(ctx, instr);
849 break;
850
851 case nir_intrinsic_load_sample_id:
852 bi_emit_sample_id(ctx, instr);
853 break;
854
855 case nir_intrinsic_load_front_face:
856 bi_emit_front_face(ctx, instr);
857 break;
858
859 case nir_intrinsic_load_point_coord:
860 bi_emit_point_coord(ctx, instr);
861 break;
862
863 case nir_intrinsic_load_vertex_id:
864 bi_emit_vertex_id(ctx, instr);
865 break;
866
867 case nir_intrinsic_load_instance_id:
868 bi_emit_instance_id(ctx, instr);
869 break;
870
871 default:
872 unreachable("Unknown intrinsic");
873 break;
874 }
875 }
876
877 static void
emit_load_const(bi_context *ctx, nir_load_const_instr *instr)
879 {
880 /* Make sure we've been lowered */
881 assert(instr->def.num_components <= (32 / instr->def.bit_size));
882
883 /* Accumulate all the channels of the constant, as if we did an
884 * implicit SEL over them */
885 uint32_t acc = 0;
886
887 for (unsigned i = 0; i < instr->def.num_components; ++i) {
888 unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
889 acc |= (v << (i * instr->def.bit_size));
890 }
891
892 bi_instruction move = {
893 .type = BI_MOV,
894 .dest = pan_ssa_index(&instr->def),
895 .dest_type = nir_type_uint32,
896 .src = {
897 BIR_INDEX_CONSTANT
898 },
899 .src_types = {
900 nir_type_uint32,
901 },
902 .constant = {
903 .u32 = acc
904 }
905 };
906
907 bi_emit(ctx, move);
908 }
909
910 #define BI_CASE_CMP(op) \
911 case op##8: \
912 case op##16: \
913 case op##32: \
914
915 static enum bi_class
bi_class_for_nir_alu(nir_op op)
917 {
918 switch (op) {
919 case nir_op_fadd:
920 case nir_op_fsub:
921 return BI_ADD;
922
923 case nir_op_iadd:
924 case nir_op_isub:
925 return BI_IMATH;
926
927 case nir_op_imul:
928 return BI_IMUL;
929
930 case nir_op_iand:
931 case nir_op_ior:
932 case nir_op_ixor:
933 case nir_op_inot:
934 case nir_op_ishl:
935 case nir_op_ishr:
936 case nir_op_ushr:
937 return BI_BITWISE;
938
939 BI_CASE_CMP(nir_op_flt)
940 BI_CASE_CMP(nir_op_fge)
941 BI_CASE_CMP(nir_op_feq)
942 BI_CASE_CMP(nir_op_fneu)
943 BI_CASE_CMP(nir_op_ilt)
944 BI_CASE_CMP(nir_op_ige)
945 BI_CASE_CMP(nir_op_ieq)
946 BI_CASE_CMP(nir_op_ine)
947 BI_CASE_CMP(nir_op_uge)
948 BI_CASE_CMP(nir_op_ult)
949 return BI_CMP;
950
951 case nir_op_b8csel:
952 case nir_op_b16csel:
953 case nir_op_b32csel:
954 return BI_CSEL;
955
956 case nir_op_i2i8:
957 case nir_op_i2i16:
958 case nir_op_i2i32:
959 case nir_op_i2i64:
960 case nir_op_u2u8:
961 case nir_op_u2u16:
962 case nir_op_u2u32:
963 case nir_op_u2u64:
964 case nir_op_f2i16:
965 case nir_op_f2i32:
966 case nir_op_f2i64:
967 case nir_op_f2u16:
968 case nir_op_f2u32:
969 case nir_op_f2u64:
970 case nir_op_i2f16:
971 case nir_op_i2f32:
972 case nir_op_i2f64:
973 case nir_op_u2f16:
974 case nir_op_u2f32:
975 case nir_op_u2f64:
976 case nir_op_f2f16:
977 case nir_op_f2f32:
978 case nir_op_f2f64:
979 case nir_op_f2fmp:
980 return BI_CONVERT;
981
982 case nir_op_vec2:
983 case nir_op_vec3:
984 case nir_op_vec4:
985 return BI_COMBINE;
986
987 case nir_op_vec8:
988 case nir_op_vec16:
989 unreachable("should've been lowered");
990
991 case nir_op_ffma:
992 case nir_op_fmul:
993 return BI_FMA;
994
995 case nir_op_imin:
996 case nir_op_imax:
997 case nir_op_umin:
998 case nir_op_umax:
999 case nir_op_fmin:
1000 case nir_op_fmax:
1001 return BI_MINMAX;
1002
1003 case nir_op_fsat:
1004 case nir_op_fneg:
1005 case nir_op_fabs:
1006 return BI_FMOV;
1007 case nir_op_mov:
1008 return BI_MOV;
1009
1010 case nir_op_fround_even:
1011 case nir_op_fceil:
1012 case nir_op_ffloor:
1013 case nir_op_ftrunc:
1014 return BI_ROUND;
1015
1016 case nir_op_frcp:
1017 case nir_op_frsq:
1018 case nir_op_iabs:
1019 return BI_SPECIAL_ADD;
1020
1021 default:
1022 unreachable("Unknown ALU op");
1023 }
1024 }
1025
1026 /* Gets a bi_cond for a given NIR comparison opcode. In soft mode, it will
1027 * return BI_COND_ALWAYS as a sentinel if it fails to do so (when used for
1028 * optimizations). Otherwise it will bail (when used for primary code
1029 * generation). */
1030
1031 static enum bi_cond
bi_cond_for_nir(nir_op op, bool soft)
1033 {
1034 switch (op) {
1035 BI_CASE_CMP(nir_op_flt)
1036 BI_CASE_CMP(nir_op_ilt)
1037 BI_CASE_CMP(nir_op_ult)
1038 return BI_COND_LT;
1039
1040 BI_CASE_CMP(nir_op_fge)
1041 BI_CASE_CMP(nir_op_ige)
1042 BI_CASE_CMP(nir_op_uge)
1043 return BI_COND_GE;
1044
1045 BI_CASE_CMP(nir_op_feq)
1046 BI_CASE_CMP(nir_op_ieq)
1047 return BI_COND_EQ;
1048
1049 BI_CASE_CMP(nir_op_fneu)
1050 BI_CASE_CMP(nir_op_ine)
1051 return BI_COND_NE;
1052 default:
1053 if (soft)
1054 return BI_COND_ALWAYS;
1055 else
1056 unreachable("Invalid compare");
1057 }
1058 }
1059
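/* Copies NIR ALU source i into BIR source slot `to`. Small constants are
 * inlined into the instruction's 64-bit constant field when possible,
 * reusing an already-inlined value if it matches; otherwise the source
 * index is used and the swizzle copied. */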
1060 static void
bi_copy_src(bi_instruction *alu, nir_alu_instr *instr, unsigned i, unsigned to,
            unsigned *constants_left, unsigned *constant_shift)
1063 {
1064 unsigned bits = nir_src_bit_size(instr->src[i].src);
1065 unsigned dest_bits = nir_dest_bit_size(instr->dest.dest);
1066
1067 alu->src_types[to] = nir_op_infos[instr->op].input_types[i]
1068 | bits;
1069
1070 /* Try to inline a constant */
1071 if (nir_src_is_const(instr->src[i].src) && *constants_left && (dest_bits == bits)) {
1072 uint64_t mask = (1ull << dest_bits) - 1;
1073 uint64_t cons = nir_src_as_uint(instr->src[i].src);
1074
1075 /* Try to reuse a constant */
1076 for (unsigned i = 0; i < (*constant_shift); i += dest_bits) {
1077 if (((alu->constant.u64 >> i) & mask) == cons) {
1078 alu->src[to] = BIR_INDEX_CONSTANT | i;
1079 return;
1080 }
1081 }
1082
1083 alu->constant.u64 |= cons << *constant_shift;
1084 alu->src[to] = BIR_INDEX_CONSTANT | (*constant_shift);
1085 --(*constants_left);
1086 (*constant_shift) += MAX2(dest_bits, 32); /* lo/hi */
1087 return;
1088 }
1089
1090 alu->src[to] = pan_src_index(&instr->src[i].src);
1091
1092 /* Copy swizzle for all vectored components, replicating last component
1093 * to fill undersized */
1094
1095 unsigned vec = alu->type == BI_COMBINE ? 1 :
1096 MAX2(1, 32 / bits);
1097
1098 unsigned comps = nir_ssa_alu_instr_src_components(instr, i);
1099 for (unsigned j = 0; j < vec; ++j)
1100 alu->swizzle[to][j] = instr->src[i].swizzle[MIN2(j, comps - 1)];
1101 }
1102
1103 static void
bi_fuse_cond(bi_instruction *csel, nir_alu_src cond,
             unsigned *constants_left, unsigned *constant_shift,
             unsigned comps, bool float_only)
1107 {
1108 /* Bail for vector weirdness */
1109 if (cond.swizzle[0] != 0)
1110 return;
1111
1112 if (!cond.src.is_ssa)
1113 return;
1114
1115 nir_ssa_def *def = cond.src.ssa;
1116 nir_instr *parent = def->parent_instr;
1117
1118 if (parent->type != nir_instr_type_alu)
1119 return;
1120
1121 nir_alu_instr *alu = nir_instr_as_alu(parent);
1122
1123 /* Try to match a condition */
1124 enum bi_cond bcond = bi_cond_for_nir(alu->op, true);
1125
1126 if (bcond == BI_COND_ALWAYS)
1127 return;
1128
1129 /* Some instructions can't compare ints */
1130 if (float_only) {
1131 nir_alu_type T = nir_op_infos[alu->op].input_types[0];
1132 T = nir_alu_type_get_base_type(T);
1133
1134 if (T != nir_type_float)
1135 return;
1136 }
1137
1138 /* We found one, let's fuse it in */
1139 csel->cond = bcond;
1140 bi_copy_src(csel, alu, 0, 0, constants_left, constant_shift);
1141 bi_copy_src(csel, alu, 1, 1, constants_left, constant_shift);
1142 }
1143
1144 static void
emit_alu(bi_context *ctx, nir_alu_instr *instr)
1146 {
1147 /* Try some special functions */
1148 switch (instr->op) {
1149 case nir_op_fexp2:
1150 bi_emit_fexp2(ctx, instr);
1151 return;
1152 case nir_op_flog2:
1153 bi_emit_flog2(ctx, instr);
1154 return;
1155 default:
1156 break;
1157 }
1158
1159 /* Otherwise, assume it's something we can handle normally */
1160 bi_instruction alu = {
1161 .type = bi_class_for_nir_alu(instr->op),
1162 .dest = pan_dest_index(&instr->dest.dest),
1163 .dest_type = nir_op_infos[instr->op].output_type
1164 | nir_dest_bit_size(instr->dest.dest),
1165 };
1166
1167 /* TODO: Implement lowering of special functions for older Bifrost */
1168 assert(alu.type != BI_SPECIAL_ADD || !(ctx->quirks & BIFROST_NO_FAST_OP));
1169
1170 unsigned comps = nir_dest_num_components(instr->dest.dest);
1171 bool vector = comps > MAX2(1, 32 / nir_dest_bit_size(instr->dest.dest));
1172 assert(!vector || alu.type == BI_COMBINE || alu.type == BI_MOV);
1173
1174 if (!instr->dest.dest.is_ssa) {
1175 for (unsigned i = 0; i < comps; ++i)
1176 assert(instr->dest.write_mask);
1177 }
1178
1179 /* We inline constants as we go. This tracks how many constants have
1180 * been inlined, since we're limited to 64-bits of constants per
1181 * instruction */
1182
1183 unsigned dest_bits = nir_dest_bit_size(instr->dest.dest);
1184 unsigned constants_left = (64 / dest_bits);
1185 unsigned constant_shift = 0;
1186
1187 if (alu.type == BI_COMBINE)
1188 constants_left = 0;
1189
1190 /* Copy sources */
1191
1192 unsigned num_inputs = nir_op_infos[instr->op].num_inputs;
1193 assert(num_inputs <= ARRAY_SIZE(alu.src));
1194
1195 for (unsigned i = 0; i < num_inputs; ++i) {
1196 unsigned f = 0;
1197
1198 if (i && alu.type == BI_CSEL)
1199 f++;
1200
1201 bi_copy_src(&alu, instr, i, i + f, &constants_left, &constant_shift);
1202 }
1203
1204 /* Op-specific fixup */
1205 switch (instr->op) {
1206 case nir_op_fmul:
1207 alu.src[2] = BIR_INDEX_ZERO; /* FMA */
1208 alu.src_types[2] = alu.src_types[1];
1209 break;
1210 case nir_op_fsat:
1211 alu.outmod = BIFROST_SAT; /* FMOV */
1212 break;
1213 case nir_op_fneg:
1214 alu.src_neg[0] = true; /* FMOV */
1215 break;
1216 case nir_op_fabs:
1217 alu.src_abs[0] = true; /* FMOV */
1218 break;
1219 case nir_op_fsub:
1220 alu.src_neg[1] = true; /* FADD */
1221 break;
1222 case nir_op_iadd:
1223 alu.op.imath = BI_IMATH_ADD;
1224 /* Carry */
1225 alu.src[2] = BIR_INDEX_ZERO;
1226 break;
1227 case nir_op_isub:
1228 alu.op.imath = BI_IMATH_SUB;
1229 /* Borrow */
1230 alu.src[2] = BIR_INDEX_ZERO;
1231 break;
1232 case nir_op_iabs:
1233 alu.op.special = BI_SPECIAL_IABS;
1234 break;
1235 case nir_op_inot:
1236 /* no dedicated bitwise not, but we can invert sources. convert to ~(a | 0) */
1237 alu.op.bitwise = BI_BITWISE_OR;
1238 alu.bitwise.dest_invert = true;
1239 alu.src[1] = BIR_INDEX_ZERO;
1240 /* zero shift */
1241 alu.src[2] = BIR_INDEX_ZERO;
1242 alu.src_types[2] = nir_type_uint8;
1243 break;
1244 case nir_op_ushr:
1245 alu.bitwise.rshift = true;
1246 /* fallthrough */
1247 case nir_op_ishl:
1248 alu.op.bitwise = BI_BITWISE_OR;
1249 /* move src1 to src2 and replace with zero. underlying op is (src0 << src2) | src1 */
1250 alu.src[2] = alu.src[1];
1251 alu.src_types[2] = nir_type_uint8;
1252 alu.src[1] = BIR_INDEX_ZERO;
1253 break;
1254 case nir_op_ishr:
1255 alu.op.bitwise = BI_BITWISE_ARSHIFT;
1256 alu.bitwise.rshift = true;
1257 /* move src1 to src2 and replace with zero. underlying op is (src0 >> src2) */
1258 alu.src[2] = alu.src[1];
1259 alu.src_types[2] = nir_type_uint8;
1260 alu.src[1] = BIR_INDEX_ZERO;
1261 break;
1262 case nir_op_imul:
1263 alu.op.imul = BI_IMUL_IMUL;
1264 break;
1265 case nir_op_fmax:
1266 case nir_op_imax:
1267 case nir_op_umax:
1268 alu.op.minmax = BI_MINMAX_MAX; /* MINMAX */
1269 break;
1270 case nir_op_frcp:
1271 alu.op.special = BI_SPECIAL_FRCP;
1272 break;
1273 case nir_op_frsq:
1274 alu.op.special = BI_SPECIAL_FRSQ;
1275 break;
1276 BI_CASE_CMP(nir_op_flt)
1277 BI_CASE_CMP(nir_op_ilt)
1278 BI_CASE_CMP(nir_op_fge)
1279 BI_CASE_CMP(nir_op_ige)
1280 BI_CASE_CMP(nir_op_feq)
1281 BI_CASE_CMP(nir_op_ieq)
1282 BI_CASE_CMP(nir_op_fneu)
1283 BI_CASE_CMP(nir_op_ine)
1284 BI_CASE_CMP(nir_op_uge)
1285 BI_CASE_CMP(nir_op_ult)
1286 alu.cond = bi_cond_for_nir(instr->op, false);
1287 break;
1288 case nir_op_fround_even:
1289 alu.roundmode = BIFROST_RTE;
1290 break;
1291 case nir_op_fceil:
1292 alu.roundmode = BIFROST_RTP;
1293 break;
1294 case nir_op_ffloor:
1295 alu.roundmode = BIFROST_RTN;
1296 break;
1297 case nir_op_ftrunc:
1298 alu.roundmode = BIFROST_RTZ;
1299 break;
1300 case nir_op_iand:
1301 alu.op.bitwise = BI_BITWISE_AND;
1302 /* zero shift */
1303 alu.src[2] = BIR_INDEX_ZERO;
1304 alu.src_types[2] = nir_type_uint8;
1305 break;
1306 case nir_op_ior:
1307 alu.op.bitwise = BI_BITWISE_OR;
1308 /* zero shift */
1309 alu.src[2] = BIR_INDEX_ZERO;
1310 alu.src_types[2] = nir_type_uint8;
1311 break;
1312 case nir_op_ixor:
1313 alu.op.bitwise = BI_BITWISE_XOR;
1314 /* zero shift */
1315 alu.src[2] = BIR_INDEX_ZERO;
1316 alu.src_types[2] = nir_type_uint8;
1317 break;
1318 case nir_op_f2i32:
1319 alu.roundmode = BIFROST_RTZ;
1320 break;
1321
1322 case nir_op_f2f16:
1323 case nir_op_i2i16:
1324 case nir_op_u2u16: {
1325 if (nir_src_bit_size(instr->src[0].src) != 32)
1326 break;
1327
1328 /* Should have been const folded */
1329 assert(!nir_src_is_const(instr->src[0].src));
1330
1331 alu.src_types[1] = alu.src_types[0];
1332 alu.src[1] = alu.src[0];
1333
1334 unsigned last = nir_dest_num_components(instr->dest.dest) - 1;
1335 assert(last <= 1);
1336
1337 alu.swizzle[1][0] = instr->src[0].swizzle[last];
1338 break;
1339 }
1340
1341 default:
1342 break;
1343 }
1344
1345 if (alu.type == BI_MOV && vector) {
1346 alu.type = BI_COMBINE;
1347
1348 for (unsigned i = 0; i < comps; ++i) {
1349 alu.src[i] = alu.src[0];
1350 alu.swizzle[i][0] = instr->src[0].swizzle[i];
1351 }
1352 }
1353
1354 if (alu.type == BI_CSEL) {
1355 /* Default to csel3 */
1356 alu.cond = BI_COND_NE;
1357 alu.src[1] = BIR_INDEX_ZERO;
1358 alu.src_types[1] = alu.src_types[0];
1359
1360 /* TODO: Reenable cond fusing when we can split up registers
1361 * when scheduling */
1362 #if 0
1363 bi_fuse_cond(&alu, instr->src[0],
1364 &constants_left, &constant_shift, comps, false);
1365 #endif
1366 }
1367
1368 bi_emit(ctx, alu);
1369 }
1370
1371 /* TEXS instructions assume normal 2D f32 operation but are more
 * space-efficient and with simpler RA/scheduling requirements. */
1373
1374 static void
emit_texs(bi_context *ctx, nir_tex_instr *instr)
1376 {
1377 bi_instruction tex = {
1378 .type = BI_TEXS,
1379 .texture = {
1380 .texture_index = instr->texture_index,
1381 .sampler_index = instr->sampler_index,
1382 .compute_lod = instr->op == nir_texop_tex,
1383 },
1384 .dest = pan_dest_index(&instr->dest),
1385 .dest_type = instr->dest_type,
1386 .src_types = { nir_type_float32, nir_type_float32 },
1387 .vector_channels = 4
1388 };
1389
1390 for (unsigned i = 0; i < instr->num_srcs; ++i) {
1391 int index = pan_src_index(&instr->src[i].src);
1392
/* This was checked ahead-of-time */
1394 if (instr->src[i].src_type == nir_tex_src_lod)
1395 continue;
1396
1397 assert (instr->src[i].src_type == nir_tex_src_coord);
1398
1399 tex.src[0] = index;
1400 tex.src[1] = index;
1401 tex.swizzle[0][0] = 0;
1402 tex.swizzle[1][0] = 1;
1403 }
1404
1405 bi_emit(ctx, tex);
1406 }
1407
1408 /* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */
1409 static unsigned
bifrost_tex_format(enum glsl_sampler_dim dim)
1411 {
1412 switch (dim) {
1413 case GLSL_SAMPLER_DIM_1D:
1414 case GLSL_SAMPLER_DIM_BUF:
1415 return 1;
1416
1417 case GLSL_SAMPLER_DIM_2D:
1418 case GLSL_SAMPLER_DIM_MS:
1419 case GLSL_SAMPLER_DIM_EXTERNAL:
1420 case GLSL_SAMPLER_DIM_RECT:
1421 return 2;
1422
1423 case GLSL_SAMPLER_DIM_3D:
1424 return 3;
1425
1426 case GLSL_SAMPLER_DIM_CUBE:
1427 return 0;
1428
1429 default:
1430 DBG("Unknown sampler dim type\n");
1431 assert(0);
1432 return 0;
1433 }
1434 }
1435
1436 static enum bifrost_texture_format_full
bi_texture_format(nir_alu_type T, enum bifrost_outmod outmod)
1438 {
1439 switch (T) {
1440 case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + outmod;
1441 case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + outmod;
1442 case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16;
1443 case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16;
1444 case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32;
1445 case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32;
1446 default: unreachable("Invalid type for texturing");
1447 }
1448 }
1449
/* Array indices are specified as 32-bit uints and need converting; NIR passes them in the .z component */
1451 static unsigned
bi_emit_array_index(bi_context *ctx, unsigned idx, nir_alu_type T, unsigned *c)
1453 {
1454 /* For (u)int we can just passthrough */
1455 nir_alu_type base = nir_alu_type_get_base_type(T);
1456 if (base == nir_type_int || base == nir_type_uint) {
1457 *c = 2;
1458 return idx;
1459 }
1460
1461 /* Otherwise we convert */
1462 assert(T == nir_type_float16 || T == nir_type_float32);
1463
1464 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
1465 * Texel Selection") defines the layer to be taken from clamp(RNE(r),
 * 0, dt - 1). So we use roundmode RTE; clamping is handled at the data
1467 * structure level */
1468 bi_instruction f2i = {
1469 .type = BI_CONVERT,
1470 .dest = bi_make_temp(ctx),
1471 .dest_type = nir_type_uint32,
1472 .src = { idx },
1473 .src_types = { T },
1474 .swizzle = { { 2 } },
1475 .roundmode = BIFROST_RTE
1476 };
1477
1478 *c = 0;
1479 bi_emit(ctx, f2i);
1480 return f2i.dest;
1481 }
1482
/* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
1484 * 16-bit 8:8 fixed-point format. We lower as:
1485 *
1486 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
1487 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
1488 */
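/* For example, an LOD of 2.5 maps to 2.5 * 256 = 640 = 0x0280 in 8:8. */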
1489
1490 static unsigned
bi_emit_lod_88(bi_context *ctx, unsigned lod, bool fp16)
1492 {
1493 nir_alu_type T = fp16 ? nir_type_float16 : nir_type_float32;
1494
1495 /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
1496 * the max LOD (16 since we cap at 2^16 texture dimensions), and
1497 * preferably small to minimize precision loss */
1498 const float max_lod = 16.0;
1499
1500 /* FMA.f16/f32.sat_signed, saturated, lod, #1.0/max_lod, #0 */
1501 bi_instruction fsat = {
1502 .type = BI_FMA,
1503 .dest = bi_make_temp(ctx),
1504 .dest_type = nir_type_float32,
1505 .src = { lod, BIR_INDEX_CONSTANT, BIR_INDEX_ZERO },
1506 .src_types = { T, nir_type_float32, nir_type_float32 },
1507 .outmod = BIFROST_SAT_SIGNED,
1508 .roundmode = BIFROST_RTE,
1509 .constant = {
1510 .u64 = fui(1.0 / max_lod)
1511 },
1512 };
1513
1514 /* FMA.f32 scaled, saturated, lod, #(max_lod * 256.0), #0 */
1515 bi_instruction fmul = {
1516 .type = BI_FMA,
1517 .dest = bi_make_temp(ctx),
1518 .dest_type = T,
1519 .src = { fsat.dest, BIR_INDEX_CONSTANT, BIR_INDEX_ZERO },
1520 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1521 .roundmode = BIFROST_RTE,
1522 .constant = {
1523 .u64 = fui(max_lod * 256.0)
1524 },
1525 };
1526
1527 /* F32_TO_S32 s32, scaled */
1528 bi_instruction f2i = {
1529 .type = BI_CONVERT,
1530 .dest = bi_make_temp(ctx),
1531 .dest_type = nir_type_int32,
1532 .src = { fmul.dest },
1533 .src_types = { T },
1534 .roundmode = BIFROST_RTZ
1535 };
1536
1537 /* MKVEC.v2i16 s32.h0, #0 */
1538 bi_instruction mkvec = {
1539 .type = BI_SELECT,
1540 .dest = bi_make_temp(ctx),
1541 .dest_type = nir_type_int16,
1542 .src = { f2i.dest, BIR_INDEX_ZERO },
1543 .src_types = { nir_type_int16, nir_type_int16 },
1544 };
1545
1546 bi_emit(ctx, fsat);
1547 bi_emit(ctx, fmul);
1548 bi_emit(ctx, f2i);
1549 bi_emit(ctx, mkvec);
1550
1551 return mkvec.dest;
1552 }
1553
1554 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
1555 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
1556 * TODO: Cube face.
1557 */
1558
1559 static unsigned
bi_emit_lod_cube(bi_context *ctx, unsigned lod)
1561 {
1562 /* MKVEC.v2i16 out, lod.h0, #0 */
1563 bi_instruction mkvec = {
1564 .type = BI_SELECT,
1565 .dest = bi_make_temp(ctx),
1566 .dest_type = nir_type_int16,
1567 .src = { lod, BIR_INDEX_ZERO },
1568 .src_types = { nir_type_int16, nir_type_int16 },
1569 };
1570
1571 bi_emit(ctx, mkvec);
1572
1573 return mkvec.dest;
1574 }
1575
1576 /* The hardware specifies texel offsets and multisample indices together as a
 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
1578 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
1579 * the bits we need and return that to be passed as a staging register. Else we
1580 * return 0 to avoid allocating a data register when everything is zero. */
1581
1582 static unsigned
bi_emit_tex_offset_ms_index(bi_context *ctx, nir_tex_instr *instr)
1584 {
1585 unsigned dest = 0;
1586
1587 /* TODO: offsets */
1588 assert(nir_tex_instr_src_index(instr, nir_tex_src_offset) < 0);
1589
1590 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
1591 if (ms_idx >= 0 &&
1592 (!nir_src_is_const(instr->src[ms_idx].src) ||
1593 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
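/* Shift the multisample index into the top byte (bits [24:31]) of the
 * u8vec4, leaving the offset bytes zero. */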
1594 bi_instruction shl = {
1595 .type = BI_BITWISE,
1596 .op.bitwise = BI_BITWISE_OR,
1597 .dest = bi_make_temp(ctx),
1598 .dest_type = nir_type_uint32,
1599 .src = {
1600 pan_src_index(&instr->src[ms_idx].src),
1601 BIR_INDEX_ZERO,
1602 BIR_INDEX_CONSTANT | 0,
1603 },
1604 .src_types = {
1605 nir_type_uint32,
1606 nir_type_uint32,
1607 nir_type_uint8,
1608 },
1609 .constant.u8[0] = 24,
1610 };
1611
1612 bi_emit(ctx, shl);
1613 dest = shl.dest;
1614 }
1615
1616 return dest;
1617 }
1618
1619 static void
bi_lower_cube_coord(bi_context *ctx, unsigned coord,
                    unsigned *face, unsigned *s, unsigned *t)
1622 {
1623 /* Compute max { |x|, |y|, |z| } */
1624 bi_instruction cubeface1 = {
1625 .type = BI_SPECIAL_FMA,
1626 .op.special = BI_SPECIAL_CUBEFACE1,
1627 .dest = bi_make_temp(ctx),
1628 .dest_type = nir_type_float32,
1629 .src = { coord, coord, coord },
1630 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1631 .swizzle = { {0}, {1}, {2} }
1632 };
1633
1634 /* Calculate packed exponent / face / infinity. In reality this reads
1635 * the destination from cubeface1 but that's handled by lowering */
1636 bi_instruction cubeface2 = {
1637 .type = BI_SPECIAL_ADD,
1638 .op.special = BI_SPECIAL_CUBEFACE2,
1639 .dest = bi_make_temp(ctx),
1640 .dest_type = nir_type_uint32,
1641 .src = { coord, coord, coord },
1642 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1643 .swizzle = { {0}, {1}, {2} }
1644 };
1645
1646 /* Select S coordinate */
1647 bi_instruction cube_ssel = {
1648 .type = BI_SPECIAL_ADD,
1649 .op.special = BI_SPECIAL_CUBE_SSEL,
1650 .dest = bi_make_temp(ctx),
1651 .dest_type = nir_type_float32,
1652 .src = { coord, coord, cubeface2.dest },
1653 .src_types = { nir_type_float32, nir_type_float32, nir_type_uint32 },
1654 .swizzle = { {2}, {0} }
1655 };
1656
1657 /* Select T coordinate */
1658 bi_instruction cube_tsel = {
1659 .type = BI_SPECIAL_ADD,
1660 .op.special = BI_SPECIAL_CUBE_TSEL,
1661 .dest = bi_make_temp(ctx),
1662 .dest_type = nir_type_float32,
1663 .src = { coord, coord, cubeface2.dest },
1664 .src_types = { nir_type_float32, nir_type_float32, nir_type_uint32 },
1665 .swizzle = { {1}, {2} }
1666 };
1667
1668 /* The OpenGL ES specification requires us to transform an input vector
1669 * (x, y, z) to the coordinate, given the selected S/T:
1670 *
1671 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
1672 *
1673 * We implement (s shown, t similar) in a form friendlier to FMA
1674 * instructions, and clamp coordinates at the end for correct
1675 * NaN/infinity handling:
1676 *
1677 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
1678 *
1679 * Take the reciprocal of max{x, y, z}
1680 */
1681
1682 bi_instruction frcp = {
1683 .type = BI_SPECIAL_ADD,
1684 .op.special = BI_SPECIAL_FRCP,
1685 .dest = bi_make_temp(ctx),
1686 .dest_type = nir_type_float32,
1687 .src = { cubeface1.dest },
1688 .src_types = { nir_type_float32 },
1689 };
1690
1691 /* Calculate 0.5 * (1.0 / max{x, y, z}) */
1692 bi_instruction fma1 = {
1693 .type = BI_FMA,
1694 .dest = bi_make_temp(ctx),
1695 .dest_type = nir_type_float32,
1696 .src = { frcp.dest, BIR_INDEX_CONSTANT | 0, BIR_INDEX_ZERO },
1697 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1698 .constant.u64 = 0x3f000000, /* 0.5f */
1699 };
1700
1701 /* Transform the s coordinate */
1702 bi_instruction fma2 = {
1703 .type = BI_FMA,
1704 .outmod = BIFROST_SAT,
1705 .dest = bi_make_temp(ctx),
1706 .dest_type = nir_type_float32,
1707 .src = { fma1.dest, cube_ssel.dest, BIR_INDEX_CONSTANT | 0 },
1708 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1709 .constant.u64 = 0x3f000000, /* 0.5f */
1710 };
1711
1712 /* Transform the t coordinate */
1713 bi_instruction fma3 = {
1714 .type = BI_FMA,
1715 .outmod = BIFROST_SAT,
1716 .dest = bi_make_temp(ctx),
1717 .dest_type = nir_type_float32,
1718 .src = { fma1.dest, cube_tsel.dest, BIR_INDEX_CONSTANT | 0 },
1719 .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
1720 .constant.u64 = 0x3f000000, /* 0.5f */
1721 };
1722
1723 bi_emit(ctx, cubeface1);
1724 bi_emit(ctx, cubeface2);
1725 bi_emit(ctx, cube_ssel);
1726 bi_emit(ctx, cube_tsel);
1727 bi_emit(ctx, frcp);
1728 bi_emit(ctx, fma1);
1729 bi_emit(ctx, fma2);
1730 bi_emit(ctx, fma3);
1731
 * The cube face is stored in bits [29:31]; we don't apply the shift here
1733 * because the TEXS_CUBE and TEXC instructions expect the face index to
1734 * be at this position.
1735 */
1736 *face = cubeface2.dest;
1737 *s = fma2.dest;
1738 *t = fma3.dest;
1739 }
1740
1741 static void
texc_pack_cube_coord(bi_context *ctx, unsigned coord,
                     unsigned *face_s, unsigned *t)
1744 {
1745 unsigned face, s;
1746
1747 bi_lower_cube_coord(ctx, coord, &face, &s, t);
1748
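/* Pack the face index, which lives in bits [29:31] (hence the 0xe0000000
 * mask), together with the low 29 bits of the S coordinate. */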
1749 bi_instruction and1 = {
1750 .type = BI_BITWISE,
1751 .op.bitwise = BI_BITWISE_AND,
1752 .dest = bi_make_temp(ctx),
1753 .dest_type = nir_type_uint32,
1754 .src = { face, BIR_INDEX_CONSTANT | 0, BIR_INDEX_ZERO },
1755 .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint8 },
1756 .constant.u64 = 0xe0000000,
1757 };
1758
1759 bi_instruction and2 = {
1760 .type = BI_BITWISE,
1761 .op.bitwise = BI_BITWISE_AND,
1762 .dest = bi_make_temp(ctx),
1763 .dest_type = nir_type_uint32,
1764 .src = { s, BIR_INDEX_CONSTANT | 0, BIR_INDEX_ZERO },
1765 .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint8 },
1766 .constant.u64 = 0x1fffffff,
1767 };
1768
1769 bi_instruction or = {
1770 .type = BI_BITWISE,
1771 .op.bitwise = BI_BITWISE_OR,
1772 .dest = bi_make_temp(ctx),
1773 .dest_type = nir_type_uint32,
1774 .src = { and1.dest, and2.dest, BIR_INDEX_ZERO },
1775 .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint8 },
1776 };
1777
1778 bi_emit(ctx, and1);
1779 bi_emit(ctx, and2);
1780 bi_emit(ctx, or);
1781
1782 /* packed cube-face + s */
1783 *face_s = or.dest;
1784 }
1785
1786 /* Map to the main texture op used. Some of these (txd in particular) will
1787 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
1788 * sequence). We assume that lowering is handled elsewhere.
1789 */
1790
1791 static enum bifrost_tex_op
bi_tex_op(nir_texop op)
1793 {
1794 switch (op) {
1795 case nir_texop_tex:
1796 case nir_texop_txb:
1797 case nir_texop_txl:
1798 case nir_texop_txd:
1799 case nir_texop_tex_prefetch:
1800 return BIFROST_TEX_OP_TEX;
1801 case nir_texop_txf:
1802 case nir_texop_txf_ms:
1803 case nir_texop_txf_ms_fb:
1804 case nir_texop_txf_ms_mcs:
1805 case nir_texop_tg4:
1806 return BIFROST_TEX_OP_FETCH;
1807 case nir_texop_txs:
1808 case nir_texop_lod:
1809 case nir_texop_query_levels:
1810 case nir_texop_texture_samples:
1811 case nir_texop_samples_identical:
1812 unreachable("should've been lowered");
1813 default:
1814 unreachable("unsupported tex op");
1815 }
1816 }
1817
1818 /* Data registers required by texturing in the order they appear. All are
 * optional; the texture operation descriptor determines which are present.
 * Note that since 3D arrays are not permitted at the API level, Z_COORD and
 * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
1822
1823 enum bifrost_tex_dreg {
1824 BIFROST_TEX_DREG_Z_COORD = 0,
1825 BIFROST_TEX_DREG_Y_DELTAS = 1,
1826 BIFROST_TEX_DREG_LOD = 2,
1827 BIFROST_TEX_DREG_GRDESC_HI = 3,
1828 BIFROST_TEX_DREG_SHADOW = 4,
1829 BIFROST_TEX_DREG_ARRAY = 5,
1830 BIFROST_TEX_DREG_OFFSETMS = 6,
1831 BIFROST_TEX_DREG_SAMPLER = 7,
1832 BIFROST_TEX_DREG_TEXTURE = 8,
1833 BIFROST_TEX_DREG_COUNT,
1834 };
1835
1836 static void
emit_texc(bi_context *ctx, nir_tex_instr *instr)
1838 {
1839 /* TODO: support more with other encodings */
1840 assert(instr->sampler_index < 16);
1841
1842 /* TODO: support more ops */
1843 switch (instr->op) {
1844 case nir_texop_tex:
1845 case nir_texop_txl:
1846 case nir_texop_txb:
1847 case nir_texop_txf:
1848 case nir_texop_txf_ms:
1849 break;
1850 default:
1851 unreachable("Unsupported texture op");
1852 }
1853
1854 bi_instruction tex = {
1855 .type = BI_TEXC,
1856 .dest = pan_dest_index(&instr->dest),
1857 .dest_type = instr->dest_type,
1858 .src_types = {
1859 /* Staging registers */
1860 nir_type_uint32,
1861 nir_type_float32, nir_type_float32,
1862 nir_type_uint32
1863 },
1864 .vector_channels = 4
1865 };
1866
1867 struct bifrost_texture_operation desc = {
1868 .sampler_index_or_mode = instr->sampler_index,
1869 .index = instr->texture_index,
1870 .immediate_indices = 1, /* TODO */
1871 .op = bi_tex_op(instr->op),
1872 .offset_or_bias_disable = false, /* TODO */
1873 .shadow_or_clamp_disable = instr->is_shadow,
1874 .array = instr->is_array,
1875 .dimension = bifrost_tex_format(instr->sampler_dim),
1876 .format = bi_texture_format(instr->dest_type, BIFROST_NONE), /* TODO */
1877 .mask = (1 << tex.vector_channels) - 1
1878 };
1879
1880 switch (desc.op) {
1881 case BIFROST_TEX_OP_TEX:
1882 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
1883 break;
1884 case BIFROST_TEX_OP_FETCH:
1885 /* TODO: gathers */
1886 desc.lod_or_fetch = BIFROST_TEXTURE_FETCH_TEXEL;
1887 break;
1888 default:
1889 unreachable("texture op unsupported");
1890 }
1891
1892 /* 32-bit indices to be allocated as consecutive data registers. */
1893 unsigned dregs[BIFROST_TEX_DREG_COUNT] = { 0 };
1894 unsigned dregs_swiz[BIFROST_TEX_DREG_COUNT] = { 0 };
1895
1896 for (unsigned i = 0; i < instr->num_srcs; ++i) {
1897 unsigned index = pan_src_index(&instr->src[i].src);
1898 unsigned sz = nir_src_bit_size(instr->src[i].src);
1899 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
1900 nir_alu_type T = base | sz;
1901
1902 switch (instr->src[i].src_type) {
1903 case nir_tex_src_coord:
1904 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1905 texc_pack_cube_coord(ctx, index,
1906 &tex.src[1], &tex.src[2]);
1907 } else {
1908 tex.src[1] = index;
1909 tex.src[2] = index;
1910 tex.swizzle[1][0] = 0;
1911 tex.swizzle[2][0] = 1;
1912
1913 unsigned components = nir_src_num_components(instr->src[i].src);
1914 assert(components == 2 || components == 3);
1915
1916 if (components == 2) {
1917 /* nothing to do */
1918 } else if (desc.array) {
1919 /* 2D array */
1920 dregs[BIFROST_TEX_DREG_ARRAY] =
1921 bi_emit_array_index(ctx, index, T,
1922 &dregs_swiz[BIFROST_TEX_DREG_ARRAY]);
1923 } else {
1924 /* 3D */
1925 dregs[BIFROST_TEX_DREG_Z_COORD] = index;
1926 dregs_swiz[BIFROST_TEX_DREG_Z_COORD] = 2;
1927 }
1928 }
1929 break;
1930
1931 case nir_tex_src_lod:
1932 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
1933 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
1934 } else if (desc.op == BIFROST_TEX_OP_TEX) {
1935 assert(base == nir_type_float);
1936
1937 assert(sz == 16 || sz == 32);
1938 dregs[BIFROST_TEX_DREG_LOD] =
1939 bi_emit_lod_88(ctx, index, sz == 16);
1940 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
1941 } else {
1942 assert(desc.op == BIFROST_TEX_OP_FETCH);
1943 assert(base == nir_type_uint || base == nir_type_int);
1944 assert(sz == 16 || sz == 32);
1945
1946 dregs[BIFROST_TEX_DREG_LOD] =
1947 bi_emit_lod_cube(ctx, index);
1948 }
1949
1950 break;
1951
1952 case nir_tex_src_bias:
1953 /* The upper 16 bits are interpreted as a clamp; leave them zero */
1954 assert(desc.op == BIFROST_TEX_OP_TEX);
1955 assert(base == nir_type_float);
1956 assert(sz == 16 || sz == 32);
1957 dregs[BIFROST_TEX_DREG_LOD] =
1958 bi_emit_lod_88(ctx, index, sz == 16);
1959 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
1960 break;
1961
1962 case nir_tex_src_ms_index:
1963 case nir_tex_src_offset:
1964 if (desc.offset_or_bias_disable)
1965 break;
1966
1967 dregs[BIFROST_TEX_DREG_OFFSETMS] =
1968 bi_emit_tex_offset_ms_index(ctx, instr);
1969 if (dregs[BIFROST_TEX_DREG_OFFSETMS])
1970 desc.offset_or_bias_disable = true;
1971 break;
1972
1973 default:
1974 unreachable("Unhandled src type in texc emit");
1975 }
1976 }
1977
1978 /* Allocate data registers contiguously */
1979 bi_instruction combine = {
1980 .type = BI_COMBINE,
1981 .dest_type = nir_type_uint32,
1982 .dest = bi_make_temp(ctx),
1983 .src_types = {
1984 nir_type_uint32, nir_type_uint32,
1985 nir_type_uint32, nir_type_uint32,
1986 },
1987 };
1988
1989 unsigned dreg_index = 0;
1990
1991 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
1992 assert(dreg_index < 4);
1993
1994 if (dregs[i]) {
1995 combine.swizzle[dreg_index][0] = dregs_swiz[i];
1996 combine.src[dreg_index++] = dregs[i];
1997 }
1998 }
1999
2000 if (dreg_index > 1) {
2001 /* Pass combined data registers together */
2002 tex.src[0] = combine.dest;
2003 bi_emit(ctx, combine);
2004
2005 for (unsigned i = 0; i < dreg_index; ++i)
2006 tex.swizzle[0][i] = i;
2007 } else if (dreg_index == 1) {
2008 tex.src[0] = combine.src[0];
2009 tex.swizzle[0][0] = combine.swizzle[0][0];
2010 } else {
2011 tex.src[0] = tex.dest;
2012 }
2013
2014 /* Pass the texture operation descriptor as an inline constant in src[3] */
2015 tex.src[3] = BIR_INDEX_CONSTANT;
2016 memcpy(&tex.constant.u64, &desc, sizeof(desc));
2017
2018 bi_emit(ctx, tex);
2019 }
2020
2021 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D (or cube
2022 * map, TODO) textures. Anything else needs a complete texture op. */
2023
2024 static bool
2025 bi_is_normal_tex(gl_shader_stage stage, nir_tex_instr *instr)
2026 {
2027 if (instr->op == nir_texop_tex)
2028 return true;
2029
2030 if (instr->op != nir_texop_txl)
2031 return false;
2032
2033 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2034 if (lod_idx < 0)
2035 return true;
2036
2037 nir_src lod = instr->src[lod_idx].src;
2038 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
2039 }
2040
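/* Normalize the destination type to base type + bit size, then take the
 * simple path (emit_texs) for plain 2D float textures and the complete
 * path (emit_texc) for everything else */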
2041 static void
2042 emit_tex(bi_context *ctx, nir_tex_instr *instr)
2043 {
2044 nir_alu_type base = nir_alu_type_get_base_type(instr->dest_type);
2045 unsigned sz = nir_dest_bit_size(instr->dest);
2046 instr->dest_type = base | sz;
2047
2048 bool is_normal = bi_is_normal_tex(ctx->stage, instr);
2049 bool is_2d = instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
2050 instr->sampler_dim == GLSL_SAMPLER_DIM_EXTERNAL;
2051 bool is_f = base == nir_type_float && (sz == 16 || sz == 32);
2052
2053 if (is_normal && is_2d && is_f && !instr->is_shadow && !instr->is_array)
2054 emit_texs(ctx, instr);
2055 else
2056 emit_texc(ctx, instr);
2057 }
2058
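/* Dispatch a single NIR instruction to the matching emitter */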
2059 static void
2060 emit_instr(bi_context *ctx, struct nir_instr *instr)
2061 {
2062 switch (instr->type) {
2063 case nir_instr_type_load_const:
2064 emit_load_const(ctx, nir_instr_as_load_const(instr));
2065 break;
2066
2067 case nir_instr_type_intrinsic:
2068 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2069 break;
2070
2071 case nir_instr_type_alu:
2072 emit_alu(ctx, nir_instr_as_alu(instr));
2073 break;
2074
2075 case nir_instr_type_tex:
2076 emit_tex(ctx, nir_instr_as_tex(instr));
2077 break;
2078
2079 case nir_instr_type_jump:
2080 emit_jump(ctx, nir_instr_as_jump(instr));
2081 break;
2082
2083 case nir_instr_type_ssa_undef:
2084 unreachable("should've been lowered");
2085
2086 default:
2087 unreachable("Unhandled instruction type");
2088 break;
2089 }
2090 }
2091
2092
2093
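/* Allocate a fresh basic block along with the predecessor set used for CFG
 * bookkeeping */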
2094 static bi_block *
2095 create_empty_block(bi_context *ctx)
2096 {
2097 bi_block *blk = rzalloc(ctx, bi_block);
2098
2099 blk->base.predecessors = _mesa_set_create(blk,
2100 _mesa_hash_pointer,
2101 _mesa_key_pointer_equal);
2102
2103 return blk;
2104 }
2105
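/* Emit a NIR block, reusing a block created ahead of time by control flow
 * handling (ctx->after_block) when one exists */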
2106 static bi_block *
2107 emit_block(bi_context *ctx, nir_block *block)
2108 {
2109 if (ctx->after_block) {
2110 ctx->current_block = ctx->after_block;
2111 ctx->after_block = NULL;
2112 } else {
2113 ctx->current_block = create_empty_block(ctx);
2114 }
2115
2116 list_addtail(&ctx->current_block->base.link, &ctx->blocks);
2117 list_inithead(&ctx->current_block->base.instructions);
2118
2119 nir_foreach_instr(instr, block) {
2120 emit_instr(ctx, instr);
2121 ++ctx->instruction_count;
2122 }
2123
2124 return ctx->current_block;
2125 }
2126
2127 /* Appends an unconditional branch to the end of the current block, returning a
2128 * pointer so the caller can fill in the details */
2129
2130 static bi_instruction *
2131 bi_emit_branch(bi_context *ctx)
2132 {
2133 bi_instruction branch = {
2134 .type = BI_BRANCH,
2135 .cond = BI_COND_ALWAYS
2136 };
2137
2138 return bi_emit(ctx, branch);
2139 }
2140
2141 /* Sets a condition for a branch by examining the NIR condition. If we're
2142 * familiar with the condition, we unwrap it to fold it into the branch
2143 * instruction. Otherwise, we consume the condition directly. We
2144 * generally use 1-bit booleans which allows us to use small types for
2145 * the conditions.
2146 */
2147
2148 static void
2149 bi_set_branch_cond(bi_instruction *branch, nir_src *cond, bool invert)
2150 {
2151 /* TODO: Try to unwrap instead of always bailing */
2152 branch->src[0] = pan_src_index(cond);
2153 branch->src[1] = BIR_INDEX_ZERO;
2154 branch->src_types[0] = branch->src_types[1] = nir_type_uint |
2155 nir_src_bit_size(*cond);
2156 branch->cond = invert ? BI_COND_EQ : BI_COND_NE;
2157 }
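/* Emit an if/else: a conditional branch on the inverted condition skips the
 * then block (to the else block, or straight to the merge block if the else
 * is empty), and an exit jump at the end of the then block skips the else */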
2158
2159 static void
2160 emit_if(bi_context *ctx, nir_if *nif)
2161 {
2162 bi_block *before_block = ctx->current_block;
2163
2164 /* Speculatively emit the branch, but we can't fill it in until later */
2165 bi_instruction *then_branch = bi_emit_branch(ctx);
2166 bi_set_branch_cond(then_branch, &nif->condition, true);
2167
2168 /* Emit the two subblocks. */
2169 bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
2170 bi_block *end_then_block = ctx->current_block;
2171
2172 /* Emit a jump from the end of the then block over the else block to the merge point */
2173 bi_instruction *then_exit = bi_emit_branch(ctx);
2174
2175 /* Emit second block, and check if it's empty */
2176
2177 int count_in = ctx->instruction_count;
2178 bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
2179 bi_block *end_else_block = ctx->current_block;
2180 ctx->after_block = create_empty_block(ctx);
2181
2182 /* Now that we have the subblocks emitted, fix up the branches */
2183
2184 assert(then_block);
2185 assert(else_block);
2186
2187 if (ctx->instruction_count == count_in) {
2188 /* The else block is empty, so don't emit an exit jump */
2189 bi_remove_instruction(then_exit);
2190 then_branch->branch_target = ctx->after_block;
2191 pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); /* fallthrough */
2192 } else {
2193 then_branch->branch_target = else_block;
2194 then_exit->branch_target = ctx->after_block;
2195 pan_block_add_successor(&end_then_block->base, &then_exit->branch_target->base);
2196 pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); /* fallthrough */
2197 }
2198
2199 pan_block_add_successor(&before_block->base, &then_branch->branch_target->base); /* then_branch */
2200 pan_block_add_successor(&before_block->base, &then_block->base); /* fallthrough */
2201 }
2202
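/* Emit a loop: the continue block doubles as the loop header, the break
 * block becomes the merge point, and an unconditional branch at the end of
 * the body forms the back edge */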
2203 static void
2204 emit_loop(bi_context *ctx, nir_loop *nloop)
2205 {
2206 /* Remember where we are */
2207 bi_block *start_block = ctx->current_block;
2208
2209 bi_block *saved_break = ctx->break_block;
2210 bi_block *saved_continue = ctx->continue_block;
2211
2212 ctx->continue_block = create_empty_block(ctx);
2213 ctx->break_block = create_empty_block(ctx);
2214 ctx->after_block = ctx->continue_block;
2215
2216 /* Emit the body itself */
2217 emit_cf_list(ctx, &nloop->body);
2218
2219 /* Branch back to the loop header (the continue block) */
2220 bi_instruction *br_back = bi_emit_branch(ctx);
2221 br_back->branch_target = ctx->continue_block;
2222 pan_block_add_successor(&start_block->base, &ctx->continue_block->base);
2223 pan_block_add_successor(&ctx->current_block->base, &ctx->continue_block->base);
2224
2225 ctx->after_block = ctx->break_block;
2226
2227 /* Pop off */
2228 ctx->break_block = saved_break;
2229 ctx->continue_block = saved_continue;
2230 ++ctx->loop_count;
2231 }
2232
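/* Emit a NIR control-flow list, returning the first block emitted so callers
 * can use it as a branch target */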
2233 static bi_block *
2234 emit_cf_list(bi_context *ctx, struct exec_list *list)
2235 {
2236 bi_block *start_block = NULL;
2237
2238 foreach_list_typed(nir_cf_node, node, node, list) {
2239 switch (node->type) {
2240 case nir_cf_node_block: {
2241 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2242
2243 if (!start_block)
2244 start_block = block;
2245
2246 break;
2247 }
2248
2249 case nir_cf_node_if:
2250 emit_if(ctx, nir_cf_node_as_if(node));
2251 break;
2252
2253 case nir_cf_node_loop:
2254 emit_loop(ctx, nir_cf_node_as_loop(node));
2255 break;
2256
2257 default:
2258 unreachable("Unknown control flow");
2259 }
2260 }
2261
2262 return start_block;
2263 }
2264
2265 static int
2266 glsl_type_size(const struct glsl_type *type, bool bindless)
2267 {
2268 return glsl_count_attribute_slots(type, false);
2269 }
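/* Lower and optimize the NIR: scalarize, run the usual optimization loop to
 * a fixed point, then apply late lowering and leave SSA form for the backend */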
2270
2271 static void
2272 bi_optimize_nir(nir_shader *nir)
2273 {
2274 bool progress;
2275 unsigned lower_flrp = 16 | 32 | 64;
2276
2277 NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
2278 NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast);
2279
2280 nir_lower_tex_options lower_tex_options = {
2281 .lower_txs_lod = true,
2282 .lower_txp = ~0,
2283 .lower_tex_without_implicit_lod = true,
2284 .lower_txd = true,
2285 };
2286
2287 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
2288 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
2289 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
2290
2291 do {
2292 progress = false;
2293
2294 NIR_PASS(progress, nir, nir_lower_var_copies);
2295 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
2296
2297 NIR_PASS(progress, nir, nir_copy_prop);
2298 NIR_PASS(progress, nir, nir_opt_remove_phis);
2299 NIR_PASS(progress, nir, nir_opt_dce);
2300 NIR_PASS(progress, nir, nir_opt_dead_cf);
2301 NIR_PASS(progress, nir, nir_opt_cse);
2302 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2303 NIR_PASS(progress, nir, nir_opt_algebraic);
2304 NIR_PASS(progress, nir, nir_opt_constant_folding);
2305
2306 if (lower_flrp != 0) {
2307 bool lower_flrp_progress = false;
2308 NIR_PASS(lower_flrp_progress,
2309 nir,
2310 nir_lower_flrp,
2311 lower_flrp,
2312 false /* always_precise */);
2313 if (lower_flrp_progress) {
2314 NIR_PASS(progress, nir,
2315 nir_opt_constant_folding);
2316 progress = true;
2317 }
2318
2319 /* Nothing should rematerialize any flrps, so we only
2320 * need to do this lowering once.
2321 */
2322 lower_flrp = 0;
2323 }
2324
2325 NIR_PASS(progress, nir, nir_opt_undef);
2326 NIR_PASS(progress, nir, nir_undef_to_zero);
2327
2328 NIR_PASS(progress, nir, nir_opt_loop_unroll,
2329 nir_var_shader_in |
2330 nir_var_shader_out |
2331 nir_var_function_temp);
2332 } while (progress);
2333
2334 NIR_PASS(progress, nir, nir_opt_algebraic_late);
2335 NIR_PASS(progress, nir, nir_lower_bool_to_int32);
2336 NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
2337 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
2338 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
2339
2340 /* Take us out of SSA */
2341 NIR_PASS(progress, nir, nir_lower_locals_to_regs);
2342 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
2343 NIR_PASS(progress, nir, nir_convert_from_ssa, true);
2344 }
2345
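/* Compile a NIR shader for Bifrost: lower and optimize the NIR, assign
 * sysvals, emit the backend IR block by block, lower combines, run DCE to a
 * fixed point, schedule, allocate registers, and pack the final binary into
 * program->compiled. */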
2346 panfrost_program *
2347 bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir,
2348 const struct panfrost_compile_inputs *inputs)
2349 {
2350 panfrost_program *program = rzalloc(mem_ctx, panfrost_program);
2351
2352 bifrost_debug = debug_get_option_bifrost_debug();
2353
2354 bi_context *ctx = rzalloc(NULL, bi_context);
2355 ctx->nir = nir;
2356 ctx->stage = nir->info.stage;
2357 ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
2358 ctx->is_blend = inputs->is_blend;
2359 ctx->blend_desc = inputs->blend.bifrost_blend_desc;
2360 memcpy(ctx->blend_constants, inputs->blend.constants, sizeof(ctx->blend_constants));
2361 list_inithead(&ctx->blocks);
2362
2363 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
2364 * (so we don't accidentally duplicate the epilogue since mesa/st has
2365 * messed with our I/O quite a bit already) */
2366
2367 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
2368
2369 if (ctx->stage == MESA_SHADER_VERTEX) {
2370 NIR_PASS_V(nir, nir_lower_viewport_transform);
2371 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
2372 }
2373
2374 NIR_PASS_V(nir, nir_split_var_copies);
2375 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
2376 NIR_PASS_V(nir, nir_lower_var_copies);
2377 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
2378 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2379 glsl_type_size, 0);
2380 NIR_PASS_V(nir, nir_lower_ssbo);
2381 NIR_PASS_V(nir, pan_nir_lower_zs_store);
2382 // TODO: re-enable when fp16 is flipped on
2383 // NIR_PASS_V(nir, nir_lower_mediump_outputs);
2384
2385 bi_optimize_nir(nir);
2386
2387 NIR_PASS_V(nir, pan_nir_reorder_writeout);
2388
2389 if (bifrost_debug & BIFROST_DBG_SHADERS && !nir->info.internal) {
2390 nir_print_shader(nir, stdout);
2391 }
2392
2393 panfrost_nir_assign_sysvals(&ctx->sysvals, ctx, nir);
2394 program->sysval_count = ctx->sysvals.sysval_count;
2395 memcpy(program->sysvals, ctx->sysvals.sysvals, sizeof(ctx->sysvals.sysvals[0]) * ctx->sysvals.sysval_count);
2396 ctx->blend_types = program->blend_types;
2397
2398 nir_foreach_function(func, nir) {
2399 if (!func->impl)
2400 continue;
2401
2402 ctx->impl = func->impl;
2403 emit_cf_list(ctx, &func->impl->body);
2404 break; /* TODO: Multi-function shaders */
2405 }
2406
2407 unsigned block_source_count = 0;
2408
2409 bi_foreach_block(ctx, _block) {
2410 bi_block *block = (bi_block *) _block;
2411
2412 /* Name blocks now that we're done emitting so the order is
2413 * consistent */
2414 block->base.name = block_source_count++;
2415
2416 bi_lower_combine(ctx, block);
2417 }
2418
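/* Run dead code elimination to a fixed point */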
2419 bool progress = false;
2420
2421 do {
2422 progress = false;
2423
2424 bi_foreach_block(ctx, _block) {
2425 bi_block *block = (bi_block *) _block;
2426 progress |= bi_opt_dead_code_eliminate(ctx, block);
2427 }
2428 } while(progress);
2429
2430 if (bifrost_debug & BIFROST_DBG_SHADERS && !nir->info.internal)
2431 bi_print_shader(ctx, stdout);
2432 bi_schedule(ctx);
2433 bi_register_allocate(ctx);
2434 if (bifrost_debug & BIFROST_DBG_SHADERS && !nir->info.internal)
2435 bi_print_shader(ctx, stdout);
2436
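/* Pack the scheduled, register-allocated IR into the final binary */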
2437 util_dynarray_init(&program->compiled, NULL);
2438 bi_pack(ctx, &program->compiled);
2439
2440 memcpy(program->blend_ret_offsets, ctx->blend_ret_offsets, sizeof(program->blend_ret_offsets));
2441
2442 if (bifrost_debug & BIFROST_DBG_SHADERS && !nir->info.internal)
2443 disassemble_bifrost(stdout, program->compiled.data, program->compiled.size, true);
2444
2445 program->tls_size = ctx->tls_size;
2446
2447 ralloc_free(ctx);
2448
2449 return program;
2450 }
2451