1 /*
2 * Copyright © 2015 Rob Clark <robclark@freedesktop.org>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #include <stdarg.h>
10
11 #include "util/u_math.h"
12 #include "util/u_memory.h"
13 #include "util/u_string.h"
14
15 #include "ir3_compiler.h"
16 #include "ir3_image.h"
17 #include "ir3_nir.h"
18 #include "ir3_shader.h"
19
20 #include "instr-a3xx.h"
21 #include "ir3.h"
22 #include "ir3_context.h"
23
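/* Note: the *_rpt helpers below operate on groups of repeated instructions.
 * rpt_instr() replicates one instruction across all nrpt slots of an
 * ir3_instruction_rpt, the create_immed*_rpt() wrappers do the same for
 * immediates, and the set_*() helpers apply a flag/rounding/condition to
 * every instruction in such a group.
 */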
24 static struct ir3_instruction_rpt
25 rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
26 {
27 struct ir3_instruction_rpt dst = {{0}};
28
29 for (unsigned i = 0; i < nrpt; ++i)
30 dst.rpts[i] = instr;
31
32 return dst;
33 }
34
35 static void
36 cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
37 unsigned n)
38 {
39 for (unsigned i = 0; i < n; ++i)
40 dst[i] = instrs[i];
41 }
42
43 static struct ir3_instruction_rpt
44 create_immed_rpt(struct ir3_builder *build, unsigned nrpt, unsigned val)
45 {
46 return rpt_instr(create_immed(build, val), nrpt);
47 }
48
49 static struct ir3_instruction_rpt
50 create_immed_shared_rpt(struct ir3_builder *build, unsigned nrpt, uint32_t val,
51 bool shared)
52 {
53 return rpt_instr(create_immed_shared(build, val, shared), nrpt);
54 }
55
56 static struct ir3_instruction_rpt
57 create_immed_typed_rpt(struct ir3_builder *build, unsigned nrpt, unsigned val,
58 type_t type)
59 {
60 return rpt_instr(create_immed_typed(build, val, type), nrpt);
61 }
62
63 static inline struct ir3_instruction_rpt
64 create_immed_typed_shared_rpt(struct ir3_builder *build, unsigned nrpt,
65 uint32_t val, type_t type, bool shared)
66 {
67 return rpt_instr(create_immed_typed_shared(build, val, type, shared), nrpt);
68 }
69
70 static void
71 set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
72 ir3_instruction_flags flags)
73 {
74 for (unsigned i = 0; i < n; ++i)
75 instrs[i]->flags |= flags;
76 }
77
78 static void
79 set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
80 {
81 for (unsigned i = 0; i < n; ++i)
82 instrs[i]->cat1.round = round;
83 }
84
85 static void
86 set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
87 unsigned condition)
88 {
89 for (unsigned i = 0; i < n; ++i)
90 instrs[i]->cat2.condition = condition;
91 }
92
93 static void
94 set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
95 ir3_register_flags flags)
96 {
97 for (unsigned i = 0; i < n; ++i)
98 instrs[i]->dsts[0]->flags |= flags;
99 }
100
101 void
102 ir3_handle_nonuniform(struct ir3_instruction *instr,
103 nir_intrinsic_instr *intrin)
104 {
105 if (nir_intrinsic_has_access(intrin) &&
106 (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
107 instr->flags |= IR3_INSTR_NONUNIF;
108 }
109 }
110
111 void
112 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
113 {
114 nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
115 if (!intrin)
116 return;
117
118 instr->flags |= IR3_INSTR_B;
119 instr->cat6.base = nir_intrinsic_desc_set(intrin);
120 }
121
122 static struct ir3_instruction *
123 create_input(struct ir3_context *ctx, unsigned compmask)
124 {
125 struct ir3_instruction *in;
126
127 in = ir3_instr_create_at(ir3_before_terminator(ctx->in_block),
128 OPC_META_INPUT, 1, 0);
129 in->input.sysval = ~0;
130 __ssa_dst(in)->wrmask = compmask;
131
132 array_insert(ctx->ir, ctx->ir->inputs, in);
133
134 return in;
135 }
136
137 static struct ir3_instruction_rpt
138 create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
139 unsigned n, unsigned ncomp)
140 {
141 struct ir3_builder *build = &ctx->build;
142 struct ir3_instruction_rpt instr;
143 /* packed inloc is fixed up later: */
144 struct ir3_instruction_rpt inloc;
145
146 for (unsigned i = 0; i < ncomp; i++)
147 inloc.rpts[i] = create_immed(build, n + i);
148
149 if (coord) {
150 instr =
151 ir3_BARY_F_rpt(build, ncomp, inloc, 0, rpt_instr(coord, ncomp), 0);
152 } else if (ctx->compiler->flat_bypass) {
153 if (ctx->compiler->gen >= 6) {
154 instr = ir3_FLAT_B_rpt(build, ncomp, inloc, 0, inloc, 0);
155 } else {
156 for (unsigned i = 0; i < ncomp; i++) {
157 instr.rpts[i] =
158 ir3_LDLV(build, inloc.rpts[i], 0, create_immed(build, 1), 0);
159 instr.rpts[i]->cat6.type = TYPE_U32;
160 instr.rpts[i]->cat6.iim_val = 1;
161 }
162 }
163 } else {
164 instr = ir3_BARY_F_rpt(build, ncomp, inloc, 0,
165 rpt_instr(ctx->ij[IJ_PERSP_PIXEL], ncomp), 0);
166
167 for (unsigned i = 0; i < ncomp; i++)
168 instr.rpts[i]->srcs[1]->wrmask = 0x3;
169 }
170
171 return instr;
172 }
173
174 static struct ir3_instruction *
175 create_driver_param(struct ir3_context *ctx, uint32_t dp)
176 {
177 /* first four vec4 sysvals reserved for UBOs: */
178 /* NOTE: dp is in scalar, but there can be >4 dp components: */
179 unsigned r = ir3_const_reg(ir3_const_state(ctx->so),
180 IR3_CONST_ALLOC_DRIVER_PARAMS, dp);
181 return create_uniform(&ctx->build, r);
182 }
183
184 static struct ir3_instruction *
185 create_driver_param_indirect(struct ir3_context *ctx, uint32_t dp,
186 struct ir3_instruction *address)
187 {
188 /* first four vec4 sysvals reserved for UBOs: */
189 /* NOTE: dp is in scalar, but there can be >4 dp components: */
190 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
191 unsigned n =
192 const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
193 return create_uniform_indirect(&ctx->build, n * 4 + dp, TYPE_U32, address);
194 }
195
196 /*
197 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
198 * 32-bit registers. We use NIR's 1-bit integers to represent bools, and
199 * trust that we will only see and/or/xor on those 1-bit values, so we can
200 * safely store NIR i1s in a 32-bit reg that always contains either a 1 or
201 * a 0.
202 */
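/* In practice that means e.g. a cmps.f/cmps.s result can be used directly as
 * a NIR boolean, and only a handful of ops (mov/and/or/xor/not/bcsel, see the
 * assert at the end of emit_alu()) are expected to consume such 1-bit values.
 */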
203
204 /*
205 * alu/sfu instructions:
206 */
207
208 static struct ir3_instruction_rpt
209 create_cov(struct ir3_context *ctx, unsigned nrpt,
210 struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
211 {
212 type_t src_type, dst_type;
213
214 switch (op) {
215 case nir_op_f2f32:
216 case nir_op_f2f16_rtne:
217 case nir_op_f2f16_rtz:
218 case nir_op_f2f16:
219 case nir_op_f2i32:
220 case nir_op_f2i16:
221 case nir_op_f2i8:
222 case nir_op_f2u32:
223 case nir_op_f2u16:
224 case nir_op_f2u8:
225 switch (src_bitsize) {
226 case 32:
227 src_type = TYPE_F32;
228 break;
229 case 16:
230 src_type = TYPE_F16;
231 break;
232 default:
233 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
234 }
235 break;
236
237 case nir_op_i2f32:
238 case nir_op_i2f16:
239 case nir_op_i2i32:
240 case nir_op_i2i16:
241 case nir_op_i2i8:
242 switch (src_bitsize) {
243 case 32:
244 src_type = TYPE_S32;
245 break;
246 case 16:
247 src_type = TYPE_S16;
248 break;
249 case 8:
250 src_type = TYPE_U8;
251 break;
252 default:
253 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
254 }
255 break;
256
257 case nir_op_u2f32:
258 case nir_op_u2f16:
259 case nir_op_u2u32:
260 case nir_op_u2u16:
261 case nir_op_u2u8:
262 switch (src_bitsize) {
263 case 32:
264 src_type = TYPE_U32;
265 break;
266 case 16:
267 src_type = TYPE_U16;
268 break;
269 case 8:
270 src_type = TYPE_U8;
271 break;
272 default:
273 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
274 }
275 break;
276
277 case nir_op_b2f16:
278 case nir_op_b2f32:
279 case nir_op_b2i8:
280 case nir_op_b2i16:
281 case nir_op_b2i32:
282 src_type = ctx->compiler->bool_type;
283 break;
284
285 default:
286 ir3_context_error(ctx, "invalid conversion op: %u", op);
287 }
288
289 switch (op) {
290 case nir_op_f2f32:
291 case nir_op_i2f32:
292 case nir_op_u2f32:
293 case nir_op_b2f32:
294 dst_type = TYPE_F32;
295 break;
296
297 case nir_op_f2f16_rtne:
298 case nir_op_f2f16_rtz:
299 case nir_op_f2f16:
300 case nir_op_i2f16:
301 case nir_op_u2f16:
302 case nir_op_b2f16:
303 dst_type = TYPE_F16;
304 break;
305
306 case nir_op_f2i32:
307 case nir_op_i2i32:
308 case nir_op_b2i32:
309 dst_type = TYPE_S32;
310 break;
311
312 case nir_op_f2i16:
313 case nir_op_i2i16:
314 case nir_op_b2i16:
315 dst_type = TYPE_S16;
316 break;
317
318 case nir_op_f2i8:
319 case nir_op_i2i8:
320 case nir_op_b2i8:
321 dst_type = TYPE_U8;
322 break;
323
324 case nir_op_f2u32:
325 case nir_op_u2u32:
326 dst_type = TYPE_U32;
327 break;
328
329 case nir_op_f2u16:
330 case nir_op_u2u16:
331 dst_type = TYPE_U16;
332 break;
333
334 case nir_op_f2u8:
335 case nir_op_u2u8:
336 dst_type = TYPE_U8;
337 break;
338
339 default:
340 ir3_context_error(ctx, "invalid conversion op: %u", op);
341 }
342
343 if (src_type == dst_type)
344 return src;
345
346 /* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
347 * is used to achieve the result.
348 */
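/* E.g. for nir_op_u2u32 of an 8-bit source this emits roughly
 *   and.b dst, src, 0xff
 * with the destination flagged as the full-width destination type, so the
 * upper bits end up zeroed without an explicit cov.
 */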
349 if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
350 struct ir3_instruction_rpt mask =
351 create_immed_typed_rpt(&ctx->build, nrpt, 0xff, TYPE_U8);
352 struct ir3_instruction_rpt cov =
353 ir3_AND_B_rpt(&ctx->build, nrpt, src, 0, mask, 0);
354 set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
355 return cov;
356 }
357
358 /* Conversion of 8-bit values into floating-point values doesn't work with
359 * a simple `cov`, instead the 8-bit values first have to be converted into
360 * corresponding 16-bit values and converted from there.
361 */
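/* E.g. u2f32 of an 8-bit value is emitted as an and.b with 0xff (zero-extend
 * into a half register) followed by a TYPE_U16 -> TYPE_F32 cov, while i2f32
 * goes through a TYPE_U8 -> TYPE_S16 cov and then a TYPE_S16 -> TYPE_F32 cov.
 */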
362 if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
363 assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
364 op == nir_op_u2f32 || op == nir_op_i2f32);
365
366 struct ir3_instruction_rpt cov;
367 if (op == nir_op_u2f16 || op == nir_op_u2f32) {
368 struct ir3_instruction_rpt mask =
369 create_immed_typed_rpt(&ctx->build, nrpt, 0xff, TYPE_U8);
370 cov = ir3_AND_B_rpt(&ctx->build, nrpt, src, 0, mask, 0);
371 set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
372 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, TYPE_U16, dst_type);
373 } else {
374 cov = ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U8, TYPE_S16);
375 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, TYPE_S16, dst_type);
376 }
377 return cov;
378 }
379
380 /* Conversion of floating-point values to 8-bit values also doesn't work
381 * through a single `cov`, instead the conversion has to go through the
382 * corresponding 16-bit type that's then truncated.
383 */
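/* E.g. f2u8 is emitted as a cov to TYPE_U16 followed by a truncating
 * TYPE_U16 -> TYPE_U8 cov, and f2i8 does the same through TYPE_S16.
 */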
384 if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
385 assert(op == nir_op_f2u8 || op == nir_op_f2i8);
386
387 type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
388 struct ir3_instruction_rpt cov =
389 ir3_COV_rpt(&ctx->build, nrpt, src, src_type, intermediate_type);
390 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, intermediate_type, TYPE_U8);
391 return cov;
392 }
393
394 struct ir3_instruction_rpt cov =
395 ir3_COV_rpt(&ctx->build, nrpt, src, src_type, dst_type);
396
397 if (op == nir_op_f2f16_rtne) {
398 set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
399 } else if (op == nir_op_f2f16_rtz) {
400 set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
401 } else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
402 unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
403 nir_alu_type type =
404 dst_type == TYPE_F16 ? nir_type_float16 : nir_type_float32;
405 nir_rounding_mode rounding_mode =
406 nir_get_rounding_mode_from_float_controls(execution_mode, type);
407 if (rounding_mode == nir_rounding_mode_rtne)
408 set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
409 else if (rounding_mode == nir_rounding_mode_rtz)
410 set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
411 }
412
413 return cov;
414 }
415
416 /* For shift instructions, NIR always has the shift amount as a 32-bit integer */
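/* E.g. an ishl on 16-bit operands gets its 32-bit NIR shift amount narrowed
 * with a u32 -> u16 cov before being fed to shl.b; 32-bit shifts pass the
 * amount through unchanged.
 */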
417 static struct ir3_instruction_rpt
418 resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
419 struct ir3_instruction_rpt src, unsigned bs)
420 {
421 if (bs == 16)
422 return ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U32, TYPE_U16);
423 else if (bs == 8)
424 return ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U32, TYPE_U8);
425 else
426 return src;
427 }
428
429 static void
430 emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
431 struct ir3_instruction **dst,
432 struct ir3_instruction **src)
433 {
434 if (ctx->compiler->has_compliant_dp4acc) {
435 dst[0] = ir3_DP4ACC(&ctx->build, src[0], 0, src[1], 0, src[2], 0);
436
437 /* This is actually the LHS signedness attribute.
438 * IR3_SRC_UNSIGNED ~ unsigned LHS (i.e. OpUDot and OpUDotAccSat).
439 */
440 if (alu->op == nir_op_udot_4x8_uadd ||
441 alu->op == nir_op_udot_4x8_uadd_sat) {
442 dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
443 } else {
444 dst[0]->cat3.signedness = IR3_SRC_MIXED;
445 }
446
447 /* This is actually the RHS signedness attribute.
448 * IR3_SRC_PACKED_HIGH ~ signed RHS (i.e. OpSDot and OpSDotAccSat).
449 */
450 if (alu->op == nir_op_sdot_4x8_iadd ||
451 alu->op == nir_op_sdot_4x8_iadd_sat) {
452 dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
453 } else {
454 dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
455 }
456
457 if (alu->op == nir_op_udot_4x8_uadd_sat ||
458 alu->op == nir_op_sdot_4x8_iadd_sat ||
459 alu->op == nir_op_sudot_4x8_iadd_sat) {
460 dst[0]->flags |= IR3_INSTR_SAT;
461 }
462 return;
463 }
464
465 struct ir3_instruction *accumulator = NULL;
466 if (alu->op == nir_op_udot_4x8_uadd_sat) {
467 accumulator = create_immed(&ctx->build, 0);
468 } else {
469 accumulator = src[2];
470 }
471
472 dst[0] = ir3_DP4ACC(&ctx->build, src[0], 0, src[1], 0, accumulator, 0);
473
474 if (alu->op == nir_op_udot_4x8_uadd ||
475 alu->op == nir_op_udot_4x8_uadd_sat) {
476 dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
477 } else {
478 dst[0]->cat3.signedness = IR3_SRC_MIXED;
479 }
480
481 /* For some reason (sat) doesn't work in the unsigned case, so
482 * we have to emulate it.
483 */
484 if (alu->op == nir_op_udot_4x8_uadd_sat) {
485 dst[0] = ir3_ADD_U(&ctx->build, dst[0], 0, src[2], 0);
486 dst[0]->flags |= IR3_INSTR_SAT;
487 } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
488 dst[0]->flags |= IR3_INSTR_SAT;
489 }
490 }
491
492 static void
493 emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
494 struct ir3_instruction **dst,
495 struct ir3_instruction **src)
496 {
497 int signedness;
498 if (alu->op == nir_op_udot_4x8_uadd ||
499 alu->op == nir_op_udot_4x8_uadd_sat) {
500 signedness = IR3_SRC_UNSIGNED;
501 } else {
502 signedness = IR3_SRC_MIXED;
503 }
504
505 struct ir3_instruction *accumulator = NULL;
506 if (alu->op == nir_op_udot_4x8_uadd_sat ||
507 alu->op == nir_op_sudot_4x8_iadd_sat) {
508 accumulator = create_immed(&ctx->build, 0);
509 } else {
510 accumulator = src[2];
511 }
512
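/* Emulate the 4x8 dot product with two dp2acc: the first presumably covers
 * the packed low halves of the sources, the second the high halves, with the
 * first result chained in as the second one's accumulator.
 */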
513 dst[0] = ir3_DP2ACC(&ctx->build, src[0], 0, src[1], 0, accumulator, 0);
514 dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
515 dst[0]->cat3.signedness = signedness;
516
517 dst[0] = ir3_DP2ACC(&ctx->build, src[0], 0, src[1], 0, dst[0], 0);
518 dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
519 dst[0]->cat3.signedness = signedness;
520
521 if (alu->op == nir_op_udot_4x8_uadd_sat) {
522 dst[0] = ir3_ADD_U(&ctx->build, dst[0], 0, src[2], 0);
523 dst[0]->flags |= IR3_INSTR_SAT;
524 } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
525 dst[0] = ir3_ADD_S(&ctx->build, dst[0], 0, src[2], 0);
526 dst[0]->flags |= IR3_INSTR_SAT;
527 }
528 }
529
530 static bool
531 all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
532 {
533 for (unsigned i = 0; i < n; i++) {
534 if (!is_sat_compatible(instrs[i]->opc))
535 return false;
536 }
537
538 return true;
539 }
540
541 /* Is src the only use of its def, taking components into account? */
542 static bool
543 is_unique_use(nir_src *src)
544 {
545 nir_def *def = src->ssa;
546
547 if (list_is_singular(&def->uses))
548 return true;
549
550 nir_component_mask_t src_read_mask = nir_src_components_read(src);
551
552 nir_foreach_use (use, def) {
553 if (use == src)
554 continue;
555
556 if (nir_src_components_read(use) & src_read_mask)
557 return false;
558 }
559
560 return true;
561 }
562
563 static void
564 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
565 {
566 const nir_op_info *info = &nir_op_infos[alu->op];
567 struct ir3_instruction_rpt dst, src[info->num_inputs];
568 unsigned bs[info->num_inputs]; /* bit size */
569 struct ir3_builder *b = &ctx->build;
570 unsigned dst_sz;
571 unsigned dst_bitsize = ir3_bitsize(ctx, alu->def.bit_size);
572 type_t dst_type = type_uint_size(dst_bitsize);
573
574 dst_sz = alu->def.num_components;
575 assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));
576
577 bool use_shared = !alu->def.divergent &&
578 ctx->compiler->has_scalar_alu &&
579 /* it probably isn't worth emulating these with scalar-only ops */
580 alu->op != nir_op_udot_4x8_uadd &&
581 alu->op != nir_op_udot_4x8_uadd_sat &&
582 alu->op != nir_op_sdot_4x8_iadd &&
583 alu->op != nir_op_sdot_4x8_iadd_sat &&
584 alu->op != nir_op_sudot_4x8_iadd &&
585 alu->op != nir_op_sudot_4x8_iadd_sat &&
586 /* not supported in HW, we have to fall back to normal registers */
587 alu->op != nir_op_ffma;
588
589 struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
590
591 /* Vectors are special in that they have non-scalarized writemasks,
592 * and just take the first swizzle channel for each argument in
593 * order into each writemask channel.
594 */
595 if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
596 (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
597 (alu->op == nir_op_vec16)) {
598 for (int i = 0; i < info->num_inputs; i++) {
599 nir_alu_src *asrc = &alu->src[i];
600 struct ir3_instruction *src =
601 ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
602 compile_assert(ctx, src);
603 def[i] = ir3_MOV(b, src, dst_type);
604 }
605
606 ir3_instr_create_rpt(def, info->num_inputs);
607 ir3_put_def(ctx, &alu->def);
608 return;
609 }
610
611 assert(dst_sz <= ARRAY_SIZE(src[0].rpts));
612
613 for (int i = 0; i < info->num_inputs; i++) {
614 nir_alu_src *asrc = &alu->src[i];
615 struct ir3_instruction *const *input_src =
616 ir3_get_src_shared(ctx, &asrc->src, use_shared);
617 bs[i] = nir_src_bit_size(asrc->src);
618
619 for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
620 src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
621 compile_assert(ctx, src[i].rpts[rpt]);
622 }
623 }
624
625 switch (alu->op) {
626 case nir_op_mov:
627 dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
628 break;
629
630 case nir_op_f2f32:
631 case nir_op_f2f16_rtne:
632 case nir_op_f2f16_rtz:
633 case nir_op_f2f16:
634 case nir_op_f2i32:
635 case nir_op_f2i16:
636 case nir_op_f2i8:
637 case nir_op_f2u32:
638 case nir_op_f2u16:
639 case nir_op_f2u8:
640 case nir_op_i2f32:
641 case nir_op_i2f16:
642 case nir_op_i2i32:
643 case nir_op_i2i16:
644 case nir_op_i2i8:
645 case nir_op_u2f32:
646 case nir_op_u2f16:
647 case nir_op_u2u32:
648 case nir_op_u2u16:
649 case nir_op_u2u8:
650 case nir_op_b2f16:
651 case nir_op_b2f32:
652 case nir_op_b2i8:
653 case nir_op_b2i16:
654 case nir_op_b2i32:
655 dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
656 break;
657
658 case nir_op_fquantize2f16:
659 dst = create_cov(ctx, dst_sz,
660 create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
661 16, nir_op_f2f32);
662 break;
663
664 case nir_op_b2b1:
665 /* b2b1 will appear when translating from
666 *
667 * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
668 * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
669 *
670 * A negate can turn those into a 1 or 0 for us.
671 */
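/* E.g. (neg) of ~0 (-1) gives 1 and (neg) of 0 stays 0, so a single
 * absneg.s with IR3_REG_SNEG produces the 1-bit bool directly.
 */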
672 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
673 break;
674
675 case nir_op_b2b32:
676 /* b2b32 will appear when converting our 1-bit bools to a store_shared
677 * argument.
678 *
679 * A negate can turn those into a ~0 for us.
680 */
681 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
682 break;
683
684 case nir_op_fneg:
685 dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
686 break;
687 case nir_op_fabs:
688 dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
689 break;
690 case nir_op_fmax:
691 dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
692 break;
693 case nir_op_fmin:
694 dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
695 break;
696 case nir_op_fsat:
697 /* if there is just a single use of the src, and it supports the
698 * (sat) bit, we can just fold the (sat) flag back to the
699 * src instruction and create a mov. This is easier for cp
700 * to eliminate.
701 */
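/* E.g. fsat(fadd(a, b)) where the add has no other users becomes a
 * (sat)-flagged add.f plus a plain mov that copy-propagation should be able
 * to remove.
 */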
702 if (all_sat_compatible(src[0].rpts, dst_sz) &&
703 is_unique_use(&alu->src[0].src)) {
704 set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
705 dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
706 } else {
707 /* otherwise generate a max.f that saturates.. blob does
708 * similar (generating a cat2 mov using max.f)
709 */
710 dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
711 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
712 }
713 break;
714 case nir_op_fmul:
715 dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
716 break;
717 case nir_op_fadd:
718 dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
719 break;
720 case nir_op_fsub:
721 dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
722 break;
723 case nir_op_ffma:
724 dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
725 break;
726 case nir_op_flt:
727 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
728 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
729 break;
730 case nir_op_fge:
731 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
732 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
733 break;
734 case nir_op_feq:
735 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
736 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
737 break;
738 case nir_op_fneu:
739 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
740 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
741 break;
742 case nir_op_fceil:
743 dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
744 break;
745 case nir_op_ffloor:
746 dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
747 break;
748 case nir_op_ftrunc:
749 dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
750 break;
751 case nir_op_fround_even:
752 dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
753 break;
754 case nir_op_fsign:
755 dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
756 break;
757
758 case nir_op_fsin:
759 dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
760 break;
761 case nir_op_fcos:
762 dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
763 break;
764 case nir_op_frsq:
765 dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
766 break;
767 case nir_op_frcp:
768 assert(dst_sz == 1);
769 dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
770 break;
771 case nir_op_flog2:
772 dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
773 break;
774 case nir_op_fexp2:
775 dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
776 break;
777 case nir_op_fsqrt:
778 dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
779 break;
780
781 case nir_op_iabs:
782 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
783 break;
784 case nir_op_iadd:
785 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
786 break;
787 case nir_op_iadd3:
788 if (use_shared) {
789 /* sad doesn't support the scalar ALU so expand to two adds so that we
790 * don't unnecessarily fall back to non-earlypreamble.
791 */
792 struct ir3_instruction_rpt add01 =
793 ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
794
795 if (is_half(src[0].rpts[0])) {
796 set_dst_flags(add01.rpts, dst_sz, IR3_REG_HALF);
797 }
798
799 dst = ir3_ADD_U_rpt(b, dst_sz, add01, 0, src[2], 0);
800 } else {
801 if (is_half(src[0].rpts[0])) {
802 dst = ir3_SAD_S16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
803 } else {
804 dst = ir3_SAD_S32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
805 }
806 }
807 break;
808 case nir_op_ihadd:
809 dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
810 set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
811 break;
812 case nir_op_uhadd:
813 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
814 set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
815 break;
816 case nir_op_iand:
817 dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
818 break;
819 case nir_op_imax:
820 dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
821 break;
822 case nir_op_umax:
823 dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
824 break;
825 case nir_op_imin:
826 dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
827 break;
828 case nir_op_umin:
829 dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
830 break;
831 case nir_op_umul_low:
832 dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
833 break;
834 case nir_op_imadsh_mix16:
835 if (use_shared) {
836 struct ir3_instruction_rpt sixteen =
837 create_immed_shared_rpt(b, dst_sz, 16, true);
838 struct ir3_instruction_rpt src1 =
839 ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
840 struct ir3_instruction_rpt mul =
841 ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
842 dst = ir3_ADD_U_rpt(b, dst_sz,
843 ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
844 src[2], 0);
845 } else {
846 dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
847 }
848 break;
849 case nir_op_imad24_ir3:
850 if (use_shared) {
851 dst = ir3_ADD_U_rpt(b, dst_sz,
852 ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
853 0, src[2], 0);
854 } else {
855 dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
856 }
857 break;
858 case nir_op_imul:
859 compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
860 dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
861 break;
862 case nir_op_imul24:
863 dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
864 break;
865 case nir_op_ineg:
866 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
867 break;
868 case nir_op_inot:
869 if (bs[0] == 1) {
870 struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
871 b, dst_sz, 1, ctx->compiler->bool_type, use_shared);
872 dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
873 } else {
874 dst = ir3_NOT_B_rpt(b, dst_sz, src[0], 0);
875 }
876 break;
877 case nir_op_ior:
878 dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
879 break;
880 case nir_op_ishl:
881 dst = ir3_SHL_B_rpt(b, dst_sz, src[0], 0,
882 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
883 break;
884 case nir_op_ishr:
885 dst = ir3_ASHR_B_rpt(b, dst_sz, src[0], 0,
886 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
887 break;
888 case nir_op_isub:
889 dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
890 break;
891 case nir_op_ixor:
892 dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
893 break;
894 case nir_op_ushr:
895 dst = ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
896 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
897 break;
898 case nir_op_shrm_ir3:
899 dst = ir3_SHRM_rpt(b, dst_sz,
900 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
901 src[0], 0, src[2], 0);
902 break;
903 case nir_op_shlm_ir3:
904 dst = ir3_SHLM_rpt(b, dst_sz,
905 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
906 src[0], 0, src[2], 0);
907 break;
908 case nir_op_shrg_ir3:
909 dst = ir3_SHRG_rpt(b, dst_sz,
910 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
911 src[0], 0, src[2], 0);
912 break;
913 case nir_op_shlg_ir3:
914 dst = ir3_SHLG_rpt(b, dst_sz,
915 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
916 src[0], 0, src[2], 0);
917 break;
918 case nir_op_andg_ir3:
919 dst = ir3_ANDG_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
920 break;
921 case nir_op_ilt:
922 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
923 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
924 break;
925 case nir_op_ige:
926 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
927 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
928 break;
929 case nir_op_ieq:
930 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
931 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
932 break;
933 case nir_op_ine:
934 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
935 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
936 break;
937 case nir_op_ult:
938 dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
939 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
940 break;
941 case nir_op_uge:
942 dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
943 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
944 break;
945
946 case nir_op_icsel_eqz:
947 case nir_op_bcsel: {
948 struct ir3_instruction_rpt conds;
949
950 compile_assert(ctx, bs[1] == bs[2]);
951
952 /* TODO: repeat the covs when possible. */
953 for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
954 struct ir3_instruction *cond =
955 ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);
956
957 /* The condition's size has to match the other two arguments' size, so
958 * convert down if necessary.
959 *
960 * Single hashtable is fine, because the conversion will either be
961 * 16->32 or 32->16, but never both
962 */
963 if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
964 struct hash_entry *prev_entry = _mesa_hash_table_search(
965 ctx->sel_cond_conversions, src[0].rpts[rpt]);
966 if (prev_entry) {
967 cond = prev_entry->data;
968 } else {
969 if (is_half(cond)) {
970 if (bs[0] == 8) {
971 /* Zero-extension of an 8-bit value has to be done through
972 * masking, as in create_cov.
973 */
974 struct ir3_instruction *mask =
975 create_immed_typed(b, 0xff, TYPE_U8);
976 cond = ir3_AND_B(b, cond, 0, mask, 0);
977 } else {
978 cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
979 }
980 } else {
981 cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
982 }
983 _mesa_hash_table_insert(ctx->sel_cond_conversions,
984 src[0].rpts[rpt], cond);
985 }
986 }
987 conds.rpts[rpt] = cond;
988 }
989
990 if (alu->op == nir_op_icsel_eqz) {
991 struct ir3_instruction_rpt tmp = src[1];
992 src[1] = src[2];
993 src[2] = tmp;
994 }
995
996 if (is_half(src[1].rpts[0]))
997 dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
998 else
999 dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
1000 break;
1001 }
1002
1003 case nir_op_bit_count: {
1004 if (ctx->compiler->gen < 5 ||
1005 (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
1006 dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
1007 break;
1008 }
1009
1010 // We need to do this 16b at a time on a5xx+a6xx. Once half-precision
1011 // support is in place, this should probably move to a NIR lowering pass:
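// Roughly: hi = cbits((src >> 16) & 0xffff), lo = cbits(src & 0xffff),
// dst = (u32)(hi + lo), with the intermediate values kept in half regs.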
1012 struct ir3_instruction_rpt hi, lo;
1013
1014 hi = ir3_COV_rpt(
1015 b, dst_sz,
1016 ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
1017 create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
1018 TYPE_U32, TYPE_U16);
1019 lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);
1020
1021 hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
1022 lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);
1023
1024 // TODO maybe the builders should default to making dst half-precision
1025 // if the srcs were half-precision, to make this less awkward... otoh
1026 // we should probably just do this lowering in NIR.
1027 set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
1028 set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);
1029
1030 dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
1031 set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
1032 dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
1033 break;
1034 }
1035 case nir_op_ifind_msb: {
1036 struct ir3_instruction_rpt cmp;
1037 dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
1038 cmp =
1039 ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
1040 create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
1041 set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
1042 dst = ir3_SEL_B32_rpt(
1043 b, dst_sz,
1044 ir3_SUB_U_rpt(b, dst_sz,
1045 create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
1046 dst, 0),
1047 0, cmp, 0, dst, 0);
1048 break;
1049 }
1050 case nir_op_ufind_msb:
1051 dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
1052 dst = ir3_SEL_B32_rpt(
1053 b, dst_sz,
1054 ir3_SUB_U_rpt(b, dst_sz,
1055 create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
1056 dst, 0),
1057 0, src[0], 0, dst, 0);
1058 break;
1059 case nir_op_find_lsb:
1060 dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
1061 dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
1062 break;
1063 case nir_op_bitfield_reverse:
1064 dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
1065 break;
1066
1067 case nir_op_uadd_sat:
1068 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
1069 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1070 break;
1071 case nir_op_iadd_sat:
1072 dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
1073 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1074 break;
1075 case nir_op_usub_sat:
1076 dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
1077 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1078 break;
1079 case nir_op_isub_sat:
1080 dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
1081 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1082 break;
1083 case nir_op_pack_64_2x32_split: {
1084 struct ir3_instruction *r0 = ir3_MOV(b, src[0].rpts[0], TYPE_U32);
1085 struct ir3_instruction *r1 = ir3_MOV(b, src[1].rpts[0], TYPE_U32);
1086 dst.rpts[0] = r0;
1087 dst.rpts[1] = r1;
1088 dst_sz = 2;
1089 break;
1090 }
1091 case nir_op_unpack_64_2x32_split_x: {
1092 ir3_split_dest(b, &dst.rpts[0], src[0].rpts[0], 0, 1);
1093 break;
1094 }
1095 case nir_op_unpack_64_2x32_split_y: {
1096 ir3_split_dest(b, &dst.rpts[0], src[0].rpts[0], 1, 1);
1097 break;
1098 }
1099 case nir_op_udot_4x8_uadd:
1100 case nir_op_udot_4x8_uadd_sat:
1101 case nir_op_sdot_4x8_iadd:
1102 case nir_op_sdot_4x8_iadd_sat:
1103 case nir_op_sudot_4x8_iadd:
1104 case nir_op_sudot_4x8_iadd_sat: {
1105 assert(dst_sz == 1);
1106
1107 struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
1108 src[2].rpts[0]};
1109
1110 if (ctx->compiler->has_dp4acc) {
1111 emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
1112 } else if (ctx->compiler->has_dp2acc) {
1113 emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
1114 } else {
1115 ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
1116 nir_op_infos[alu->op].name);
1117 }
1118
1119 break;
1120 }
1121
1122 default:
1123 ir3_context_error(ctx, "Unhandled ALU op: %s\n",
1124 nir_op_infos[alu->op].name);
1125 break;
1126 }
1127
1128 if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
1129 assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
1130 } else {
1131 /* 1-bit values stored in 32-bit registers are only valid for certain
1132 * ALU ops.
1133 */
1134 switch (alu->op) {
1135 case nir_op_mov:
1136 case nir_op_iand:
1137 case nir_op_ior:
1138 case nir_op_ixor:
1139 case nir_op_inot:
1140 case nir_op_bcsel:
1141 case nir_op_andg_ir3:
1142 break;
1143 default:
1144 compile_assert(ctx, alu->def.bit_size != 1);
1145 }
1146 }
1147
1148 cp_instrs(def, dst.rpts, dst_sz);
1149 ir3_put_def(ctx, &alu->def);
1150 }
1151
1152 static void
1153 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1154 struct ir3_instruction **dst)
1155 {
1156 struct ir3_builder *b = &ctx->build;
1157
1158 /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
1159 * 0.
1160 */
1161 assert(nir_intrinsic_base(intr) == 0);
1162
1163 unsigned ncomp = intr->num_components;
1164 struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
1165 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
1166 struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
1167 ldc->dsts[0]->wrmask = MASK(ncomp);
1168 ldc->cat6.iim_val = ncomp;
1169 ldc->cat6.d = nir_intrinsic_component(intr);
1170 ldc->cat6.type = utype_def(&intr->def);
1171
1172 ir3_handle_bindless_cat6(ldc, intr->src[0]);
1173 if (ldc->flags & IR3_INSTR_B)
1174 ctx->so->bindless_ubo = true;
1175 ir3_handle_nonuniform(ldc, intr);
1176
1177 if (!intr->def.divergent &&
1178 ctx->compiler->has_scalar_alu) {
1179 ldc->dsts[0]->flags |= IR3_REG_SHARED;
1180 ldc->flags |= IR3_INSTR_U;
1181 }
1182
1183 ir3_split_dest(b, dst, ldc, 0, ncomp);
1184 }
1185
1186 static void
1187 emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
1188 nir_intrinsic_instr *intr)
1189 {
1190 struct ir3_builder *b = &ctx->build;
1191
1192 unsigned base = nir_intrinsic_base(intr);
1193 unsigned size = nir_intrinsic_range(intr);
1194
1195 struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);
1196
1197 struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
1198 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
1199 struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
1200 ldc->cat6.iim_val = size;
1201 ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;
1202
1203 ir3_handle_bindless_cat6(ldc, intr->src[0]);
1204 if (ldc->flags & IR3_INSTR_B)
1205 ctx->so->bindless_ubo = true;
1206
1207 ir3_instr_set_address(ldc, addr1);
1208
1209 /* The assembler isn't aware of what value a1.x has, so make sure that
1210 * constlen includes the ldc.k here.
1211 */
1212 ctx->so->constlen =
1213 MAX2(ctx->so->constlen, DIV_ROUND_UP(base + size * 4, 4));
1214
1215 array_insert(ctx->block, ctx->block->keeps, ldc);
1216 }
1217
1218 static void
1219 emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
1220 nir_intrinsic_instr *intr)
1221 {
1222 struct ir3_builder *b = &ctx->build;
1223
1224 unsigned size = nir_intrinsic_range(intr);
1225 unsigned dst = nir_intrinsic_range_base(intr);
1226 unsigned addr_offset = nir_intrinsic_base(intr);
1227 unsigned dst_lo = dst & 0xff;
1228 unsigned dst_hi = dst >> 8;
1229
1230 struct ir3_instruction *a1 = NULL;
1231 if (dst_hi)
1232 a1 = ir3_get_addr1(ctx, dst_hi << 8);
1233
1234 struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
1235 struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
1236 struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
1237 struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
1238 create_immed(b, addr_offset), 0,
1239 create_immed(b, size), 0);
1240 ldg->barrier_class = ldg->barrier_conflict = IR3_BARRIER_CONST_W;
1241 ldg->cat6.type = TYPE_U32;
1242
1243 if (a1) {
1244 ir3_instr_set_address(ldg, a1);
1245 ldg->flags |= IR3_INSTR_A1EN;
1246 }
1247
1248 /* The assembler isn't aware of what value a1.x has, so make sure that
1249 * constlen includes the ldg.k here.
1250 */
1251 ctx->so->constlen =
1252 MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + size * 4, 4));
1253
1254 array_insert(ctx->block, ctx->block->keeps, ldg);
1255 }
1256
1257
1258 /* handles direct/indirect UBO reads: */
1259 static void
1260 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1261 struct ir3_instruction **dst)
1262 {
1263 struct ir3_builder *b = &ctx->build;
1264 struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
1265 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1266 unsigned ubo = ir3_const_reg(const_state, IR3_CONST_ALLOC_UBO_PTRS, 0);
1267 const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
1268
1269 int off = 0;
1270
1271 /* First src is ubo index, which could either be an immed or not: */
1272 src0 = ir3_get_src(ctx, &intr->src[0])[0];
1273 if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1274 base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
1275 base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
1276 } else {
1277 base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
1278 ir3_get_addr0(ctx, src0, ptrsz));
1279 base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
1280 ir3_get_addr0(ctx, src0, ptrsz));
1281
1282 /* NOTE: since relative addressing is used, make sure constlen is
1283 * at least big enough to cover all the UBO addresses, since the
1284 * assembler won't know what the max address reg is.
1285 */
1286 ctx->so->constlen = MAX2(
1287 ctx->so->constlen,
1288 const_state->allocs.consts[IR3_CONST_ALLOC_UBO_PTRS].offset_vec4 +
1289 (ctx->s->info.num_ubos * ptrsz));
1290 }
1291
1292 /* note: on 32bit gpu's base_hi is ignored and DCE'd */
1293 addr = base_lo;
1294
1295 if (nir_src_is_const(intr->src[1])) {
1296 off += nir_src_as_uint(intr->src[1]);
1297 } else {
1298 /* For load_ubo_indirect, second src is indirect offset: */
1299 src1 = ir3_get_src(ctx, &intr->src[1])[0];
1300
1301 /* and add offset to addr: */
1302 addr = ir3_ADD_S(b, addr, 0, src1, 0);
1303 }
1304
1305 /* if offset is too large to encode in the ldg, split it out: */
1306 if ((off + (intr->num_components * 4)) > 1024) {
1307 /* split out the minimal amount to improve the odds that
1308 * cp can fit the immediate in the add.s instruction:
1309 */
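/* E.g. with 4 components and off = 1040: off2 = 1040 + 16 - 1024 = 32, so
 * the add.s gets the small immediate 32 and the remaining ldg offsets
 * (1008..1020) still fit in the encoding.
 */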
1310 unsigned off2 = off + (intr->num_components * 4) - 1024;
1311 addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1312 off -= off2;
1313 }
1314
1315 if (ptrsz == 2) {
1316 struct ir3_instruction *carry;
1317
1318 /* handle 32b rollover, ie:
1319 * if (addr < base_lo)
1320 * base_hi++
1321 */
1322 carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
1323 carry->cat2.condition = IR3_COND_LT;
1324 base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
1325
1326 addr = ir3_collect(b, addr, base_hi);
1327 }
1328
1329 for (int i = 0; i < intr->num_components; i++) {
1330 struct ir3_instruction *load =
1331 ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
1332 create_immed(b, 1), 0); /* num components */
1333 load->cat6.type = TYPE_U32;
1334 dst[i] = load;
1335 }
1336 }
1337
1338 /* Load a kernel param: src[] = { address }. */
1339 static void
1340 emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
1341 nir_intrinsic_instr *intr,
1342 struct ir3_instruction **dst)
1343 {
1344 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1345 struct ir3_builder *b = &ctx->build;
1346 unsigned offset = nir_intrinsic_base(intr);
1347 unsigned p = ir3_const_reg(const_state, IR3_CONST_ALLOC_KERNEL_PARAMS, 0);
1348
1349 struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
1350
1351 if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1352 offset += src0->srcs[0]->iim_val;
1353
1354 /* kernel param position is in bytes, but constant space is 32b registers: */
1355 compile_assert(ctx, !(offset & 0x3));
1356
1357 dst[0] = create_uniform(b, p + (offset / 4));
1358 } else {
1359 /* kernel param position is in bytes, but constant space is 32b registers: */
1360 compile_assert(ctx, !(offset & 0x3));
1361
1362 /* TODO we should probably be lowering this in nir, and also handling
1363 * non-32b inputs.. Also we probably don't want to be using
1364 * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
1365 */
1366 src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);
1367
1368 dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
1369 ir3_get_addr0(ctx, src0, 1));
1370 }
1371 }
1372
1373 /* src[] = { block_index } */
1374 static void
1375 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1376 struct ir3_instruction **dst)
1377 {
1378 struct ir3_builder *b = &ctx->build;
1379 struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
1380 struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
1381 resinfo->cat6.iim_val = 1;
1382 resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
1383 resinfo->cat6.type = TYPE_U32;
1384 resinfo->cat6.typed = false;
1385 /* resinfo has no writemask and always writes out 3 components */
1386 resinfo->dsts[0]->wrmask = MASK(3);
1387 ir3_handle_bindless_cat6(resinfo, intr->src[0]);
1388 ir3_handle_nonuniform(resinfo, intr);
1389
1390 if (ctx->compiler->gen >= 6) {
1391 ir3_split_dest(b, dst, resinfo, 0, 1);
1392 } else {
1393 /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
1394 struct ir3_instruction *resinfo_dst[2];
1395 ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
1396 *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
1397 }
1398 }
1399
1400 /* src[] = { offset }. const_index[] = { base } */
1401 static void
1402 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1403 struct ir3_instruction **dst)
1404 {
1405 struct ir3_builder *b = &ctx->build;
1406 struct ir3_instruction *ldl, *offset;
1407 unsigned base;
1408
1409 offset = ir3_get_src(ctx, &intr->src[0])[0];
1410 base = nir_intrinsic_base(intr);
1411
1412 ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
1413 create_immed(b, intr->num_components), 0);
1414
1415 ldl->cat6.type = utype_def(&intr->def);
1416 ldl->dsts[0]->wrmask = MASK(intr->num_components);
1417
1418 ldl->barrier_class = IR3_BARRIER_SHARED_R;
1419 ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
1420
1421 ir3_split_dest(b, dst, ldl, 0, intr->num_components);
1422 }
1423
1424 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
1425 static void
1426 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1427 {
1428 struct ir3_builder *b = &ctx->build;
1429 struct ir3_instruction *stl, *offset;
1430 struct ir3_instruction *const *value;
1431 unsigned base, wrmask, ncomp;
1432
1433 value = ir3_get_src(ctx, &intr->src[0]);
1434 offset = ir3_get_src(ctx, &intr->src[1])[0];
1435
1436 base = nir_intrinsic_base(intr);
1437 wrmask = nir_intrinsic_write_mask(intr);
1438 ncomp = ffs(~wrmask) - 1;
1439
1440 assert(wrmask == BITFIELD_MASK(intr->num_components));
1441
1442 stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1443 create_immed(b, ncomp), 0);
1444 stl->cat6.dst_offset = base;
1445 stl->cat6.type = utype_src(intr->src[0]);
1446 stl->barrier_class = IR3_BARRIER_SHARED_W;
1447 stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1448
1449 array_insert(ctx->block, ctx->block->keeps, stl);
1450 }
1451
1452 /* src[] = { offset }. const_index[] = { base } */
1453 static void
1454 emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
1455 nir_intrinsic_instr *intr,
1456 struct ir3_instruction **dst)
1457 {
1458 struct ir3_builder *b = &ctx->build;
1459 struct ir3_instruction *load, *offset;
1460 unsigned base;
1461
1462 offset = ir3_get_src(ctx, &intr->src[0])[0];
1463 base = nir_intrinsic_base(intr);
1464
1465 load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
1466 create_immed(b, intr->num_components), 0);
1467
1468 /* for a650, use LDL for tess ctrl inputs: */
1469 if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
1470 load->opc = OPC_LDL;
1471
1472 load->cat6.type = utype_def(&intr->def);
1473 load->dsts[0]->wrmask = MASK(intr->num_components);
1474
1475 load->barrier_class = IR3_BARRIER_SHARED_R;
1476 load->barrier_conflict = IR3_BARRIER_SHARED_W;
1477
1478 ir3_split_dest(b, dst, load, 0, intr->num_components);
1479 }
1480
1481 /* src[] = { value, offset }. const_index[] = { base } */
1482 static void
1483 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
1484 nir_intrinsic_instr *intr)
1485 {
1486 struct ir3_builder *b = &ctx->build;
1487 struct ir3_instruction *store, *offset;
1488 struct ir3_instruction *const *value;
1489
1490 value = ir3_get_src(ctx, &intr->src[0]);
1491 offset = ir3_get_src(ctx, &intr->src[1])[0];
1492
1493 store = ir3_STLW(b, offset, 0,
1494 ir3_create_collect(b, value, intr->num_components), 0,
1495 create_immed(b, intr->num_components), 0);
1496
1497 /* for a650, use STL for vertex outputs used by tess ctrl shader: */
1498 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
1499 ctx->compiler->tess_use_shared)
1500 store->opc = OPC_STL;
1501
1502 store->cat6.dst_offset = nir_intrinsic_base(intr);
1503 store->cat6.type = utype_src(intr->src[0]);
1504 store->barrier_class = IR3_BARRIER_SHARED_W;
1505 store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1506
1507 array_insert(ctx->block, ctx->block->keeps, store);
1508 }
1509
1510 /*
1511 * CS shared variable atomic intrinsics
1512 *
1513 * All of the shared variable atomic memory operations read a value from
1514 * memory, compute a new value using one of the operations below, write the
1515 * new value to memory, and return the original value read.
1516 *
1517 * All operations take 2 sources except CompSwap that takes 3. These
1518 * sources represent:
1519 *
1520 * 0: The offset into the shared variable storage region that the atomic
1521 * operation will operate on.
1522 * 1: The data parameter to the atomic function (i.e. the value to add
1523 * in, etc).
1524 * 2: For CompSwap only: the second data parameter.
1525 */
1526 static struct ir3_instruction *
1527 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1528 {
1529 struct ir3_builder *b = &ctx->build;
1530 struct ir3_instruction *atomic, *src0, *src1;
1531 type_t type = TYPE_U32;
1532
1533 src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1534 src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1535
1536 switch (nir_intrinsic_atomic_op(intr)) {
1537 case nir_atomic_op_iadd:
1538 atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1539 break;
1540 case nir_atomic_op_imin:
1541 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1542 type = TYPE_S32;
1543 break;
1544 case nir_atomic_op_umin:
1545 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1546 break;
1547 case nir_atomic_op_imax:
1548 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1549 type = TYPE_S32;
1550 break;
1551 case nir_atomic_op_umax:
1552 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1553 break;
1554 case nir_atomic_op_iand:
1555 atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1556 break;
1557 case nir_atomic_op_ior:
1558 atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1559 break;
1560 case nir_atomic_op_ixor:
1561 atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1562 break;
1563 case nir_atomic_op_xchg:
1564 atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1565 break;
1566 case nir_atomic_op_cmpxchg:
1567 /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1568 src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1569 atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1570 break;
1571 default:
1572 unreachable("boo");
1573 }
1574
1575 atomic->cat6.iim_val = 1;
1576 atomic->cat6.d = 1;
1577 atomic->cat6.type = type;
1578 atomic->barrier_class = IR3_BARRIER_SHARED_W;
1579 atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1580
1581 /* even if nothing consumes the result, we can't DCE the instruction: */
1582 array_insert(ctx->block, ctx->block->keeps, atomic);
1583
1584 return atomic;
1585 }
1586
1587 static void
1588 stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
1589 struct ir3_instruction **offset, int32_t *base)
1590 {
1591 struct ir3_builder *b = &ctx->build;
1592
1593 if (nir_src_is_const(*src)) {
1594 unsigned src_offset = nir_src_as_uint(*src);
1595 /* The base offset field is only 13 bits, and it's signed. Try to make the
1596 * offset constant whenever the original offsets are similar, to avoid
1597 * creating too many constants in the final shader.
1598 */
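/* E.g. src_offset values 5000 and 5004 both produce offset_val = 8192
 * (with base = -3192 and -3188 respectively), so nearby offsets can share
 * one immediate.
 */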
1599 *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
1600 uint32_t offset_val = src_offset - *base;
1601 *offset = create_immed(b, offset_val);
1602 } else {
1603 /* TODO: match on nir_iadd with a constant that fits */
1604 *base = 0;
1605 *offset = ir3_get_src(ctx, src)[0];
1606 }
1607 }
1608
1609 /* src[] = { offset }. */
1610 static void
1611 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1612 struct ir3_instruction **dst)
1613 {
1614 struct ir3_builder *b = &ctx->build;
1615 struct ir3_instruction *ldp, *offset;
1616 int32_t base;
1617
1618 stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
1619
1620 ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
1621 create_immed(b, intr->num_components), 0);
1622
1623 ldp->cat6.type = utype_def(&intr->def);
1624 ldp->dsts[0]->wrmask = MASK(intr->num_components);
1625
1626 ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1627 ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1628
1629 ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1630 }
1631
1632 /* src[] = { value, offset }. const_index[] = { write_mask } */
1633 static void
1634 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1635 {
1636 struct ir3_builder *b = &ctx->build;
1637 struct ir3_instruction *stp, *offset;
1638 struct ir3_instruction *const *value;
1639 unsigned wrmask, ncomp;
1640 int32_t base;
1641
1642 value = ir3_get_src(ctx, &intr->src[0]);
1643
1644 stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
1645
1646 wrmask = nir_intrinsic_write_mask(intr);
1647 ncomp = ffs(~wrmask) - 1;
1648
1649 assert(wrmask == BITFIELD_MASK(intr->num_components));
1650
1651 stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1652 create_immed(b, ncomp), 0);
1653 stp->cat6.dst_offset = base;
1654 stp->cat6.type = utype_src(intr->src[0]);
1655 stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1656 stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1657
1658 array_insert(ctx->block, ctx->block->keeps, stp);
1659 }
1660
1661 struct tex_src_info {
1662 /* For prefetch */
1663 unsigned tex_base, samp_base, tex_idx, samp_idx;
1664 /* For normal tex instructions */
1665 unsigned base, a1_val, flags;
1666 struct ir3_instruction *samp_tex;
1667 };
1668
1669 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1670 * to handle with the image_mapping table..
1671 */
1672 static struct tex_src_info
1673 get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src, bool image)
1674 {
1675 struct ir3_builder *b = &ctx->build;
1676 struct tex_src_info info = {0};
1677 nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);
1678
1679 if (bindless_tex) {
1680 /* Bindless case */
1681 ctx->so->bindless_tex = true;
1682 info.flags |= IR3_INSTR_B;
1683
1684 /* Gather information required to determine which encoding to
1685 * choose as well as for prefetch.
1686 */
1687 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1688 bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1689 if (tex_const)
1690 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1691 info.samp_idx = 0;
1692
1693 /* Choose encoding. */
1694 if (tex_const && info.tex_idx < 256) {
1695 if (info.tex_idx < 16) {
1696 /* Everything fits within the instruction */
1697 info.base = info.tex_base;
1698 } else {
1699 info.base = info.tex_base;
1700 if (ctx->compiler->gen <= 6) {
1701 info.a1_val = info.tex_idx << 3;
1702 } else {
1703 info.a1_val = info.samp_idx << 3;
1704 }
1705 info.flags |= IR3_INSTR_A1EN;
1706 }
1707 info.samp_tex = NULL;
1708 } else {
1709 info.flags |= IR3_INSTR_S2EN;
1710 info.base = info.tex_base;
1711
1712 /* Note: the indirect source is now a vec2 instead of hvec2 */
1713 struct ir3_instruction *texture, *sampler;
1714
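/* images/SSBOs have no sampler state, so a zero sampler index is collected
 * below purely as a placeholder.
 */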
1715 texture = ir3_get_src(ctx, src)[0];
1716 sampler = create_immed(b, 0);
1717 info.samp_tex = ir3_collect(b, texture, sampler);
1718 }
1719 } else {
1720 info.flags |= IR3_INSTR_S2EN;
1721 unsigned slot = nir_src_as_uint(*src);
1722 unsigned tex_idx = image ?
1723 ir3_image_to_tex(&ctx->so->image_mapping, slot) :
1724 ir3_ssbo_to_tex(&ctx->so->image_mapping, slot);
1725 struct ir3_instruction *texture, *sampler;
1726
1727 ctx->so->num_samp = MAX2(ctx->so->num_samp, tex_idx + 1);
1728
1729 texture = create_immed_typed(b, tex_idx, TYPE_U16);
1730 sampler = create_immed_typed(b, tex_idx, TYPE_U16);
1731
1732 info.samp_tex = ir3_collect(b, texture, sampler);
1733 }
1734
1735 return info;
1736 }
1737
1738 static struct ir3_instruction *
1739 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1740 type_t type, unsigned wrmask, struct ir3_instruction *src0,
1741 struct ir3_instruction *src1)
1742 {
1743 struct ir3_instruction *sam, *addr;
1744 if (info.flags & IR3_INSTR_A1EN) {
1745 addr = ir3_get_addr1(ctx, info.a1_val);
1746 }
1747 sam = ir3_SAM(&ctx->build, opc, type, wrmask, info.flags, info.samp_tex,
1748 src0, src1);
1749 if (info.flags & IR3_INSTR_A1EN) {
1750 ir3_instr_set_address(sam, addr);
1751 }
1752 if (info.flags & IR3_INSTR_B) {
1753 sam->cat5.tex_base = info.base;
1754 sam->cat5.samp = info.samp_idx;
1755 sam->cat5.tex = info.tex_idx;
1756 }
1757 return sam;
1758 }
1759
1760 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1761 static void
1762 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1763 struct ir3_instruction **dst)
1764 {
1765 /* If the image can be written, we must use LDIB to retrieve data, rather than
1766 * ISAM (which goes through the texture cache and won't observe previous writes).
1767 */
1768 if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
1769 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1770 return;
1771 }
1772
1773 /* The sparse set of texture descriptors for non-coherent load_images means we can't do indirection, so
1774 * fall back to coherent load.
1775 */
1776 if (ctx->compiler->gen >= 5 &&
1777 !ir3_bindless_resource(intr->src[0]) &&
1778 !nir_src_is_const(intr->src[0])) {
1779 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1780 return;
1781 }
1782
1783 struct ir3_builder *b = &ctx->build;
1784 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1785 struct ir3_instruction *sam;
1786 struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1787 struct ir3_instruction *coords[4];
1788 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1789 type_t type = ir3_get_type_for_image_intrinsic(intr);
1790
1791 info.flags |= flags;
1792
1793 /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
1794 * y coord. Note that the array index must come after the fake y coord.
1795 */
1796 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1797 if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
1798 coords[0] = src0[0];
1799 coords[1] = create_immed(b, 0);
1800 for (unsigned i = 1; i < ncoords; i++)
1801 coords[i + 1] = src0[i];
1802 ncoords++;
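/* e.g. a 1D array image ends up with coords (x, 0, layer) and ncoords = 3 */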
1803 } else {
1804 for (unsigned i = 0; i < ncoords; i++)
1805 coords[i] = src0[i];
1806 }
1807
1808 sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1809 ir3_create_collect(b, coords, ncoords), NULL);
1810
1811 ir3_handle_nonuniform(sam, intr);
1812
1813 sam->barrier_class = IR3_BARRIER_IMAGE_R;
1814 sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1815
1816 ir3_split_dest(b, dst, sam, 0, 4);
1817 }
1818
1819 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1820 void
1821 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1822 nir_intrinsic_instr *intr,
1823 struct ir3_instruction **dst)
1824 {
1825 struct ir3_builder *b = &ctx->build;
1826 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1827 struct ir3_instruction *sam, *lod;
1828 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1829 type_t dst_type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
1830
1831 info.flags |= flags;
1832 assert(nir_src_as_uint(intr->src[1]) == 0);
1833 lod = create_immed(b, 0);
1834 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1835
1836 /* Array size actually ends up in .w rather than .z. This doesn't
1837 * matter for miplevel 0, but for higher mips the value in z is
1838 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1839 * returned, which means that we have to add 1 to it for arrays for
1840 * a3xx.
1841 *
1842 * Note: use a temporary dst and then copy, since the size of the dst
1843 * array that is passed in is based on nir's understanding of the
1844 * result size, not the hardware's.
1845 */
1846 struct ir3_instruction *tmp[4];
1847
1848 ir3_split_dest(b, tmp, sam, 0, 4);
1849
1850 for (unsigned i = 0; i < ncoords; i++)
1851 dst[i] = tmp[i];
1852
1853 if (flags & IR3_INSTR_A) {
1854 if (ctx->compiler->levels_add_one) {
1855 dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1856 } else {
1857 dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1858 }
1859 }
1860 }
1861
1862 static struct tex_src_info
1863 get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
1864 nir_src *samp)
1865 {
1866 struct ir3_builder *b = &ctx->build;
1867 struct tex_src_info info = {0};
1868
1869 info.flags |= IR3_INSTR_B;
1870
1871 /* Gather information required to determine which encoding to
1872 * choose as well as for prefetch.
1873 */
1874 nir_intrinsic_instr *bindless_tex = NULL;
1875 bool tex_const;
1876 if (tex) {
1877 ctx->so->bindless_tex = true;
1878 bindless_tex = ir3_bindless_resource(*tex);
1879 assert(bindless_tex);
1880 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1881 tex_const = nir_src_is_const(bindless_tex->src[0]);
1882 if (tex_const)
1883 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1884 } else {
1885 /* To simplify some of the logic below, assume the index is
1886 * constant 0 when it's not enabled.
1887 */
1888 tex_const = true;
1889 info.tex_idx = 0;
1890 }
1891 nir_intrinsic_instr *bindless_samp = NULL;
1892 bool samp_const;
1893 if (samp) {
1894 ctx->so->bindless_samp = true;
1895 bindless_samp = ir3_bindless_resource(*samp);
1896 assert(bindless_samp);
1897 info.samp_base = nir_intrinsic_desc_set(bindless_samp);
1898 samp_const = nir_src_is_const(bindless_samp->src[0]);
1899 if (samp_const)
1900 info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
1901 } else {
1902 samp_const = true;
1903 info.samp_idx = 0;
1904 }
1905
1906 /* Choose encoding. */
1907 if (tex_const && samp_const && info.tex_idx < 256 &&
1908 info.samp_idx < 256) {
1909 if (info.tex_idx < 16 && info.samp_idx < 16 &&
1910 (!bindless_tex || !bindless_samp ||
1911 info.tex_base == info.samp_base)) {
1912 /* Everything fits within the instruction */
1913 info.base = info.tex_base;
1914 } else {
1915 info.base = info.tex_base;
1916 if (ctx->compiler->gen <= 6) {
1917 info.a1_val = info.tex_idx << 3 | info.samp_base;
1918 } else {
1919 info.a1_val = info.samp_idx << 3 | info.samp_base;
1920 }
1921
1922 info.flags |= IR3_INSTR_A1EN;
1923 }
1924 info.samp_tex = NULL;
1925 } else {
1926 info.flags |= IR3_INSTR_S2EN;
1927 /* In the indirect case, we only use a1.x to store the sampler
1928 * base if it differs from the texture base.
1929 */
1930 if (!bindless_tex || !bindless_samp ||
1931 info.tex_base == info.samp_base) {
1932 info.base = info.tex_base;
1933 } else {
1934 info.base = info.tex_base;
1935 info.a1_val = info.samp_base;
1936 info.flags |= IR3_INSTR_A1EN;
1937 }
1938
1939 /* Note: the indirect source is now a vec2 instead of hvec2
1940 */
1941 struct ir3_instruction *texture, *sampler;
1942
1943 if (bindless_tex) {
1944 texture = ir3_get_src(ctx, tex)[0];
1945 } else {
1946 texture = create_immed(b, 0);
1947 }
1948
1949 if (bindless_samp) {
1950 sampler = ir3_get_src(ctx, samp)[0];
1951 } else {
1952 sampler = create_immed(b, 0);
1953 }
1954 info.samp_tex = ir3_collect(b, texture, sampler);
1955 }
1956
1957 return info;
1958 }
1959
1960 /* src[] = { buffer_index, offset }. No const_index */
1961 static void
1962 emit_intrinsic_load_ssbo(struct ir3_context *ctx,
1963 nir_intrinsic_instr *intr,
1964 struct ir3_instruction **dst)
1965 {
1966 /* Note: we can only use isam for vectorized loads/stores if isam.v is
1967 * available.
1968 * Note: isam also can't handle 8-bit loads.
1969 */
1970 if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
1971 (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
1972 (ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
1973 !ctx->compiler->has_isam_ssbo) {
1974 ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1975 return;
1976 }
1977
1978 struct ir3_builder *b = &ctx->build;
1979 nir_src *offset_src = &intr->src[2];
1980 struct ir3_instruction *coords = NULL;
1981 unsigned imm_offset = 0;
1982
1983 if (ctx->compiler->has_isam_v) {
1984 ir3_lower_imm_offset(ctx, intr, offset_src, 8, &coords, &imm_offset);
1985 } else {
1986 coords =
1987 ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0));
1988 }
1989
1990 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
1991
1992 unsigned num_components = intr->def.num_components;
1993 assert(num_components == 1 || ctx->compiler->has_isam_v);
1994
1995 struct ir3_instruction *sam =
1996 emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
1997 MASK(num_components), coords, create_immed(b, imm_offset));
1998
1999 if (ctx->compiler->has_isam_v) {
2000 sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
2001
2002 if (imm_offset) {
2003 sam->flags |= IR3_INSTR_IMM_OFFSET;
2004 }
2005 }
2006
2007 ir3_handle_nonuniform(sam, intr);
2008
2009 sam->barrier_class = IR3_BARRIER_BUFFER_R;
2010 sam->barrier_conflict = IR3_BARRIER_BUFFER_W;
2011
2012 ir3_split_dest(b, dst, sam, 0, num_components);
2013 }
2014
2015 static void
2016 emit_control_barrier(struct ir3_context *ctx)
2017 {
2018 /* Hull shaders dispatch 32 wide so an entire patch will always
2019 * fit in a single warp and execute in lock-step. Consequently,
2020 * we don't need to do anything for TCS barriers; emitting a
2021 * barrier instruction there would deadlock.
2022 */
2023 if (ctx->so->type == MESA_SHADER_TESS_CTRL)
2024 return;
2025
2026 struct ir3_builder *b = &ctx->build;
2027 struct ir3_instruction *barrier = ir3_BAR(b);
2028 barrier->cat7.g = true;
2029 if (ctx->compiler->gen < 6)
2030 barrier->cat7.l = true;
2031 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
2032 barrier->barrier_class = IR3_BARRIER_EVERYTHING;
2033 array_insert(ctx->block, ctx->block->keeps, barrier);
2034
2035 ctx->so->has_barrier = true;
2036 }
2037
2038 static void
2039 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2040 {
2041 struct ir3_builder *b = &ctx->build;
2042 struct ir3_instruction *barrier;
2043
2044 /* TODO: find out why there is a major difference in .l usage
2045 * between a5xx and a6xx.
2046 */
2047
2048 mesa_scope exec_scope = nir_intrinsic_execution_scope(intr);
2049 mesa_scope mem_scope = nir_intrinsic_memory_scope(intr);
2050 nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2051 /* loads/stores are always cache-coherent so we can filter out
2052 * available/visible.
2053 */
2054 nir_memory_semantics semantics =
2055 nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
2056 NIR_MEMORY_RELEASE);
2057
2058 if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
2059 /* Remove mode corresponding to TCS patch barriers because hull shaders
2060 * dispatch 32 wide so an entire patch will always fit in a single warp
2061 * and execute in lock-step.
2062 *
2063 * TODO: a memory barrier also tells us not to reorder stores; this
2064 * information is lost here (the backend doesn't reorder stores, so
2065 * we are safe for now).
2066 */
2067 modes &= ~nir_var_shader_out;
2068 }
2069
2070 assert(!(modes & nir_var_shader_out));
2071
2072 if ((modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
2073 nir_var_image)) && semantics) {
2074 barrier = ir3_FENCE(b);
2075 barrier->cat7.r = true;
2076 barrier->cat7.w = true;
2077
2078 if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
2079 barrier->cat7.g = true;
2080 }
2081
2082 if (ctx->compiler->gen >= 6) {
2083 if (modes & (nir_var_mem_ssbo | nir_var_image)) {
2084 barrier->cat7.l = true;
2085 }
2086 } else {
2087 if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
2088 barrier->cat7.l = true;
2089 }
2090 }
2091
2092 barrier->barrier_class = 0;
2093 barrier->barrier_conflict = 0;
2094
2095 if (modes & nir_var_mem_shared) {
2096 barrier->barrier_class |= IR3_BARRIER_SHARED_W;
2097 barrier->barrier_conflict |=
2098 IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
2099 }
2100
2101 if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
2102 barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
2103 barrier->barrier_conflict |=
2104 IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
2105 }
2106
2107 if (modes & nir_var_image) {
2108 barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
2109 barrier->barrier_conflict |=
2110 IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
2111 }
2112
2113 /* make sure barrier doesn't get DCE'd */
2114 array_insert(ctx->block, ctx->block->keeps, barrier);
2115
2116 if (ctx->compiler->gen >= 7 && mem_scope > SCOPE_WORKGROUP &&
2117 modes & (nir_var_mem_ssbo | nir_var_image) &&
2118 semantics & NIR_MEMORY_ACQUIRE) {
2119 /* "r + l" is not enough to synchronize reads with writes from other
2120 * workgroups, so we can disable them since they are useless here.
2121 */
2122 barrier->cat7.r = false;
2123 barrier->cat7.l = false;
2124
2125 struct ir3_instruction *ccinv = ir3_CCINV(b);
2126 /* A7XX TODO: ccinv should just stick to the barrier;
2127 * the barrier class/conflict introduces unnecessary waits.
2128 */
2129 ccinv->barrier_class = barrier->barrier_class;
2130 ccinv->barrier_conflict = barrier->barrier_conflict;
2131 array_insert(ctx->block, ctx->block->keeps, ccinv);
2132 }
2133 }
2134
2135 if (exec_scope >= SCOPE_WORKGROUP) {
2136 emit_control_barrier(ctx);
2137 }
2138 }
2139
2140 static void
2141 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
2142 unsigned compmask, struct ir3_instruction *instr)
2143 {
2144 struct ir3_shader_variant *so = ctx->so;
2145 unsigned n = so->inputs_count++;
2146
2147 assert(instr->opc == OPC_META_INPUT);
2148 instr->input.inidx = n;
2149 instr->input.sysval = slot;
2150
2151 so->inputs[n].sysval = true;
2152 so->inputs[n].slot = slot;
2153 so->inputs[n].compmask = compmask;
2154 so->total_in++;
2155
2156 so->sysval_in += util_last_bit(compmask);
2157 }
2158
2159 static struct ir3_instruction *
2160 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
2161 unsigned compmask)
2162 {
2163 assert(compmask);
2164 struct ir3_instruction *sysval = create_input(ctx, compmask);
2165 add_sysval_input_compmask(ctx, slot, compmask, sysval);
2166 return sysval;
2167 }
2168
2169 static struct ir3_instruction *
2170 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
2171 {
2172 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
2173 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
2174 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
2175 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
2176 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
2177 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
2178 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
2179 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
2180 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
2181 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
2182 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
2183 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
2184 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
2185 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
2186
2187 if (!ctx->ij[bary]) {
2188 struct ir3_instruction *xy[2];
2189 struct ir3_instruction *ij;
2190 struct ir3_builder build =
2191 ir3_builder_at(ir3_before_terminator(ctx->in_block));
2192
2193 ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
2194 bary, 0x3);
2195 ir3_split_dest(&build, xy, ij, 0, 2);
2196
2197 ctx->ij[bary] = ir3_create_collect(&build, xy, 2);
2198 }
2199
2200 return ctx->ij[bary];
2201 }
2202
2203 /* TODO: make this a common NIR helper?
2204 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
2205 * it can't be extended to work with this
2206 */
2207 static gl_system_value
2208 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
2209 {
2210 enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
2211 gl_system_value sysval;
2212
2213 switch (intr->intrinsic) {
2214 case nir_intrinsic_load_barycentric_pixel:
2215 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2216 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2217 else
2218 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2219 break;
2220 case nir_intrinsic_load_barycentric_centroid:
2221 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2222 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
2223 else
2224 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
2225 break;
2226 case nir_intrinsic_load_barycentric_sample:
2227 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2228 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
2229 else
2230 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
2231 break;
2232 default:
2233 unreachable("invalid barycentric intrinsic");
2234 }
2235
2236 return sysval;
2237 }
2238
2239 static void
2240 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
2241 struct ir3_instruction **dst)
2242 {
2243 gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
2244
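/* Without MSAA enabled (and before a6xx), sample/centroid interpolation
 * degrades to the plain pixel-center barycentrics below.
 */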
2245 if (!ctx->so->key.msaa && ctx->compiler->gen < 6) {
2246 switch (sysval) {
2247 case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
2248 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2249 break;
2250 case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
2251 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2252 break;
2253 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
2254 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2255 break;
2256 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
2257 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2258 break;
2259 default:
2260 break;
2261 }
2262 }
2263
2264 enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2265
2266 struct ir3_instruction *ij = get_barycentric(ctx, bary);
2267 ir3_split_dest(&ctx->build, dst, ij, 0, 2);
2268 }
2269
2270 static struct ir3_instruction *
2271 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2272 {
2273 if (!ctx->frag_coord) {
2274 struct ir3_block *block = ir3_after_preamble(ctx->ir);
2275 struct ir3_builder b = ir3_builder_at(ir3_before_terminator(block));
2276 struct ir3_instruction_rpt xyzw;
2277 struct ir3_instruction *hw_frag_coord;
2278
2279 hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
2280 ir3_split_dest(&b, xyzw.rpts, hw_frag_coord, 0, 4);
2281
2282 /* for frag_coord.xy, the hw gives us unsigned values in units of
2283 * 1/16th of a pixel. Convert to float and multiply by 1.0/16.0 to
2284 * recover the pixel coordinate:
2285 *
2286 * cov.u32f32 tmp, src
2287 * mul.f dst, tmp, (1.0 / 16.0)
2288 *
2289 *
2290 */
2291 struct ir3_instruction_rpt xy =
2292 ir3_COV_rpt(&b, 2, xyzw, TYPE_U32, TYPE_F32);
2293 xy = ir3_MUL_F_rpt(&b, 2, xy, 0, create_immed_rpt(&b, 2, fui(1.0 / 16.0)),
2294 0);
2295 cp_instrs(xyzw.rpts, xy.rpts, 2);
2296 ctx->frag_coord = ir3_create_collect(&b, xyzw.rpts, 4);
2297 }
2298
2299 ctx->so->fragcoord_compmask |= nir_def_components_read(&intr->def);
2300
2301 return ctx->frag_coord;
2302 }
2303
2304 /* This is a bit of a hack until ir3_context is converted to store SSA values
2305 * as ir3_register's instead of ir3_instruction's. Pick out a given destination
2306 * of an instruction with multiple destinations using a mov that will get folded
2307 * away by ir3_cp.
2308 */
2309 static struct ir3_instruction *
2310 create_multidst_mov(struct ir3_builder *build, struct ir3_register *dst)
2311 {
2312 struct ir3_instruction *mov = ir3_build_instr(build, OPC_MOV, 1, 1);
2313 unsigned dst_flags = dst->flags & IR3_REG_HALF;
2314 unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
2315
2316 __ssa_dst(mov)->flags |= dst_flags;
2317 struct ir3_register *src =
2318 ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
2319 src->wrmask = dst->wrmask;
2320 src->def = dst;
2321 assert(!(dst->flags & IR3_REG_RELATIV));
2322 mov->cat1.src_type = mov->cat1.dst_type =
2323 (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
2324 return mov;
2325 }
2326
2327 static reduce_op_t
2328 get_reduce_op(nir_op opc)
2329 {
2330 switch (opc) {
2331 case nir_op_iadd: return REDUCE_OP_ADD_U;
2332 case nir_op_fadd: return REDUCE_OP_ADD_F;
2333 case nir_op_imul: return REDUCE_OP_MUL_U;
2334 case nir_op_fmul: return REDUCE_OP_MUL_F;
2335 case nir_op_umin: return REDUCE_OP_MIN_U;
2336 case nir_op_imin: return REDUCE_OP_MIN_S;
2337 case nir_op_fmin: return REDUCE_OP_MIN_F;
2338 case nir_op_umax: return REDUCE_OP_MAX_U;
2339 case nir_op_imax: return REDUCE_OP_MAX_S;
2340 case nir_op_fmax: return REDUCE_OP_MAX_F;
2341 case nir_op_iand: return REDUCE_OP_AND_B;
2342 case nir_op_ior: return REDUCE_OP_OR_B;
2343 case nir_op_ixor: return REDUCE_OP_XOR_B;
2344 default:
2345 unreachable("unknown NIR reduce op");
2346 }
2347 }
2348
2349 static uint32_t
2350 get_reduce_identity(nir_op opc, unsigned size)
2351 {
2352 switch (opc) {
2353 case nir_op_iadd:
2354 return 0;
2355 case nir_op_fadd:
2356 return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
2357 case nir_op_imul:
2358 return 1;
2359 case nir_op_fmul:
2360 return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
2361 case nir_op_umax:
2362 return 0;
2363 case nir_op_imax:
2364 return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
2365 case nir_op_fmax:
2366 return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
2367 case nir_op_umin:
2368 return size == 32 ? UINT32_MAX : UINT16_MAX;
2369 case nir_op_imin:
2370 return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
2371 case nir_op_fmin:
2372 return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
2373 case nir_op_iand:
2374 return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
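/* (the non-32/16-bit case above is presumably 1-bit booleans, whose
 * and-identity "true" is represented as 1)
 */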
2375 case nir_op_ior:
2376 return 0;
2377 case nir_op_ixor:
2378 return 0;
2379 default:
2380 unreachable("unknown NIR reduce op");
2381 }
2382 }
2383
2384 static struct ir3_instruction *
2385 emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2386 {
2387 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2388 nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
2389 reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2390 unsigned dst_size = intr->def.bit_size;
2391 unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
2392
2393 /* Note: the shared reg is initialized to the identity, so we need it to
2394 * always be 32-bit even when the source isn't because half shared regs are
2395 * not supported.
2396 */
2397 struct ir3_instruction *identity = create_immed_shared(
2398 &ctx->build, get_reduce_identity(nir_reduce_op, dst_size), true);
2399
2400 /* OPC_SCAN_MACRO has the following destinations:
2401 * - Exclusive scan result (interferes with source)
2402 * - Inclusive scan result
2403 * - Shared reg reduction result, must be initialized to the identity
2404 *
2405 * The loop computes all three results at the same time, we just have to
2406 * choose which destination to return.
2407 */
2408 struct ir3_instruction *scan =
2409 ir3_build_instr(&ctx->build, OPC_SCAN_MACRO, 3, 2);
2410 scan->cat1.reduce_op = reduce_op;
2411
2412 struct ir3_register *exclusive = __ssa_dst(scan);
2413 exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
2414 struct ir3_register *inclusive = __ssa_dst(scan);
2415 inclusive->flags |= flags;
2416 struct ir3_register *reduce = __ssa_dst(scan);
2417 reduce->flags |= IR3_REG_SHARED;
2418
2419 /* The 32-bit multiply macro reads its sources after writing a partial result
2420 * to the destination, therefore inclusive also interferes with the source.
2421 */
2422 if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
2423 inclusive->flags |= IR3_REG_EARLY_CLOBBER;
2424
2425 /* Normal source */
2426 __ssa_src(scan, src, 0);
2427
2428 /* shared reg tied source */
2429 struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2430 ir3_reg_tie(reduce, reduce_init);
2431
2432 struct ir3_register *dst;
2433 switch (intr->intrinsic) {
2434 case nir_intrinsic_reduce: dst = reduce; break;
2435 case nir_intrinsic_inclusive_scan: dst = inclusive; break;
2436 case nir_intrinsic_exclusive_scan: dst = exclusive; break;
2437 default:
2438 unreachable("unknown reduce intrinsic");
2439 }
2440
2441 return create_multidst_mov(&ctx->build, dst);
2442 }
2443
2444 static struct ir3_instruction *
2445 emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
2446 nir_intrinsic_instr *intr)
2447 {
2448 nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
2449 reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2450 unsigned dst_size = intr->def.bit_size;
2451
2452 bool need_exclusive =
2453 intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
2454 bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
2455
2456 /* Note: the shared reg is initialized to the identity, so we need it to
2457 * always be 32-bit even when the source isn't because half shared regs are
2458 * not supported.
2459 */
2460 struct ir3_instruction *identity = create_immed_shared(
2461 &ctx->build, get_reduce_identity(nir_reduce_op, dst_size), true);
2462
2463 struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
2464
2465 struct ir3_instruction *exclusive_src = NULL;
2466 if (need_exclusive)
2467 exclusive_src = ir3_get_src(ctx, &intr->src[1])[0];
2468
2469 /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
2470 * - Shared reg reduction result, must be initialized to the identity
2471 * - Inclusive scan result
2472 * - (iff exclusive) Exclusive scan result. Conditionally added because
2473 * calculating the exclusive value is optional (i.e., not a side-effect of
2474 * calculating the inclusive value) and won't be DCE'd anymore at this
2475 * point.
2476 * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
2477 * most ops but this isn't possible for the 32b mul_u macro since its
2478 * destination is clobbered. So conditionally allocate an extra
2479 * register in that case.
2480 *
2481 * Note that the getlast loop this macro expands to iterates over all
2482 * clusters. However, for each iteration, not only the fibers in the current
2483 * cluster are active but all later ones as well. Since they still need their
2484 * sources when their cluster is handled, all destinations interfere with
2485 * the sources.
2486 */
2487 unsigned ndst = 2 + need_exclusive + need_scratch;
2488 unsigned nsrc = 2 + need_exclusive;
2489 struct ir3_instruction *scan =
2490 ir3_build_instr(&ctx->build, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
2491 scan->cat1.reduce_op = reduce_op;
2492
2493 unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
2494 if (ir3_bitsize(ctx, dst_size) == 16)
2495 dst_flags |= IR3_REG_HALF;
2496
2497 struct ir3_register *reduce = __ssa_dst(scan);
2498 reduce->flags |= IR3_REG_SHARED;
2499 struct ir3_register *inclusive = __ssa_dst(scan);
2500 inclusive->flags |= dst_flags;
2501
2502 struct ir3_register *exclusive = NULL;
2503 if (need_exclusive) {
2504 exclusive = __ssa_dst(scan);
2505 exclusive->flags |= dst_flags;
2506 }
2507
2508 if (need_scratch) {
2509 struct ir3_register *scratch = __ssa_dst(scan);
2510 scratch->flags |= dst_flags;
2511 }
2512
2513 struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2514 ir3_reg_tie(reduce, reduce_init);
2515
2516 __ssa_src(scan, inclusive_src, 0);
2517
2518 if (need_exclusive)
2519 __ssa_src(scan, exclusive_src, 0);
2520
2521 struct ir3_register *dst;
2522 switch (intr->intrinsic) {
2523 case nir_intrinsic_reduce_clusters_ir3:
2524 dst = reduce;
2525 break;
2526 case nir_intrinsic_inclusive_scan_clusters_ir3:
2527 dst = inclusive;
2528 break;
2529 case nir_intrinsic_exclusive_scan_clusters_ir3: {
2530 assert(exclusive != NULL);
2531 dst = exclusive;
2532 break;
2533 }
2534 default:
2535 unreachable("unknown reduce intrinsic");
2536 }
2537
2538 return create_multidst_mov(&ctx->build, dst);
2539 }
2540
2541 static struct ir3_instruction *
2542 emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2543 {
2544 struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
2545 struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
2546 return ir3_BRCST_ACTIVE(&ctx->build, nir_intrinsic_cluster_size(intr),
2547 brcst_val, default_src);
2548 }
2549
2550 static ir3_shfl_mode
2551 shfl_mode(nir_intrinsic_instr *intr)
2552 {
2553 switch (intr->intrinsic) {
2554 case nir_intrinsic_rotate:
2555 return SHFL_RDOWN;
2556 case nir_intrinsic_shuffle_up_uniform_ir3:
2557 return SHFL_RUP;
2558 case nir_intrinsic_shuffle_down_uniform_ir3:
2559 return SHFL_RDOWN;
2560 case nir_intrinsic_shuffle_xor_uniform_ir3:
2561 return SHFL_XOR;
2562 default:
2563 unreachable("unsupported shfl");
2564 }
2565 }
2566
2567 static struct ir3_instruction *
2568 emit_shfl(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2569 {
2570 assert(ctx->compiler->has_shfl);
2571
2572 struct ir3_instruction *val = ir3_get_src(ctx, &intr->src[0])[0];
2573 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
2574
2575 struct ir3_instruction *shfl = ir3_SHFL(&ctx->build, val, 0, idx, 0);
2576 shfl->cat6.shfl_mode = shfl_mode(intr);
2577 shfl->cat6.type = is_half(val) ? TYPE_U16 : TYPE_U32;
2578
2579 return shfl;
2580 }
2581
2582 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2583 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2584
2585 static void
2586 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2587 {
2588 const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
2589 struct ir3_instruction **dst;
2590 struct ir3_instruction *const *src;
2591 struct ir3_builder *b = &ctx->build;
2592 unsigned dest_components = nir_intrinsic_dest_components(intr);
2593 int idx;
2594 bool create_rpt = false;
2595
2596 if (info->has_dest) {
2597 dst = ir3_get_def(ctx, &intr->def, dest_components);
2598 } else {
2599 dst = NULL;
2600 }
2601
2602 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
2603 const unsigned primitive_param =
2604 const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4 * 4;
2605 const unsigned primitive_map =
2606 const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4 * 4;
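/* offset_vec4 is in units of vec4 consts; the "* 4" above converts it to the
 * scalar component index expected by create_uniform() below.
 */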
2607
2608 switch (intr->intrinsic) {
2609 case nir_intrinsic_decl_reg:
2610 /* There's logically nothing to do, but this has a destination in NIR so
2611 * plug in something... It will get DCE'd.
2612 */
2613 dst[0] = create_immed(b, 0);
2614 break;
2615
2616 case nir_intrinsic_load_reg:
2617 case nir_intrinsic_load_reg_indirect: {
2618 struct ir3_array *arr = ir3_get_array(ctx, intr->src[0].ssa);
2619 struct ir3_instruction *addr = NULL;
2620
2621 if (intr->intrinsic == nir_intrinsic_load_reg_indirect) {
2622 addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[1])[0],
2623 dest_components);
2624 }
2625
2626 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[0].ssa);
2627 assert(dest_components == nir_intrinsic_num_components(decl));
2628
2629 for (unsigned i = 0; i < dest_components; i++) {
2630 unsigned n = nir_intrinsic_base(intr) * dest_components + i;
2631 compile_assert(ctx, n < arr->length);
2632 dst[i] = ir3_create_array_load(ctx, arr, n, addr);
2633 }
2634
2635 break;
2636 }
2637
2638 case nir_intrinsic_store_reg:
2639 case nir_intrinsic_store_reg_indirect: {
2640 struct ir3_array *arr = ir3_get_array(ctx, intr->src[1].ssa);
2641 unsigned num_components = nir_src_num_components(intr->src[0]);
2642 struct ir3_instruction *addr = NULL;
2643
2644 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[1].ssa);
2645 assert(num_components == nir_intrinsic_num_components(decl));
2646
2647 struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[0]);
2648
2649 if (intr->intrinsic == nir_intrinsic_store_reg_indirect) {
2650 addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[2])[0],
2651 num_components);
2652 }
2653
2654 u_foreach_bit(i, nir_intrinsic_write_mask(intr)) {
2655 assert(i < num_components);
2656
2657 unsigned n = nir_intrinsic_base(intr) * num_components + i;
2658 compile_assert(ctx, n < arr->length);
2659 if (value[i])
2660 ir3_create_array_store(ctx, arr, n, value[i], addr);
2661 }
2662
2663 break;
2664 }
2665
2666 case nir_intrinsic_load_const_ir3:
2667 idx = nir_intrinsic_base(intr);
2668 if (nir_src_is_const(intr->src[0])) {
2669 idx += nir_src_as_uint(intr->src[0]);
2670 for (int i = 0; i < dest_components; i++) {
2671 dst[i] = create_uniform_typed(
2672 b, idx + i,
2673 intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
2674 }
2675 create_rpt = true;
2676 } else {
2677 src = ctx->compiler->has_scalar_alu ?
2678 ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
2679 ir3_get_src(ctx, &intr->src[0]);
2680 for (int i = 0; i < dest_components; i++) {
2681 dst[i] = create_uniform_indirect(
2682 b, idx + i,
2683 intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
2684 ir3_get_addr0(ctx, src[0], 1));
2685 /* Since this may not be foldable into conversions to shared
2686 * registers, manually make it shared. Optimizations can undo this if
2687 * the user can't use shared regs.
2688 */
2689 if (ctx->compiler->has_scalar_alu && !intr->def.divergent)
2690 dst[i]->dsts[0]->flags |= IR3_REG_SHARED;
2691 }
2692
2693 ctx->has_relative_load_const_ir3 = true;
2694 }
2695 break;
2696
2697 case nir_intrinsic_load_vs_primitive_stride_ir3:
2698 dst[0] = create_uniform(b, primitive_param + 0);
2699 break;
2700 case nir_intrinsic_load_vs_vertex_stride_ir3:
2701 dst[0] = create_uniform(b, primitive_param + 1);
2702 break;
2703 case nir_intrinsic_load_hs_patch_stride_ir3:
2704 dst[0] = create_uniform(b, primitive_param + 2);
2705 break;
2706 case nir_intrinsic_load_patch_vertices_in:
2707 dst[0] = create_uniform(b, primitive_param + 3);
2708 break;
2709 case nir_intrinsic_load_tess_param_base_ir3:
2710 dst[0] = create_uniform(b, primitive_param + 4);
2711 dst[1] = create_uniform(b, primitive_param + 5);
2712 break;
2713 case nir_intrinsic_load_tess_factor_base_ir3:
2714 dst[0] = create_uniform(b, primitive_param + 6);
2715 dst[1] = create_uniform(b, primitive_param + 7);
2716 break;
2717
2718 case nir_intrinsic_load_primitive_location_ir3:
2719 idx = nir_intrinsic_driver_location(intr);
2720 dst[0] = create_uniform(b, primitive_map + idx);
2721 break;
2722
2723 case nir_intrinsic_load_gs_header_ir3:
2724 dst[0] = ctx->gs_header;
2725 break;
2726 case nir_intrinsic_load_tcs_header_ir3:
2727 dst[0] = ctx->tcs_header;
2728 break;
2729
2730 case nir_intrinsic_load_rel_patch_id_ir3:
2731 dst[0] = ctx->rel_patch_id;
2732 break;
2733
2734 case nir_intrinsic_load_primitive_id:
2735 if (!ctx->primitive_id) {
2736 ctx->primitive_id =
2737 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
2738 }
2739 dst[0] = ctx->primitive_id;
2740 break;
2741
2742 case nir_intrinsic_load_tess_coord_xy:
2743 if (!ctx->tess_coord) {
2744 ctx->tess_coord =
2745 create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
2746 }
2747 ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
2748 break;
2749
2750 case nir_intrinsic_store_global_ir3:
2751 ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
2752 break;
2753 case nir_intrinsic_load_global_ir3:
2754 ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
2755 break;
2756
2757 case nir_intrinsic_load_ubo:
2758 emit_intrinsic_load_ubo(ctx, intr, dst);
2759 break;
2760 case nir_intrinsic_load_ubo_vec4:
2761 emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
2762 break;
2763 case nir_intrinsic_copy_ubo_to_uniform_ir3:
2764 emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
2765 break;
2766 case nir_intrinsic_copy_global_to_uniform_ir3:
2767 emit_intrinsic_copy_global_to_uniform(ctx, intr);
2768 break;
2769 case nir_intrinsic_load_frag_coord:
2770 case nir_intrinsic_load_frag_coord_unscaled_ir3:
2771 ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
2772 break;
2773 case nir_intrinsic_load_sample_pos_from_id: {
2774 /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
2775 * but that doesn't seem necessary.
2776 */
2777 struct ir3_instruction *offset =
2778 ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
2779 offset->dsts[0]->wrmask = 0x3;
2780 offset->cat5.type = TYPE_F32;
2781
2782 ir3_split_dest(b, dst, offset, 0, 2);
2783
2784 break;
2785 }
2786 case nir_intrinsic_load_persp_center_rhw_ir3:
2787 if (!ctx->ij[IJ_PERSP_CENTER_RHW]) {
2788 ctx->ij[IJ_PERSP_CENTER_RHW] =
2789 create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1);
2790 }
2791 dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW];
2792 break;
2793 case nir_intrinsic_load_barycentric_centroid:
2794 case nir_intrinsic_load_barycentric_sample:
2795 case nir_intrinsic_load_barycentric_pixel:
2796 emit_intrinsic_barycentric(ctx, intr, dst);
2797 break;
2798 case nir_intrinsic_load_interpolated_input:
2799 case nir_intrinsic_load_input:
2800 setup_input(ctx, intr);
2801 break;
2802 case nir_intrinsic_load_kernel_input:
2803 emit_intrinsic_load_kernel_input(ctx, intr, dst);
2804 break;
2805 /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
2806 * pass and replaced by an ir3-specific version that adds the
2807 * dword-offset in the last source.
2808 */
2809 case nir_intrinsic_load_ssbo_ir3:
2810 emit_intrinsic_load_ssbo(ctx, intr, dst);
2811 break;
2812 case nir_intrinsic_store_ssbo_ir3:
2813 ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
2814 break;
2815 case nir_intrinsic_get_ssbo_size:
2816 emit_intrinsic_ssbo_size(ctx, intr, dst);
2817 break;
2818 case nir_intrinsic_ssbo_atomic_ir3:
2819 case nir_intrinsic_ssbo_atomic_swap_ir3:
2820 dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
2821 break;
2822 case nir_intrinsic_load_shared:
2823 emit_intrinsic_load_shared(ctx, intr, dst);
2824 break;
2825 case nir_intrinsic_store_shared:
2826 emit_intrinsic_store_shared(ctx, intr);
2827 break;
2828 case nir_intrinsic_shared_atomic:
2829 case nir_intrinsic_shared_atomic_swap:
2830 dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
2831 break;
2832 case nir_intrinsic_load_scratch:
2833 emit_intrinsic_load_scratch(ctx, intr, dst);
2834 break;
2835 case nir_intrinsic_store_scratch:
2836 emit_intrinsic_store_scratch(ctx, intr);
2837 break;
2838 case nir_intrinsic_image_load:
2839 case nir_intrinsic_bindless_image_load:
2840 emit_intrinsic_load_image(ctx, intr, dst);
2841 break;
2842 case nir_intrinsic_image_store:
2843 case nir_intrinsic_bindless_image_store:
2844 ctx->funcs->emit_intrinsic_store_image(ctx, intr);
2845 break;
2846 case nir_intrinsic_image_size:
2847 case nir_intrinsic_bindless_image_size:
2848 ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
2849 break;
2850 case nir_intrinsic_image_atomic:
2851 case nir_intrinsic_bindless_image_atomic:
2852 case nir_intrinsic_image_atomic_swap:
2853 case nir_intrinsic_bindless_image_atomic_swap:
2854 dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
2855 break;
2856 case nir_intrinsic_barrier:
2857 emit_intrinsic_barrier(ctx, intr);
2858 /* note that the block ptr is no longer valid; make that obvious: */
2859 b = NULL;
2860 break;
2861 case nir_intrinsic_store_output:
2862 case nir_intrinsic_store_per_view_output:
2863 setup_output(ctx, intr);
2864 break;
2865 case nir_intrinsic_load_base_vertex:
2866 case nir_intrinsic_load_first_vertex:
2867 if (!ctx->basevertex) {
2868 ctx->basevertex = create_driver_param(ctx, IR3_DP_VS(vtxid_base));
2869 }
2870 dst[0] = ctx->basevertex;
2871 break;
2872 case nir_intrinsic_load_is_indexed_draw:
2873 if (!ctx->is_indexed_draw) {
2874 ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_VS(is_indexed_draw));
2875 }
2876 dst[0] = ctx->is_indexed_draw;
2877 break;
2878 case nir_intrinsic_load_draw_id:
2879 if (!ctx->draw_id) {
2880 ctx->draw_id = create_driver_param(ctx, IR3_DP_VS(draw_id));
2881 }
2882 dst[0] = ctx->draw_id;
2883 break;
2884 case nir_intrinsic_load_base_instance:
2885 if (!ctx->base_instance) {
2886 ctx->base_instance = create_driver_param(ctx, IR3_DP_VS(instid_base));
2887 }
2888 dst[0] = ctx->base_instance;
2889 break;
2890 case nir_intrinsic_load_view_index:
2891 if (!ctx->view_index) {
2892 ctx->view_index =
2893 create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
2894 }
2895 dst[0] = ctx->view_index;
2896 break;
2897 case nir_intrinsic_load_vertex_id_zero_base:
2898 case nir_intrinsic_load_vertex_id:
2899 if (!ctx->vertex_id) {
2900 gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
2901 ? SYSTEM_VALUE_VERTEX_ID
2902 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
2903 ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
2904 }
2905 dst[0] = ctx->vertex_id;
2906 break;
2907 case nir_intrinsic_load_instance_id:
2908 if (!ctx->instance_id) {
2909 ctx->instance_id =
2910 create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
2911 }
2912 dst[0] = ctx->instance_id;
2913 break;
2914 case nir_intrinsic_load_sample_id:
2915 case nir_intrinsic_load_sample_id_no_per_sample:
2916 if (!ctx->samp_id) {
2917 ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
2918 ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
2919 }
2920 dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
2921 break;
2922 case nir_intrinsic_load_sample_mask_in:
2923 if (!ctx->samp_mask_in) {
2924 ctx->so->reads_smask = true;
2925 ctx->samp_mask_in =
2926 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
2927 }
2928 dst[0] = ctx->samp_mask_in;
2929 break;
2930 case nir_intrinsic_load_user_clip_plane:
2931 idx = nir_intrinsic_ucp_id(intr);
2932 for (int i = 0; i < dest_components; i++) {
2933 unsigned n = idx * 4 + i;
2934 dst[i] = create_driver_param(ctx, IR3_DP_VS(ucp[0].x) + n);
2935 }
2936 create_rpt = true;
2937 break;
2938 case nir_intrinsic_load_front_face:
2939 if (!ctx->frag_face) {
2940 ctx->so->frag_face = true;
2941 ctx->frag_face =
2942 create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
2943 ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
2944 }
2945 /* for fragface, we get -1 for back and 0 for front. However this is
2946 * the inverse of what nir expects (where ~0 is true).
2947 */
2948 dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
2949 create_immed_typed(b, 0, TYPE_U16), 0);
2950 dst[0]->cat2.condition = IR3_COND_EQ;
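/* i.e. front face: 0 == 0 -> true (~0), back face: -1 == 0 -> false (0) */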
2951 break;
2952 case nir_intrinsic_load_local_invocation_id:
2953 if (!ctx->local_invocation_id) {
2954 ctx->local_invocation_id =
2955 create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
2956 }
2957 ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
2958 break;
2959 case nir_intrinsic_load_workgroup_id:
2960 if (ctx->compiler->has_shared_regfile) {
2961 if (!ctx->work_group_id) {
2962 ctx->work_group_id =
2963 create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
2964 ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
2965 }
2966 ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
2967 } else {
2968 /* For a3xx/a4xx, this comes in via const injection by the hw */
2969 for (int i = 0; i < dest_components; i++) {
2970 dst[i] = create_driver_param(ctx, IR3_DP_CS(workgroup_id_x) + i);
2971 }
2972 }
2973 break;
2974 case nir_intrinsic_load_frag_shading_rate: {
2975 if (!ctx->frag_shading_rate) {
2976 ctx->so->reads_shading_rate = true;
2977 ctx->frag_shading_rate =
2978 create_sysval_input(ctx, SYSTEM_VALUE_FRAG_SHADING_RATE, 0x1);
2979 }
2980 dst[0] = ctx->frag_shading_rate;
2981 break;
2982 }
2983 case nir_intrinsic_load_base_workgroup_id:
2984 for (int i = 0; i < dest_components; i++) {
2985 dst[i] = create_driver_param(ctx, IR3_DP_CS(base_group_x) + i);
2986 }
2987 create_rpt = true;
2988 break;
2989 case nir_intrinsic_load_num_workgroups:
2990 for (int i = 0; i < dest_components; i++) {
2991 dst[i] = create_driver_param(ctx, IR3_DP_CS(num_work_groups_x) + i);
2992 }
2993 create_rpt = true;
2994 break;
2995 case nir_intrinsic_load_workgroup_size:
2996 for (int i = 0; i < dest_components; i++) {
2997 dst[i] = create_driver_param(ctx, IR3_DP_CS(local_group_size_x) + i);
2998 }
2999 create_rpt = true;
3000 break;
3001 case nir_intrinsic_load_subgroup_size: {
3002 assert(ctx->so->type == MESA_SHADER_COMPUTE ||
3003 ctx->so->type == MESA_SHADER_FRAGMENT);
3004 unsigned size = ctx->so->type == MESA_SHADER_COMPUTE ?
3005 IR3_DP_CS(subgroup_size) : IR3_DP_FS(subgroup_size);
3006 dst[0] = create_driver_param(ctx, size);
3007 break;
3008 }
3009 case nir_intrinsic_load_subgroup_id_shift_ir3:
3010 dst[0] = create_driver_param(ctx, IR3_DP_CS(subgroup_id_shift));
3011 break;
3012 case nir_intrinsic_load_work_dim:
3013 dst[0] = create_driver_param(ctx, IR3_DP_CS(work_dim));
3014 break;
3015 case nir_intrinsic_load_subgroup_invocation:
3016 assert(ctx->compiler->has_getfiberid);
3017 dst[0] = ir3_GETFIBERID(b);
3018 dst[0]->cat6.type = TYPE_U32;
3019 __ssa_dst(dst[0]);
3020 break;
3021 case nir_intrinsic_load_tess_level_outer_default:
3022 for (int i = 0; i < dest_components; i++) {
3023 dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_outer_level_x) + i);
3024 }
3025 create_rpt = true;
3026 break;
3027 case nir_intrinsic_load_tess_level_inner_default:
3028 for (int i = 0; i < dest_components; i++) {
3029 dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_inner_level_x) + i);
3030 }
3031 create_rpt = true;
3032 break;
3033 case nir_intrinsic_load_frag_invocation_count:
3034 dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
3035 break;
3036 case nir_intrinsic_load_frag_size_ir3:
3037 case nir_intrinsic_load_frag_offset_ir3: {
3038 unsigned param =
3039 intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
3040 IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
3041 if (nir_src_is_const(intr->src[0])) {
3042 uint32_t view = nir_src_as_uint(intr->src[0]);
3043 for (int i = 0; i < dest_components; i++) {
3044 dst[i] = create_driver_param(ctx, param + 4 * view + i);
3045 }
3046 create_rpt = true;
3047 } else {
3048 struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
3049 for (int i = 0; i < dest_components; i++) {
3050 dst[i] = create_driver_param_indirect(ctx, param + i,
3051 ir3_get_addr0(ctx, view, 4));
3052 }
3053 ctx->so->constlen =
3054 MAX2(ctx->so->constlen,
3055 const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 +
3056 param / 4 + nir_intrinsic_range(intr));
3057 }
3058 break;
3059 }
3060 case nir_intrinsic_demote:
3061 case nir_intrinsic_demote_if:
3062 case nir_intrinsic_terminate:
3063 case nir_intrinsic_terminate_if: {
3064 struct ir3_instruction *cond, *kill;
3065
3066 if (intr->intrinsic == nir_intrinsic_demote_if ||
3067 intr->intrinsic == nir_intrinsic_terminate_if) {
3068 /* conditional discard: */
3069 src = ir3_get_src(ctx, &intr->src[0]);
3070 cond = src[0];
3071 } else {
3072 /* unconditional discard: */
3073 cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
3074 }
3075
3076 /* NOTE: only cmps.*.* can write p0.x: */
3077 struct ir3_instruction *zero =
3078 create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
3079 cond = ir3_CMPS_S(b, cond, 0, zero, 0);
3080 cond->cat2.condition = IR3_COND_NE;
3081
3082 /* condition always goes in predicate register: */
3083 cond->dsts[0]->flags |= IR3_REG_PREDICATE;
3084
3085 if (intr->intrinsic == nir_intrinsic_demote ||
3086 intr->intrinsic == nir_intrinsic_demote_if) {
3087 kill = ir3_DEMOTE(b, cond, 0);
3088 } else {
3089 kill = ir3_KILL(b, cond, 0);
3090 }
3091
3092 /* - Side-effects should not be moved to the other side of the kill
3093 * - Instructions that depend on active fibers should not be reordered
3094 */
3095 kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
3096 IR3_BARRIER_ACTIVE_FIBERS_W;
3097 kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
3098 IR3_BARRIER_ACTIVE_FIBERS_R;
3099 kill->srcs[0]->flags |= IR3_REG_PREDICATE;
3100
3101 array_insert(ctx->block, ctx->block->keeps, kill);
3102 ctx->so->has_kill = true;
3103
3104 break;
3105 }
3106
3107 case nir_intrinsic_vote_any:
3108 case nir_intrinsic_vote_all: {
3109 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3110 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3111 if (intr->intrinsic == nir_intrinsic_vote_any)
3112 dst[0] = ir3_ANY_MACRO(b, pred, 0);
3113 else
3114 dst[0] = ir3_ALL_MACRO(b, pred, 0);
3115 dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3116 break;
3117 }
3118 case nir_intrinsic_elect:
3119 dst[0] = ir3_ELECT_MACRO(b);
3120 dst[0]->flags |= IR3_INSTR_NEEDS_HELPERS;
3121 break;
3122 case nir_intrinsic_elect_any_ir3:
3123 dst[0] = ir3_ELECT_MACRO(b);
3124 break;
3125 case nir_intrinsic_preamble_start_ir3:
3126 dst[0] = ir3_SHPS_MACRO(b);
3127 break;
3128
3129 case nir_intrinsic_read_invocation_cond_ir3: {
3130 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3131 struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
3132 dst[0] = ir3_READ_COND_MACRO(b, ir3_get_predicate(ctx, cond), 0, src, 0);
3133 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3134 dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3135 /* Work around a bug with half-register shared -> non-shared moves by
3136 * adding an extra mov here so that the original destination stays full.
3137 */
3138 if (src->dsts[0]->flags & IR3_REG_HALF) {
3139 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3140 if (!ctx->compiler->has_scalar_alu)
3141 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3142 }
3143 break;
3144 }
3145
3146 case nir_intrinsic_read_first_invocation: {
3147 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3148 dst[0] = ir3_READ_FIRST_MACRO(b, src, 0);
3149 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3150 /* See above. */
3151 if (src->dsts[0]->flags & IR3_REG_HALF) {
3152 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3153 if (!ctx->compiler->has_scalar_alu)
3154 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3155 }
3156 break;
3157 }
3158
3159 case nir_intrinsic_read_getlast_ir3: {
3160 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3161 dst[0] = ir3_READ_GETLAST_MACRO(b, src, 0);
3162 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3163 /* See above. */
3164 if (src->dsts[0]->flags & IR3_REG_HALF) {
3165 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3166 if (!ctx->compiler->has_scalar_alu)
3167 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3168 }
3169 break;
3170 }
3171
3172 case nir_intrinsic_ballot: {
3173 struct ir3_instruction *ballot;
3174 unsigned components = intr->def.num_components;
3175 if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
3176 /* ballot(true) is just MOVMSK */
3177 ballot = ir3_MOVMSK(b, components);
3178 } else {
3179 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3180 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3181 ballot = ir3_BALLOT_MACRO(b, pred, components);
3182 ballot->srcs[0]->flags |= IR3_REG_PREDICATE;
3183 }
3184
3185 ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
3186 ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W;
3187
3188 ir3_split_dest(b, dst, ballot, 0, components);
3189 break;
3190 }
3191
3192 case nir_intrinsic_quad_broadcast: {
3193 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3194 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
3195
3196 type_t dst_type = type_uint_size(intr->def.bit_size);
3197
3198 if (dst_type != TYPE_U32)
3199 idx = ir3_COV(b, idx, TYPE_U32, dst_type);
3200
3201 dst[0] = ir3_QUAD_SHUFFLE_BRCST(b, src, 0, idx, 0);
3202 dst[0]->cat5.type = dst_type;
3203 break;
3204 }
3205
3206 case nir_intrinsic_quad_swap_horizontal: {
3207 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3208 dst[0] = ir3_QUAD_SHUFFLE_HORIZ(b, src, 0);
3209 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3210 break;
3211 }
3212
3213 case nir_intrinsic_quad_swap_vertical: {
3214 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3215 dst[0] = ir3_QUAD_SHUFFLE_VERT(b, src, 0);
3216 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3217 break;
3218 }
3219
3220 case nir_intrinsic_quad_swap_diagonal: {
3221 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3222 dst[0] = ir3_QUAD_SHUFFLE_DIAG(b, src, 0);
3223 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3224 break;
3225 }
3226 case nir_intrinsic_ddx:
3227 case nir_intrinsic_ddx_coarse: {
3228 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3229 dst[0] = ir3_DSX(b, src, 0);
3230 dst[0]->cat5.type = TYPE_F32;
3231 break;
3232 }
3233 case nir_intrinsic_ddx_fine: {
3234 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3235 dst[0] = ir3_DSXPP_MACRO(b, src, 0);
3236 dst[0]->cat5.type = TYPE_F32;
3237 break;
3238 }
3239 case nir_intrinsic_ddy:
3240 case nir_intrinsic_ddy_coarse: {
3241 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3242 dst[0] = ir3_DSY(b, src, 0);
3243 dst[0]->cat5.type = TYPE_F32;
3244 break;
3245 }
3246 case nir_intrinsic_ddy_fine: {
3247 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3248 dst[0] = ir3_DSYPP_MACRO(b, src, 0);
3249 dst[0]->cat5.type = TYPE_F32;
3250 break;
3251 }
3252 case nir_intrinsic_load_shared_ir3:
3253 emit_intrinsic_load_shared_ir3(ctx, intr, dst);
3254 break;
3255 case nir_intrinsic_store_shared_ir3:
3256 emit_intrinsic_store_shared_ir3(ctx, intr);
3257 break;
3258 case nir_intrinsic_bindless_resource_ir3:
3259 dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
3260 break;
3261 case nir_intrinsic_global_atomic_ir3:
3262 case nir_intrinsic_global_atomic_swap_ir3: {
3263 dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
3264 break;
3265 }
3266
3267 case nir_intrinsic_reduce:
3268 case nir_intrinsic_inclusive_scan:
3269 case nir_intrinsic_exclusive_scan:
3270 dst[0] = emit_intrinsic_reduce(ctx, intr);
3271 break;
3272
3273 case nir_intrinsic_reduce_clusters_ir3:
3274 case nir_intrinsic_inclusive_scan_clusters_ir3:
3275 case nir_intrinsic_exclusive_scan_clusters_ir3:
3276 dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
3277 break;
3278
3279 case nir_intrinsic_brcst_active_ir3:
3280 dst[0] = emit_intrinsic_brcst_active(ctx, intr);
3281 break;
3282
3283 case nir_intrinsic_preamble_end_ir3: {
3284 struct ir3_instruction *instr = ir3_SHPE(b);
3285 instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
3286 array_insert(ctx->block, ctx->block->keeps, instr);
3287 break;
3288 }
3289 case nir_intrinsic_store_const_ir3: {
3290 unsigned components = nir_src_num_components(intr->src[0]);
3291 unsigned dst = nir_intrinsic_base(intr);
3292 unsigned dst_lo = dst & 0xff;
3293 unsigned dst_hi = dst >> 8;
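/* A purely illustrative value: a base of 0x1a4 splits into dst_lo = 0xa4 and
 * dst_hi = 0x1. Only the low byte is encoded directly in the stc below; the
 * upper bits are supplied through a1.x.
 */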
3294
3295 struct ir3_instruction *src =
3296 ir3_create_collect(b, ir3_get_src_shared(ctx, &intr->src[0],
3297 ctx->compiler->has_scalar_alu),
3298 components);
3299 struct ir3_instruction *a1 = NULL;
3300 if (dst_hi) {
3301 /* Encode only the high part of the destination in a1.x to increase the
3302 * chance that we can reuse the a1.x value in subsequent stc
3303 * instructions.
3304 */
3305 a1 = ir3_get_addr1(ctx, dst_hi << 8);
3306 }
3307
3308 struct ir3_instruction *stc =
3309 ir3_STC(b, create_immed(b, dst_lo), 0, src, 0);
3310 stc->cat6.iim_val = components;
3311 stc->cat6.type = TYPE_U32;
3312 stc->barrier_conflict = IR3_BARRIER_CONST_W;
3313 if (a1) {
3314 ir3_instr_set_address(stc, a1);
3315 stc->flags |= IR3_INSTR_A1EN;
3316 }
3317 /* The assembler isn't aware of what value a1.x has, so make sure that
3318 * constlen includes the stc here.
3319 */
3320 ctx->so->constlen =
3321 MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + components, 4));
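/* Note that constlen is tracked in vec4 units while dst and components above
 * are scalar const words, hence the DIV_ROUND_UP by 4.
 */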
3322 array_insert(ctx->block, ctx->block->keeps, stc);
3323 break;
3324 }
3325 case nir_intrinsic_copy_push_const_to_uniform_ir3: {
3326 struct ir3_instruction *load =
3327 ir3_build_instr(b, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
3328 array_insert(ctx->block, ctx->block->keeps, load);
3329
3330 load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
3331 load->push_consts.src_base = nir_intrinsic_base(intr);
3332 load->push_consts.src_size = nir_intrinsic_range(intr);
3333
3334 ctx->so->constlen =
3335 MAX2(ctx->so->constlen,
3336 DIV_ROUND_UP(
3337 load->push_consts.dst_base + load->push_consts.src_size, 4));
3338 break;
3339 }
3340 case nir_intrinsic_prefetch_sam_ir3: {
3341 struct tex_src_info info =
3342 get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1]);
3343 struct ir3_instruction *sam =
3344 emit_sam(ctx, OPC_SAM, info, TYPE_F32, 0b1111, NULL, NULL);
3345
3346 sam->dsts_count = 0;
3347 array_insert(ctx->block, ctx->block->keeps, sam);
3348 break;
3349 }
3350 case nir_intrinsic_prefetch_tex_ir3: {
3351 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3352 struct ir3_instruction *resinfo = ir3_RESINFO(b, idx, 0);
3353 resinfo->cat6.iim_val = 1;
3354 resinfo->cat6.d = 1;
3355 resinfo->cat6.type = TYPE_U32;
3356 resinfo->cat6.typed = false;
3357
3358 ir3_handle_bindless_cat6(resinfo, intr->src[0]);
3359 if (resinfo->flags & IR3_INSTR_B)
3360 ctx->so->bindless_tex = true;
3361
3362 resinfo->dsts_count = 0;
3363 array_insert(ctx->block, ctx->block->keeps, resinfo);
3364 break;
3365 }
3366 case nir_intrinsic_prefetch_ubo_ir3: {
3367 struct ir3_instruction *offset = create_immed(b, 0);
3368 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3369 struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
3370 ldc->cat6.iim_val = 1;
3371 ldc->cat6.type = TYPE_U32;
3372
3373 ir3_handle_bindless_cat6(ldc, intr->src[0]);
3374 if (ldc->flags & IR3_INSTR_B)
3375 ctx->so->bindless_ubo = true;
3376
3377 ldc->dsts_count = 0;
3378 array_insert(ctx->block, ctx->block->keeps, ldc);
3379 break;
3380 }
3381 case nir_intrinsic_rotate:
3382 case nir_intrinsic_shuffle_up_uniform_ir3:
3383 case nir_intrinsic_shuffle_down_uniform_ir3:
3384 case nir_intrinsic_shuffle_xor_uniform_ir3:
3385 dst[0] = emit_shfl(ctx, intr);
3386 break;
3387 default:
3388 ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
3389 nir_intrinsic_infos[intr->intrinsic].name);
3390 break;
3391 }
3392
3393 if (info->has_dest) {
3394 if (create_rpt)
3395 ir3_instr_create_rpt(dst, dest_components);
3396 ir3_put_def(ctx, &intr->def);
3397 }
3398 }
3399
3400 static void
3401 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
3402 {
3403 unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size);
3404 struct ir3_instruction **dst =
3405 ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components * ((bit_size == 64) ? 2 : 1));
3406
3407 if (bit_size <= 8) {
3408 for (int i = 0; i < instr->def.num_components; i++)
3409 dst[i] = create_immed_typed(&ctx->build, instr->value[i].u8, TYPE_U8);
3410 } else if (bit_size <= 16) {
3411 for (int i = 0; i < instr->def.num_components; i++)
3412 dst[i] =
3413 create_immed_typed(&ctx->build, instr->value[i].u16, TYPE_U16);
3414 } else if (bit_size <= 32) {
3415 for (int i = 0; i < instr->def.num_components; i++)
3416 dst[i] =
3417 create_immed_typed(&ctx->build, instr->value[i].u32, TYPE_U32);
3418 } else {
3419 assert(instr->def.num_components == 1);
3420 for (int i = 0; i < instr->def.num_components; i++) {
3421 dst[2 * i] = create_immed_typed(
3422 &ctx->build, (uint32_t)(instr->value[i].u64), TYPE_U32);
3423 dst[2 * i + 1] = create_immed_typed(
3424 &ctx->build, (uint32_t)(instr->value[i].u64 >> 32), TYPE_U32);
3425 }
3426 }
3427 }
3428
3429 static void
3430 emit_undef(struct ir3_context *ctx, nir_undef_instr *undef)
3431 {
3432 struct ir3_instruction **dst =
3433 ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
3434 type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size));
3435
3436 /* backend doesn't want undefined instructions, so just plug
3437 * in 0.0..
3438 */
3439 for (int i = 0; i < undef->def.num_components; i++)
3440 dst[i] = create_immed_typed(&ctx->build, fui(0.0), type);
3441 }
3442
3443 /*
3444 * texture fetch/sample instructions:
3445 */
3446
3447 static type_t
3448 get_tex_dest_type(nir_tex_instr *tex)
3449 {
3450 type_t type;
3451
3452 switch (tex->dest_type) {
3453 case nir_type_float32:
3454 return TYPE_F32;
3455 case nir_type_float16:
3456 return TYPE_F16;
3457 case nir_type_int32:
3458 return TYPE_S32;
3459 case nir_type_int16:
3460 return TYPE_S16;
3461 case nir_type_bool32:
3462 case nir_type_uint32:
3463 return TYPE_U32;
3464 case nir_type_bool16:
3465 case nir_type_uint16:
3466 return TYPE_U16;
3467 case nir_type_invalid:
3468 default:
3469 unreachable("bad dest_type");
3470 }
3471
3472 return type;
3473 }
3474
3475 static void
3476 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
3477 {
3478 unsigned coords =
3479 glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
3480 unsigned flags = 0;
3481
3482 /* note: would use tex->coord_components.. except txs.. also,
3483 * since array index goes after shadow ref, we don't want to
3484 * count it:
3485 */
3486 if (coords == 3)
3487 flags |= IR3_INSTR_3D;
3488
3489 if (tex->is_shadow && tex->op != nir_texop_lod)
3490 flags |= IR3_INSTR_S;
3491
3492 if (tex->is_array && tex->op != nir_texop_lod)
3493 flags |= IR3_INSTR_A;
3494
3495 *flagsp = flags;
3496 *coordsp = coords;
3497 }
3498
3499 /* Gets the sampler/texture idx as a hvec2, which could either be dynamic
3500 * or immediate (in which case it will get lowered later to a non-.s2en
3501 * version of the tex instruction that encodes tex/samp as immediates).
3502 */
3503 static struct tex_src_info
3504 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
3505 {
3506 struct ir3_builder *b = &ctx->build;
3507 struct tex_src_info info = {0};
3508 int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3509 int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
3510 struct ir3_instruction *texture, *sampler;
3511
3512 if (texture_idx >= 0 || sampler_idx >= 0) {
3513 /* Bindless case */
3514 info = get_bindless_samp_src(ctx,
3515 texture_idx >= 0 ? &tex->src[texture_idx].src : NULL,
3516 sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL);
3517
3518 if (tex->texture_non_uniform || tex->sampler_non_uniform)
3519 info.flags |= IR3_INSTR_NONUNIF;
3520 } else {
3521 info.flags |= IR3_INSTR_S2EN;
3522 texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
3523 sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
3524 if (texture_idx >= 0) {
3525 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
3526 texture = ir3_COV(b, texture, TYPE_U32, TYPE_U16);
3527 } else {
3528 /* TODO what to do for dynamic case? I guess we only need the
3529 * max index for astc srgb workaround so maybe not a problem
3530 * to worry about if we don't enable indirect samplers for
3531 * a4xx?
3532 */
3533 ctx->max_texture_index =
3534 MAX2(ctx->max_texture_index, tex->texture_index);
3535 texture = create_immed_typed(b, tex->texture_index, TYPE_U16);
3536 info.tex_idx = tex->texture_index;
3537 }
3538
3539 if (sampler_idx >= 0) {
3540 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
3541 sampler = ir3_COV(b, sampler, TYPE_U32, TYPE_U16);
3542 } else {
3543 sampler = create_immed_typed(b, tex->sampler_index, TYPE_U16);
3544 info.samp_idx = tex->texture_index;
3545 }
3546
3547 info.samp_tex = ir3_collect(b, texture, sampler);
3548 }
3549
3550 return info;
3551 }
3552
3553 static void
3554 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
3555 {
3556 struct ir3_builder *b = &ctx->build;
3557 struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
3558 struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
3559 struct ir3_instruction *lod, *compare, *proj, *sample_index;
3560 struct tex_src_info info = {0};
3561 bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
3562 unsigned i, coords, flags, ncomp;
3563 unsigned nsrc0 = 0, nsrc1 = 0;
3564 type_t type;
3565 opc_t opc = 0;
3566
3567 ncomp = tex->def.num_components;
3568
3569 coord = off = ddx = ddy = NULL;
3570 lod = proj = compare = sample_index = NULL;
3571
3572 dst = ir3_get_def(ctx, &tex->def, ncomp);
3573
3574 for (unsigned i = 0; i < tex->num_srcs; i++) {
3575 switch (tex->src[i].src_type) {
3576 case nir_tex_src_coord:
3577 coord = ir3_get_src(ctx, &tex->src[i].src);
3578 break;
3579 case nir_tex_src_bias:
3580 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3581 has_bias = true;
3582 break;
3583 case nir_tex_src_lod:
3584 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3585 has_lod = true;
3586 break;
3587 case nir_tex_src_comparator: /* shadow comparator */
3588 compare = ir3_get_src(ctx, &tex->src[i].src)[0];
3589 break;
3590 case nir_tex_src_projector:
3591 proj = ir3_get_src(ctx, &tex->src[i].src)[0];
3592 has_proj = true;
3593 break;
3594 case nir_tex_src_offset:
3595 off = ir3_get_src(ctx, &tex->src[i].src);
3596 has_off = true;
3597 break;
3598 case nir_tex_src_ddx:
3599 ddx = ir3_get_src(ctx, &tex->src[i].src);
3600 break;
3601 case nir_tex_src_ddy:
3602 ddy = ir3_get_src(ctx, &tex->src[i].src);
3603 break;
3604 case nir_tex_src_ms_index:
3605 sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
3606 break;
3607 case nir_tex_src_texture_offset:
3608 case nir_tex_src_sampler_offset:
3609 case nir_tex_src_texture_handle:
3610 case nir_tex_src_sampler_handle:
3611 /* handled in get_tex_samp_tex_src() */
3612 break;
3613 default:
3614 ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
3615 tex->src[i].src_type);
3616 return;
3617 }
3618 }
3619
3620 switch (tex->op) {
3621 case nir_texop_tex_prefetch:
3622 compile_assert(ctx, !has_bias);
3623 compile_assert(ctx, !has_lod);
3624 compile_assert(ctx, !compare);
3625 compile_assert(ctx, !has_proj);
3626 compile_assert(ctx, !has_off);
3627 compile_assert(ctx, !ddx);
3628 compile_assert(ctx, !ddy);
3629 compile_assert(ctx, !sample_index);
3630 compile_assert(
3631 ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
3632 compile_assert(
3633 ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
3634
3635 if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
3636 opc = OPC_META_TEX_PREFETCH;
3637 ctx->so->num_sampler_prefetch++;
3638 break;
3639 }
3640 FALLTHROUGH;
3641 case nir_texop_tex:
3642 opc = has_lod ? OPC_SAML : OPC_SAM;
3643 break;
3644 case nir_texop_txb:
3645 opc = OPC_SAMB;
3646 break;
3647 case nir_texop_txl:
3648 opc = OPC_SAML;
3649 break;
3650 case nir_texop_txd:
3651 opc = OPC_SAMGQ;
3652 break;
3653 case nir_texop_txf:
3654 opc = OPC_ISAML;
3655 break;
3656 case nir_texop_lod:
3657 opc = OPC_GETLOD;
3658 break;
3659 case nir_texop_tg4:
3660 switch (tex->component) {
3661 case 0:
3662 opc = OPC_GATHER4R;
3663 break;
3664 case 1:
3665 opc = OPC_GATHER4G;
3666 break;
3667 case 2:
3668 opc = OPC_GATHER4B;
3669 break;
3670 case 3:
3671 opc = OPC_GATHER4A;
3672 break;
3673 }
3674 break;
3675 case nir_texop_txf_ms_fb:
3676 case nir_texop_txf_ms:
3677 opc = OPC_ISAMM;
3678 break;
3679 default:
3680 ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
3681 return;
3682 }
3683
3684 tex_info(tex, &flags, &coords);
3685
3686 /*
3687 * lay out the first argument in the proper order:
3688 * - actual coordinates first
3689 * - shadow reference
3690 * - array index
3691 * - projection w
3692 * - starting at offset 4, dpdx.xy, dpdy.xy
3693 *
3694 * bias/lod go into the second arg
3695 */
3696
3697 /* insert tex coords: */
3698 for (i = 0; i < coords; i++)
3699 src0[i] = coord[i];
3700
3701 nsrc0 = i;
3702
3703 type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32;
3704 /* scale up integer coords for TXF based on the LOD */
3705 if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
3706 assert(has_lod);
3707 for (i = 0; i < coords; i++)
3708 src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
3709 }
3710
3711 if (coords == 1) {
3712 /* hw doesn't do 1d, so we treat it as 2d with
3713 * height of 1, and patch up the y coord.
3714 */
3715 if (is_isam(opc)) {
3716 src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type);
3717 } else if (is_half(coord[0])) {
3718 src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type);
3719 } else {
3720 src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type);
3721 }
3722 }
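/* In other words, a 1D txf is padded with an integer y of 0 (the only texel
 * row), while a filtered 1D sample presumably gets y = 0.5 so the normalized
 * coordinate lands in the middle of that single row.
 */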
3723
3724 if (tex->is_shadow && tex->op != nir_texop_lod)
3725 src0[nsrc0++] = compare;
3726
3727 if (tex->is_array && tex->op != nir_texop_lod)
3728 src0[nsrc0++] = coord[coords];
3729
3730 if (has_proj) {
3731 src0[nsrc0++] = proj;
3732 flags |= IR3_INSTR_P;
3733 }
3734
3735 /* pad to 4, then ddx/ddy: */
3736 if (tex->op == nir_texop_txd) {
3737 while (nsrc0 < 4)
3738 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3739 for (i = 0; i < coords; i++)
3740 src0[nsrc0++] = ddx[i];
3741 if (coords < 2)
3742 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3743 for (i = 0; i < coords; i++)
3744 src0[nsrc0++] = ddy[i];
3745 if (coords < 2)
3746 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3747 }
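/* As a concrete example, a plain 2D txd (no shadow/array/proj) ends up with:
 *   src0 = { x, y, 0.0, 0.0, dpdx.x, dpdx.y, dpdy.x, dpdy.y }
 * matching the argument layout described at the top of this function.
 */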
3748
3749 /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
3750 * with scaled x coord according to requested sample:
3751 */
3752 if (opc == OPC_ISAMM) {
3753 if (ctx->compiler->txf_ms_with_isaml) {
3754 /* the samples are laid out in x dimension as
3755 * 0 1 2 3
3756 * x_ms = (x << ms) + sample_index;
3757 */
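/* Rough example: with 4x MSAA the 2-bit per-texture field pulled out of
 * ctx->samples below is presumably log2(4) = 2, so texel x = 5 with
 * sample_index = 3 becomes x_ms = (5 << 2) + 3 = 23.
 */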
3758 struct ir3_instruction *ms;
3759 ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
3760
3761 src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
3762 src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
3763
3764 opc = OPC_ISAML;
3765 } else {
3766 src0[nsrc0++] = sample_index;
3767 }
3768 }
3769
3770 /*
3771 * second argument (if applicable):
3772 * - offsets
3773 * - lod
3774 * - bias
3775 */
3776 if (has_off | has_lod | has_bias) {
3777 if (has_off) {
3778 unsigned off_coords = coords;
3779 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3780 off_coords--;
3781 for (i = 0; i < off_coords; i++)
3782 src1[nsrc1++] = off[i];
3783 if (off_coords < 2)
3784 src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3785 flags |= IR3_INSTR_O;
3786 }
3787
3788 if (has_lod | has_bias)
3789 src1[nsrc1++] = lod;
3790 }
3791
3792 type = get_tex_dest_type(tex);
3793
3794 if (opc == OPC_GETLOD)
3795 type = TYPE_S32;
3796
3797 if (tex->op == nir_texop_txf_ms_fb) {
3798 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
3799
3800 ctx->so->fb_read = true;
3801 if (ctx->compiler->options.bindless_fb_read_descriptor >= 0) {
3802 ctx->so->bindless_tex = true;
3803 info.flags = IR3_INSTR_B;
3804 info.base = ctx->compiler->options.bindless_fb_read_descriptor;
3805 struct ir3_instruction *texture, *sampler;
3806
3807 int base_index =
3808 nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3809 nir_src tex_src = tex->src[base_index].src;
3810
3811 if (nir_src_is_const(tex_src)) {
3812 texture = create_immed_typed(b,
3813 nir_src_as_uint(tex_src) + ctx->compiler->options.bindless_fb_read_slot,
3814 TYPE_U32);
3815 } else {
3816 texture = create_immed_typed(
3817 b, ctx->compiler->options.bindless_fb_read_slot, TYPE_U32);
3818 struct ir3_instruction *base =
3819 ir3_get_src(ctx, &tex->src[base_index].src)[0];
3820 texture = ir3_ADD_U(b, texture, 0, base, 0);
3821 }
3822 sampler = create_immed_typed(b, 0, TYPE_U32);
3823 info.samp_tex = ir3_collect(b, texture, sampler);
3824 info.flags |= IR3_INSTR_S2EN;
3825 if (tex->texture_non_uniform) {
3826 info.flags |= IR3_INSTR_NONUNIF;
3827 }
3828 } else {
3829 /* Otherwise append a sampler to be patched into the texture
3830 * state:
3831 */
3832 info.samp_tex =
3833 ir3_collect(b, create_immed_typed(b, ctx->so->num_samp, TYPE_U16),
3834 create_immed_typed(b, ctx->so->num_samp, TYPE_U16));
3835 info.flags = IR3_INSTR_S2EN;
3836 }
3837
3838 ctx->so->num_samp++;
3839 } else {
3840 info = get_tex_samp_tex_src(ctx, tex);
3841 }
3842
3843 bool tg4_swizzle_fixup = false;
3844 if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 &&
3845 ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) {
3846 uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index];
3847 uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7;
3848 if (swizzle > 3) {
3849 /* this would mean that we can just return 0 / 1, no texturing
3850 * necessary
3851 */
3852 struct ir3_instruction *imm = create_immed(b,
3853 type_float(type) ? fui(swizzle - 4) : (swizzle - 4));
3854 for (int i = 0; i < 4; i++)
3855 dst[i] = imm;
3856 ir3_put_def(ctx, &tex->def);
3857 return;
3858 }
3859 opc = OPC_GATHER4R + swizzle;
3860 tg4_swizzle_fixup = true;
3861 }
3862
3863 struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
3864 struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
3865
3866 if (opc == OPC_META_TEX_PREFETCH) {
3867 int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3868
3869 struct ir3_builder build =
3870 ir3_builder_at(ir3_before_terminator(ctx->in_block));
3871 sam = ir3_SAM(&build, opc, type, MASK(ncomp), 0, NULL,
3872 get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
3873 sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
3874 /* make sure not to add irrelevant flags like S2EN */
3875 sam->flags = flags | (info.flags & IR3_INSTR_B);
3876 sam->prefetch.tex = info.tex_idx;
3877 sam->prefetch.samp = info.samp_idx;
3878 sam->prefetch.tex_base = info.tex_base;
3879 sam->prefetch.samp_base = info.samp_base;
3880 } else {
3881 info.flags |= flags;
3882 sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
3883 }
3884
3885 if (tg4_swizzle_fixup) {
3886 /* TODO: fix-up for ASTC when alpha is selected? */
3887 array_insert(ctx->ir, ctx->ir->tg4, sam);
3888
3889 ir3_split_dest(b, dst, sam, 0, 4);
3890
3891 uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12;
3892 if (!type_float(type) && tex_bits != 3 /* 32bpp */ &&
3893 tex_bits != 0 /* key unset */) {
3894 uint8_t bits = 0;
3895 switch (tex_bits) {
3896 case 1: /* 8bpp */
3897 bits = 8;
3898 break;
3899 case 2: /* 16bpp */
3900 bits = 16;
3901 break;
3902 case 4: /* 10bpp or 2bpp for alpha */
3903 if (opc == OPC_GATHER4A)
3904 bits = 2;
3905 else
3906 bits = 10;
3907 break;
3908 default:
3909 assert(0);
3910 }
3911
3912 sam->cat5.type = TYPE_F32;
3913 for (int i = 0; i < 4; i++) {
3914 /* scale and offset the unorm data */
3915 dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0);
3916 /* convert the scaled value to integer */
3917 dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32);
3918 /* sign extend for signed values */
3919 if (type == TYPE_S32) {
3920 dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3921 dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3922 }
3923 }
3924 }
3925 } else if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
3926 tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */
3927 !nir_tex_instr_is_query(tex)) {
3928 assert(opc != OPC_META_TEX_PREFETCH);
3929
3930 /* only need first 3 components: */
3931 sam->dsts[0]->wrmask = 0x7;
3932 ir3_split_dest(b, dst, sam, 0, 3);
3933
3934 /* we need to sample the alpha separately with a non-SRGB
3935 * texture state:
3936 */
3937 sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
3938 col0, col1);
3939
3940 array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
3941
3942 /* fixup .w component: */
3943 ir3_split_dest(b, &dst[3], sam, 3, 1);
3944 } else {
3945 /* normal (non-workaround) case: */
3946 ir3_split_dest(b, dst, sam, 0, ncomp);
3947 }
3948
3949 /* GETLOD returns results in 4.8 fixed point */
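/* e.g. a raw getlod result of 0x180 (= 384) corresponds to an LOD of
 * 384 / 256 = 1.5 after the scaling below.
 */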
3950 if (opc == OPC_GETLOD) {
3951 bool half = tex->def.bit_size == 16;
3952 struct ir3_instruction *factor =
3953 half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16)
3954 : create_immed(b, fui(1.0 / 256));
3955
3956 for (i = 0; i < 2; i++) {
3957 dst[i] = ir3_MUL_F(
3958 b, ir3_COV(b, dst[i], TYPE_S32, half ? TYPE_F16 : TYPE_F32), 0,
3959 factor, 0);
3960 }
3961 }
3962
3963 ir3_put_def(ctx, &tex->def);
3964 }
3965
3966 static void
3967 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
3968 {
3969 struct ir3_builder *b = &ctx->build;
3970 struct ir3_instruction **dst, *sam;
3971 type_t dst_type = get_tex_dest_type(tex);
3972 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3973
3974 dst = ir3_get_def(ctx, &tex->def, 1);
3975
3976 sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
3977
3978 /* even though there is only one component, since it ends
3979 * up in .y/.z/.w rather than .x, we need a split_dest()
3980 */
3981 ir3_split_dest(b, dst, sam, idx, 1);
3982
3983 /* The # of levels comes from getinfo.z. We need to add 1 to it, since
3984 * the value in TEX_CONST_0 is zero-based.
3985 */
3986 if (ctx->compiler->levels_add_one)
3987 dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
3988
3989 ir3_put_def(ctx, &tex->def);
3990 }
3991
3992 static void
3993 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
3994 {
3995 struct ir3_builder *b = &ctx->build;
3996 struct ir3_instruction **dst, *sam;
3997 struct ir3_instruction *lod;
3998 unsigned flags, coords;
3999 type_t dst_type = get_tex_dest_type(tex);
4000 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
4001
4002 tex_info(tex, &flags, &coords);
4003 info.flags |= flags;
4004
4005 /* Actually we want the number of dimensions, not coordinates. This
4006 * distinction only matters for cubes.
4007 */
4008 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
4009 coords = 2;
4010
4011 dst = ir3_get_def(ctx, &tex->def, 4);
4012
4013 int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
4014 compile_assert(ctx, lod_idx >= 0);
4015
4016 lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
4017
4018 if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
4019 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
4020 } else {
4021 /*
4022 * The maximum value which OPC_GETSIZE could return for one dimension
4023 * is 0x007ff0, however sampler buffer could be much bigger.
4024 * Blob uses OPC_GETBUF for them.
4025 */
4026 sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
4027 }
4028
4029 ir3_split_dest(b, dst, sam, 0, 4);
4030
4031 /* Array size actually ends up in .w rather than .z. This doesn't
4032 * matter for miplevel 0, but for higher mips the value in z is
4033 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
4034 * returned, which means that we have to add 1 to it for arrays.
4035 */
4036 if (tex->is_array) {
4037 if (ctx->compiler->levels_add_one) {
4038 dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
4039 } else {
4040 dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
4041 }
4042 }
4043
4044 ir3_put_def(ctx, &tex->def);
4045 }
4046
4047 /* phi instructions are left partially constructed. We don't resolve
4048 * their srcs until the end of the shader, since (eg. loops) one of
4049 * the phi's srcs might be defined after the phi due to back edges in
4050 * the CFG.
4051 */
4052 static void
4053 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
4054 {
4055 struct ir3_instruction *phi, **dst;
4056
4057 unsigned num_components = nphi->def.num_components;
4058 dst = ir3_get_def(ctx, &nphi->def, num_components);
4059
4060 if (exec_list_is_singular(&nphi->srcs)) {
4061 nir_phi_src *src = list_entry(exec_list_get_head(&nphi->srcs),
4062 nir_phi_src, node);
4063 if (nphi->def.divergent == src->src.ssa->divergent) {
4064 struct ir3_instruction *const *srcs =
4065 ir3_get_src_maybe_shared(ctx, &src->src);
4066 memcpy(dst, srcs, num_components * sizeof(struct ir3_instruction *));
4067 ir3_put_def(ctx, &nphi->def);
4068 return;
4069 }
4070 }
4071
4072 for (unsigned i = 0; i < num_components; i++) {
4073 phi = ir3_build_instr(&ctx->build, OPC_META_PHI, 1,
4074 exec_list_length(&nphi->srcs));
4075 __ssa_dst(phi);
4076 phi->phi.nphi = nphi;
4077 phi->phi.comp = i;
4078
4079 if (ctx->compiler->has_scalar_alu && !nphi->def.divergent)
4080 phi->dsts[0]->flags |= IR3_REG_SHARED;
4081
4082 dst[i] = phi;
4083 }
4084
4085 ir3_put_def(ctx, &nphi->def);
4086 }
4087
4088 static struct ir3_block *get_block(struct ir3_context *ctx,
4089 const nir_block *nblock);
4090
4091 static struct ir3_instruction *
4092 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
4093 struct ir3_instruction *phi, nir_phi_instr *nphi)
4094 {
4095 if (!blk->nblock) {
4096 struct ir3_builder build = ir3_builder_at(ir3_before_terminator(blk));
4097 struct ir3_instruction *continue_phi =
4098 ir3_build_instr(&build, OPC_META_PHI, 1, blk->predecessors_count);
4099 __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
4100
4101 for (unsigned i = 0; i < blk->predecessors_count; i++) {
4102 struct ir3_instruction *src =
4103 read_phi_src(ctx, blk->predecessors[i], phi, nphi);
4104 if (src)
4105 __ssa_src(continue_phi, src, 0);
4106 else
4107 ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
4108 }
4109
4110 return continue_phi;
4111 }
4112
4113 nir_foreach_phi_src (nsrc, nphi) {
4114 if (blk->nblock == nsrc->pred) {
4115 if (nsrc->src.ssa->parent_instr->type == nir_instr_type_undef) {
4116 /* Create an ir3 undef */
4117 return NULL;
4118 } else {
4119 /* We need to insert the move at the end of the block */
4120 struct ir3_block *old_block = ctx->block;
4121 ir3_context_set_block(ctx, blk);
4122 struct ir3_instruction *src = ir3_get_src_shared(
4123 ctx, &nsrc->src,
4124 phi->dsts[0]->flags & IR3_REG_SHARED)[phi->phi.comp];
4125 ir3_context_set_block(ctx, old_block);
4126 return src;
4127 }
4128 }
4129 }
4130
4131 unreachable("couldn't find phi node ir3 block");
4132 return NULL;
4133 }
4134
4135 static void
4136 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
4137 {
4138 foreach_instr (phi, &block->instr_list) {
4139 if (phi->opc != OPC_META_PHI)
4140 break;
4141
4142 nir_phi_instr *nphi = phi->phi.nphi;
4143
4144 if (!nphi) /* skip continue phis created above */
4145 continue;
4146
4147 for (unsigned i = 0; i < block->predecessors_count; i++) {
4148 struct ir3_block *pred = block->predecessors[i];
4149 struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
4150 if (src) {
4151 __ssa_src(phi, src, 0);
4152 } else {
4153 /* Create an ir3 undef */
4154 ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
4155 }
4156 }
4157 }
4158 }
4159
4160 static void
4161 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
4162 {
4163 switch (jump->type) {
4164 case nir_jump_break:
4165 case nir_jump_continue:
4166 case nir_jump_return:
4167 /* I *think* we can simply just ignore this, and use the
4168 * successor block link to figure out where we need to
4169 * jump to for break/continue
4170 */
4171 break;
4172 default:
4173 ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
4174 break;
4175 }
4176 }
4177
4178 static void
4179 emit_instr(struct ir3_context *ctx, nir_instr *instr)
4180 {
4181 switch (instr->type) {
4182 case nir_instr_type_alu:
4183 emit_alu(ctx, nir_instr_as_alu(instr));
4184 break;
4185 case nir_instr_type_deref:
4186 /* ignored, handled as part of the intrinsic they are src to */
4187 break;
4188 case nir_instr_type_intrinsic:
4189 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
4190 break;
4191 case nir_instr_type_load_const:
4192 emit_load_const(ctx, nir_instr_as_load_const(instr));
4193 break;
4194 case nir_instr_type_undef:
4195 emit_undef(ctx, nir_instr_as_undef(instr));
4196 break;
4197 case nir_instr_type_tex: {
4198 nir_tex_instr *tex = nir_instr_as_tex(instr);
4199 /* a couple of tex instructions get special-cased:
4200 */
4201 switch (tex->op) {
4202 case nir_texop_txs:
4203 emit_tex_txs(ctx, tex);
4204 break;
4205 case nir_texop_query_levels:
4206 emit_tex_info(ctx, tex, 2);
4207 break;
4208 case nir_texop_texture_samples:
4209 emit_tex_info(ctx, tex, 3);
4210 break;
4211 default:
4212 emit_tex(ctx, tex);
4213 break;
4214 }
4215 break;
4216 }
4217 case nir_instr_type_jump:
4218 emit_jump(ctx, nir_instr_as_jump(instr));
4219 break;
4220 case nir_instr_type_phi:
4221 emit_phi(ctx, nir_instr_as_phi(instr));
4222 break;
4223 case nir_instr_type_call:
4224 case nir_instr_type_parallel_copy:
4225 case nir_instr_type_debug_info:
4226 ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
4227 instr->type);
4228 break;
4229 }
4230 }
4231
4232 static struct ir3_block *
4233 get_block(struct ir3_context *ctx, const nir_block *nblock)
4234 {
4235 struct ir3_block *block;
4236 struct hash_entry *hentry;
4237
4238 hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
4239 if (hentry)
4240 return hentry->data;
4241
4242 block = ir3_block_create(ctx->ir);
4243 block->nblock = nblock;
4244 _mesa_hash_table_insert(ctx->block_ht, nblock, block);
4245
4246 return block;
4247 }
4248
4249 static struct ir3_block *
4250 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
4251 {
4252 struct hash_entry *hentry;
4253
4254 hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
4255 if (hentry)
4256 return hentry->data;
4257
4258 return get_block(ctx, nblock);
4259 }
4260
4261 static struct ir3_block *
4262 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
4263 {
4264 struct ir3_block *block = ir3_block_create(ctx->ir);
4265 block->nblock = NULL;
4266 _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
4267 return block;
4268 }
4269
4270 static void
4271 emit_block(struct ir3_context *ctx, nir_block *nblock)
4272 {
4273 ir3_context_set_block(ctx, get_block(ctx, nblock));
4274
4275 list_addtail(&ctx->block->node, &ctx->ir->block_list);
4276
4277 ctx->block->loop_depth = ctx->loop_depth;
4278
4279 /* re-emit addr register in each block if needed: */
4280 for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
4281 _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
4282 ctx->addr0_ht[i] = NULL;
4283 }
4284
4285 _mesa_hash_table_u64_destroy(ctx->addr1_ht);
4286 ctx->addr1_ht = NULL;
4287
4288 nir_foreach_instr (instr, nblock) {
4289 ctx->cur_instr = instr;
4290 emit_instr(ctx, instr);
4291 ctx->cur_instr = NULL;
4292 if (ctx->error)
4293 return;
4294 }
4295
4296 for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
4297 if (nblock->successors[i]) {
4298 ctx->block->successors[i] =
4299 get_block_or_continue(ctx, nblock->successors[i]);
4300 }
4301 }
4302
4303 /* Emit unconditional branch if we only have one successor. Conditional
4304 * branches are emitted in emit_if.
4305 */
4306 if (ctx->block->successors[0] && !ctx->block->successors[1]) {
4307 if (!ir3_block_get_terminator(ctx->block))
4308 ir3_JUMP(&ctx->build);
4309 }
4310
4311 _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
4312 }
4313
4314 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
4315
4316 /* Get the ir3 branch condition for a given nir source. This will strip any inot
4317 * instructions and set *inv when the condition should be inverted. This
4318 * inversion can be directly folded into branches (in the inv1/inv2 fields)
4319 * instead of adding an explicit not.b/sub.u instruction.
4320 */
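/* For example, a NIR condition of inot(a) comes back as the predicate for a
 * itself with *inv = true, which callers fold into the branch (inv1/inv2, or
 * the choice of predt vs. predf) instead of emitting an explicit invert.
 */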
4321 static struct ir3_instruction *
4322 get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
4323 bool *inv)
4324 {
4325 struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
4326
4327 if (src->ssa->parent_instr->type == nir_instr_type_alu) {
4328 nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
4329
4330 if (nir_cond->op == nir_op_inot) {
4331 struct ir3_instruction *inv_cond = get_branch_condition(
4332 ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
4333 *inv = !*inv;
4334 return inv_cond;
4335 }
4336 }
4337
4338 *inv = false;
4339 return ir3_get_predicate(ctx, condition);
4340 }
4341
4342 /* Try to fold br (and/or cond1, cond2) into braa/brao cond1, cond2.
4343 */
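/* Rough example: a NIR "if (iand a, b)" whose result is only consumed by the
 * if becomes a single braa on the two predicates, with any inot on a or b
 * folded into the branch's inv1/inv2 bits rather than extra ALU instructions.
 */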
4344 static struct ir3_instruction *
4345 fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
4346 {
4347 if (!ctx->compiler->has_branch_and_or)
4348 return NULL;
4349
4350 if (nir_cond->ssa->parent_instr->type != nir_instr_type_alu)
4351 return NULL;
4352
4353 nir_alu_instr *alu_cond = nir_instr_as_alu(nir_cond->ssa->parent_instr);
4354
4355 if ((alu_cond->op != nir_op_iand) && (alu_cond->op != nir_op_ior))
4356 return NULL;
4357
4358 /* If the result of the and/or is also used for something other than an if
4359 * condition, the and/or cannot be removed. In that case, we would end up with
4360 * extra predicate conversions for the conditions without actually removing
4361 * any instructions, resulting in a net increase of instructions. Let's not
4362 * fold the conditions into the branch in that case.
4363 */
4364 if (!nir_def_only_used_by_if(&alu_cond->def))
4365 return NULL;
4366
4367 bool inv1, inv2;
4368 struct ir3_instruction *cond1 = get_branch_condition(
4369 ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
4370 struct ir3_instruction *cond2 = get_branch_condition(
4371 ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
4372
4373 struct ir3_instruction *branch;
4374 if (alu_cond->op == nir_op_iand) {
4375 branch = ir3_BRAA(&ctx->build, cond1, IR3_REG_PREDICATE, cond2,
4376 IR3_REG_PREDICATE);
4377 } else {
4378 branch = ir3_BRAO(&ctx->build, cond1, IR3_REG_PREDICATE, cond2,
4379 IR3_REG_PREDICATE);
4380 }
4381
4382 branch->cat0.inv1 = inv1;
4383 branch->cat0.inv2 = inv2;
4384 return branch;
4385 }
4386
4387 static bool
4388 instr_can_be_predicated(nir_instr *instr)
4389 {
4390 /* Anything that doesn't expand to control-flow can be predicated. */
4391 switch (instr->type) {
4392 case nir_instr_type_alu:
4393 case nir_instr_type_deref:
4394 case nir_instr_type_tex:
4395 case nir_instr_type_load_const:
4396 case nir_instr_type_undef:
4397 case nir_instr_type_phi:
4398 case nir_instr_type_parallel_copy:
4399 return true;
4400 case nir_instr_type_call:
4401 case nir_instr_type_jump:
4402 case nir_instr_type_debug_info:
4403 return false;
4404 case nir_instr_type_intrinsic: {
4405 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4406 switch (intrin->intrinsic) {
4407 case nir_intrinsic_reduce:
4408 case nir_intrinsic_inclusive_scan:
4409 case nir_intrinsic_exclusive_scan:
4410 case nir_intrinsic_reduce_clusters_ir3:
4411 case nir_intrinsic_inclusive_scan_clusters_ir3:
4412 case nir_intrinsic_exclusive_scan_clusters_ir3:
4413 case nir_intrinsic_brcst_active_ir3:
4414 case nir_intrinsic_ballot:
4415 case nir_intrinsic_elect:
4416 case nir_intrinsic_elect_any_ir3:
4417 case nir_intrinsic_read_invocation_cond_ir3:
4418 case nir_intrinsic_demote:
4419 case nir_intrinsic_demote_if:
4420 case nir_intrinsic_terminate:
4421 case nir_intrinsic_terminate_if:
4422 return false;
4423 default:
4424 return true;
4425 }
4426 }
4427 }
4428
4429 unreachable("Checked all cases");
4430 }
4431
4432 static bool
4433 nif_can_be_predicated(nir_if *nif)
4434 {
4435 /* For non-divergent branches, predication is more expensive than a branch
4436 * because the latter can potentially skip all instructions.
4437 */
4438 if (!nir_src_is_divergent(&nif->condition))
4439 return false;
4440
4441 /* Although it might be possible to allow a limited form of
4442 * nested predication (e.g., by resetting the predication mask after a nested
4443 * branch), let's avoid this for now and only use predication for leaf
4444 * branches. That is, for ifs that contain exactly one block in both branches
4445 * (note that they always contain at least one block).
4446 */
4447 if (!exec_list_is_singular(&nif->then_list) ||
4448 !exec_list_is_singular(&nif->else_list)) {
4449 return false;
4450 }
4451
4452 nir_foreach_instr (instr, nir_if_first_then_block(nif)) {
4453 if (!instr_can_be_predicated(instr))
4454 return false;
4455 }
4456
4457 nir_foreach_instr (instr, nir_if_first_else_block(nif)) {
4458 if (!instr_can_be_predicated(instr))
4459 return false;
4460 }
4461
4462 return true;
4463 }
4464
4465 /* A typical if-else block like this:
4466 * if (cond) {
4467 * tblock;
4468 * } else {
4469 * fblock;
4470 * }
4471 * Will be emitted as:
4472 * |-- i --|
4473 * | ... |
4474 * | predt |
4475 * |-------|
4476 * succ0 / \ succ1
4477 * |-- i+1 --| |-- i+2 --|
4478 * | tblock | | fblock |
4479 * | predf | | jump |
4480 * |---------| |---------|
4481 * succ0 \ / succ0
4482 * |-- j --|
4483 * | ... |
4484 * |-------|
4485 * Where the numbers at the top of blocks are their indices. That is, the true
4486 * block and false block are laid-out contiguously after the current block. This
4487 * layout is verified during legalization in prede_sched which also inserts the
4488 * final prede instruction. Note that we don't insert prede right away to allow
4489 * opt_jump to optimize the jump in the false block.
4490 */
4491 static struct ir3_instruction *
4492 emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
4493 {
4494 if (!ctx->compiler->has_predication)
4495 return NULL;
4496 if (!nif_can_be_predicated(nif))
4497 return NULL;
4498
4499 struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif));
4500 struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif));
4501 assert(list_is_empty(&then_block->instr_list) &&
4502 list_is_empty(&else_block->instr_list));
4503
4504 bool inv;
4505 struct ir3_instruction *condition =
4506 get_branch_condition(ctx, &nif->condition, 0, &inv);
4507 struct ir3_builder then_build = ir3_builder_at(ir3_after_block(then_block));
4508 struct ir3_instruction *pred, *pred_inv;
4509
4510 if (!inv) {
4511 pred = ir3_PREDT(&ctx->build, condition, IR3_REG_PREDICATE);
4512 pred_inv = ir3_PREDF(&then_build, condition, IR3_REG_PREDICATE);
4513 } else {
4514 pred = ir3_PREDF(&ctx->build, condition, IR3_REG_PREDICATE);
4515 pred_inv = ir3_PREDT(&then_build, condition, IR3_REG_PREDICATE);
4516 }
4517
4518 pred->srcs[0]->num = REG_P0_X;
4519 pred_inv->srcs[0]->num = REG_P0_X;
4520 return pred;
4521 }
4522
4523 static struct ir3_instruction *
4524 emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
4525 {
4526 nir_src *nir_cond = &nif->condition;
4527 struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
4528 if (folded)
4529 return folded;
4530
4531 struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif);
4532 if (predicated)
4533 return predicated;
4534
4535 bool inv1;
4536 struct ir3_instruction *cond1 =
4537 get_branch_condition(ctx, nir_cond, 0, &inv1);
4538 struct ir3_instruction *branch =
4539 ir3_BR(&ctx->build, cond1, IR3_REG_PREDICATE);
4540 branch->cat0.inv1 = inv1;
4541 return branch;
4542 }
4543
4544 static void
4545 emit_if(struct ir3_context *ctx, nir_if *nif)
4546 {
4547 struct ir3_instruction *condition = ir3_get_src_maybe_shared(ctx, &nif->condition)[0];
4548
4549 if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
4550 struct ir3_instruction *pred = ssa(condition->srcs[0]);
4551 ir3_BANY(&ctx->build, pred, IR3_REG_PREDICATE);
4552 } else if (condition->opc == OPC_ALL_MACRO &&
4553 condition->block == ctx->block) {
4554 struct ir3_instruction *pred = ssa(condition->srcs[0]);
4555 ir3_BALL(&ctx->build, pred, IR3_REG_PREDICATE);
4556 } else if (condition->opc == OPC_ELECT_MACRO &&
4557 condition->block == ctx->block) {
4558 struct ir3_instruction *branch = ir3_GETONE(&ctx->build);
4559 branch->flags |= condition->flags & IR3_INSTR_NEEDS_HELPERS;
4560 } else if (condition->opc == OPC_SHPS_MACRO &&
4561 condition->block == ctx->block) {
4562 /* TODO: technically this only works if the block is the only user of the
4563 * shps, but we only use it in very constrained scenarios so this should
4564 * be ok.
4565 */
4566 ir3_SHPS(&ctx->build);
4567 } else {
4568 emit_conditional_branch(ctx, nif);
4569 }
4570
4571 ctx->block->divergent_condition = nir_src_is_divergent(&nif->condition);
4572
4573 emit_cf_list(ctx, &nif->then_list);
4574 emit_cf_list(ctx, &nif->else_list);
4575 }
4576
4577 static bool
4578 has_nontrivial_continue(nir_loop *nloop)
4579 {
4580 struct nir_block *nstart = nir_loop_first_block(nloop);
4581
4582 /* There's always one incoming edge from outside the loop, and if there
4583 * is more than one backedge from inside the loop (so more than 2 total
4584 * edges) then one must be a nontrivial continue.
4585 */
4586 if (nstart->predecessors->entries > 2)
4587 return true;
4588
4589 /* Check whether the one backedge is a nontrivial continue. This can happen
4590 * if the loop ends with a break.
4591 */
4592 set_foreach (nstart->predecessors, entry) {
4593 nir_block *pred = (nir_block*)entry->key;
4594 if (pred == nir_loop_last_block(nloop) ||
4595 pred == nir_cf_node_as_block(nir_cf_node_prev(&nloop->cf_node)))
4596 continue;
4597 return true;
4598 }
4599
4600 return false;
4601 }
4602
4603 static void
4604 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
4605 {
4606 assert(!nir_loop_has_continue_construct(nloop));
4607 ctx->loop_depth++;
4608
4609 struct nir_block *nstart = nir_loop_first_block(nloop);
4610 struct ir3_block *continue_blk = NULL;
4611
4612 /* If the loop has a continue statement that isn't at the end, then we need to
4613 * create a continue block in order to let control flow reconverge before
4614 * entering the next iteration of the loop.
4615 */
4616 if (has_nontrivial_continue(nloop)) {
4617 continue_blk = create_continue_block(ctx, nstart);
4618 }
4619
4620 emit_cf_list(ctx, &nloop->body);
4621
4622 if (continue_blk) {
4623 struct ir3_block *start = get_block(ctx, nstart);
4624 struct ir3_builder build = ir3_builder_at(ir3_after_block(continue_blk));
4625 ir3_JUMP(&build);
4626 continue_blk->successors[0] = start;
4627 continue_blk->loop_depth = ctx->loop_depth;
4628 list_addtail(&continue_blk->node, &ctx->ir->block_list);
4629 }
4630
4631 ctx->so->loops++;
4632 ctx->loop_depth--;
4633 }
4634
4635 static void
4636 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
4637 {
4638 foreach_list_typed (nir_cf_node, node, node, list) {
4639 switch (node->type) {
4640 case nir_cf_node_block:
4641 emit_block(ctx, nir_cf_node_as_block(node));
4642 break;
4643 case nir_cf_node_if:
4644 emit_if(ctx, nir_cf_node_as_if(node));
4645 break;
4646 case nir_cf_node_loop:
4647 emit_loop(ctx, nir_cf_node_as_loop(node));
4648 break;
4649 case nir_cf_node_function:
4650 ir3_context_error(ctx, "TODO\n");
4651 break;
4652 }
4653 }
4654 }
4655
4656 /* emit stream-out code. At this point, the current block is the original
4657 * (nir) end block, and nir ensures that all flow control paths terminate
4658 * into the end block. We re-purpose the original end block to generate
4659 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
4660 * block holding stream-out write instructions, followed by the new end
4661 * block:
4662 *
4663 * blockOrigEnd {
4664 * p0.x = (vtxcnt < maxvtxcnt)
4665 * // succs: blockStreamOut, blockNewEnd
4666 * }
4667 * blockStreamOut {
4668 * // preds: blockOrigEnd
4669 * ... stream-out instructions ...
4670 * // succs: blockNewEnd
4671 * }
4672 * blockNewEnd {
4673 * // preds: blockOrigEnd, blockStreamOut
4674 * }
4675 */
4676 static void
4677 emit_stream_out(struct ir3_context *ctx)
4678 {
4679 struct ir3 *ir = ctx->ir;
4680 struct ir3_stream_output_info *strmout = &ctx->so->stream_output;
4681 struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
4682 struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
4683 struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
4684
4685 /* create vtxcnt input in input block at top of shader,
4686 * so that it is seen as live over the entire duration
4687 * of the shader:
4688 */
4689 vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
4690 maxvtxcnt = create_driver_param(ctx, IR3_DP_VS(vtxcnt_max));
4691
4692 /* at this point, we are at the original 'end' block,
4693 * re-purpose this block to stream-out condition, then
4694 * append stream-out block and new-end block
4695 */
4696 orig_end_block = ctx->block;
4697
4698 // maybe w/ store_global intrinsic, we could do this
4699 // stuff in nir->nir pass
4700
4701 stream_out_block = ir3_block_create(ir);
4702 list_addtail(&stream_out_block->node, &ir->block_list);
4703
4704 new_end_block = ir3_block_create(ir);
4705 list_addtail(&new_end_block->node, &ir->block_list);
4706
4707 orig_end_block->successors[0] = stream_out_block;
4708 orig_end_block->successors[1] = new_end_block;
4709
4710 stream_out_block->successors[0] = new_end_block;
4711
4712 /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
4713 cond = ir3_CMPS_S(&ctx->build, vtxcnt, 0, maxvtxcnt, 0);
4714 cond->dsts[0]->flags |= IR3_REG_PREDICATE;
4715 cond->cat2.condition = IR3_COND_LT;
4716
4717 /* condition goes on previous block to the conditional,
4718 * since it is used to pick which of the two successor
4719 * paths to take:
4720 */
4721 ir3_BR(&ctx->build, cond, IR3_REG_PREDICATE);
4722
4723 /* switch to stream_out_block to generate the stream-out
4724 * instructions:
4725 */
4726 ir3_context_set_block(ctx, stream_out_block);
4727
4728 /* Calculate base addresses based on vtxcnt. Instructions
4729 * generated for bases not used in following loop will be
4730 * stripped out in the backend.
4731 */
4732 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
4733 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
4734 unsigned stride = strmout->stride[i];
4735 struct ir3_instruction *base, *off;
4736
4737 base = create_uniform(
4738 &ctx->build,
4739 ir3_const_reg(const_state, IR3_CONST_ALLOC_TFBO, i));
4740
4741 /* 24-bit should be enough: */
4742 off = ir3_MUL_U24(&ctx->build, vtxcnt, 0,
4743 create_immed(&ctx->build, stride * 4), 0);
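/* The stride here appears to be in dwords (hence the * 4): e.g. a stride of
 * 4 gives 16 bytes per vertex, so vertex 100 starts 1600 bytes past the
 * buffer base added below.
 */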
4744
4745 bases[i] = ir3_ADD_S(&ctx->build, off, 0, base, 0);
4746 }
4747
4748 /* Generate the per-output store instructions: */
4749 for (unsigned i = 0; i < strmout->num_outputs; i++) {
4750 for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
4751 unsigned c = j + strmout->output[i].start_component;
4752 struct ir3_instruction *base, *out, *stg;
4753
4754 base = bases[strmout->output[i].output_buffer];
4755 out = ctx->outputs[regid(strmout->output[i].register_index, c)];
4756
4757 stg = ir3_STG(
4758 &ctx->build, base, 0,
4759 create_immed(&ctx->build, (strmout->output[i].dst_offset + j) * 4),
4760 0, out, 0, create_immed(&ctx->build, 1), 0);
4761 stg->cat6.type = TYPE_U32;
4762
4763 array_insert(ctx->block, ctx->block->keeps, stg);
4764 }
4765 }
4766
4767 ir3_JUMP(&ctx->build);
4768
4769 /* and finally switch to the new_end_block: */
4770 ir3_context_set_block(ctx, new_end_block);
4771 }
4772
4773 static void
4774 setup_predecessors(struct ir3 *ir)
4775 {
4776 foreach_block (block, &ir->block_list) {
4777 for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
4778 if (block->successors[i])
4779 ir3_block_add_predecessor(block->successors[i], block);
4780 }
4781 }
4782 }
4783
4784 static void
4785 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
4786 {
4787 nir_metadata_require(impl, nir_metadata_block_index);
4788
4789 emit_cf_list(ctx, &impl->body);
4790 emit_block(ctx, impl->end_block);
4791
4792 /* at this point, we should have a single empty block,
4793 * into which we emit the 'end' instruction.
4794 */
4795 compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
4796
4797 /* If stream-out (aka transform-feedback) enabled, emit the
4798 * stream-out instructions, followed by a new empty block (into
4799 * which the 'end' instruction lands).
4800 *
4801 * NOTE: it is done in this order, rather than inserting before
4802 * we emit end_block, because NIR guarantees that all blocks
4803 * flow into end_block, and that end_block has no successors.
4804 * So by re-purposing end_block as the first block of stream-
4805 * out, we guarantee that all exit paths flow into the stream-
4806 * out instructions.
4807 */
4808 if ((ctx->compiler->gen < 5) &&
4809 (ctx->so->stream_output.num_outputs > 0) &&
4810 !ctx->so->binning_pass) {
4811 assert(ctx->so->type == MESA_SHADER_VERTEX);
4812 emit_stream_out(ctx);
4813 }
4814
4815 setup_predecessors(ctx->ir);
4816 foreach_block (block, &ctx->ir->block_list) {
4817 resolve_phis(ctx, block);
4818 }
4819 }
4820
4821 static void
4822 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4823 {
4824 struct ir3_shader_variant *so = ctx->so;
4825 struct ir3_instruction *coord = NULL;
4826
4827 if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
4828 coord =
4829 ir3_create_collect(&ctx->build, ir3_get_src(ctx, &intr->src[0]), 2);
4830
4831 compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
4832
4833 unsigned frac = nir_intrinsic_component(intr);
4834 unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
4835 unsigned ncomp = nir_intrinsic_dest_components(intr);
4836 unsigned n = nir_intrinsic_base(intr) + offset;
4837 unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
4838 unsigned compmask = BITFIELD_MASK(ncomp + frac);
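/* e.g. a vec2 load at component 1 yields compmask 0b111: the mask always
 * starts at .x, so it also covers the frac leading components.
 */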
4839
4840 /* For other stages, inputs are loaded using ldlw or ldg. */
4841 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
4842 ctx->so->type == MESA_SHADER_VERTEX);
4843
4844 /* for clip+cull distances, unused components can't be eliminated because
4845 * they're read by fixed-function, even if there's a hole. Note that
4846 * clip/cull distance arrays must be declared in the FS, so we can just
4847 * use the NIR clip/cull distances to avoid reading ucp_enables in the
4848 * shader key.
4849 */
4850 if (ctx->so->type == MESA_SHADER_FRAGMENT &&
4851 (slot == VARYING_SLOT_CLIP_DIST0 ||
4852 slot == VARYING_SLOT_CLIP_DIST1)) {
4853 unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4854
4855 if (slot == VARYING_SLOT_CLIP_DIST0)
4856 compmask = clip_cull_mask & 0xf;
4857 else
4858 compmask = clip_cull_mask >> 4;
4859 }
4860
4861 /* for a4xx+ rasterflat */
4862 if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
4863 coord = NULL;
4864
4865 so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
4866
4867 so->inputs[n].slot = slot;
4868 so->inputs[n].compmask |= compmask;
4869 so->inputs_count = MAX2(so->inputs_count, n + 1);
4870 compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
4871 so->inputs[n].flat = !coord;
4872
4873 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4874 compile_assert(ctx, slot != VARYING_SLOT_POS);
4875
4876 so->inputs[n].bary = true;
4877 unsigned idx = (n * 4) + frac;
4878 struct ir3_instruction_rpt instr =
4879 create_frag_input(ctx, coord, idx, ncomp);
4880 cp_instrs(ctx->last_dst, instr.rpts, ncomp);
4881
4882 if (slot == VARYING_SLOT_PRIMITIVE_ID)
4883 so->reads_primid = true;
4884
4885 so->inputs[n].inloc = 4 * n;
4886 so->varying_in = MAX2(so->varying_in, 4 * n + 4);
4887 } else {
4888 struct ir3_instruction *input = NULL;
4889
4890 foreach_input (in, ctx->ir) {
4891 if (in->input.inidx == n) {
4892 input = in;
4893 break;
4894 }
4895 }
4896
4897 if (!input) {
4898 input = create_input(ctx, compmask);
4899 input->input.inidx = n;
4900 } else {
4901 /* For aliased inputs, just append to the wrmask.. ie. if we
4902 * first see a vec2 index at slot N, and then later a vec4,
4903 * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
4904 */
4905 input->dsts[0]->wrmask |= compmask;
4906 }
4907
4908 for (int i = 0; i < ncomp + frac; i++) {
4909 unsigned idx = (n * 4) + i;
4910 compile_assert(ctx, idx < ctx->ninputs);
4911
4912 /* fixup the src wrmask to avoid validation fail */
4913 if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
4914 ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
4915 continue;
4916 }
4917
4918 ir3_split_dest(&ctx->build, &ctx->inputs[idx], input, i, 1);
4919 }
4920
4921 for (int i = 0; i < ncomp; i++) {
4922 unsigned idx = (n * 4) + i + frac;
4923 ctx->last_dst[i] = ctx->inputs[idx];
4924 }
4925 }
4926 }
4927
4928 /* Initially we assign non-packed inloc's for varyings, as we don't really
4929 * know up-front which components will be unused. After all the compilation
4930 * stages we scan the shader to see which components are actually used, and
4931 * re-pack the inlocs to eliminate unneeded varyings.
4932 */
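/* A small worked example with hypothetical masks: if the scan below finds
 * used_components[0] = 0b0101 (.x/.z) and used_components[1] = 0b0010 (.y),
 * input 0 is repacked with compmask 0b111 at inloc 0 and input 1 with
 * compmask 0b11 at inloc 3, since each input keeps all components up to its
 * highest used one.
 */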
4933 static void
4934 pack_inlocs(struct ir3_context *ctx)
4935 {
4936 struct ir3_shader_variant *so = ctx->so;
4937 uint8_t used_components[so->inputs_count];
4938
4939 memset(used_components, 0, sizeof(used_components));
4940
4941 /*
4942 * First Step: scan shader to find which bary.f/ldlv remain:
4943 */
4944
4945 foreach_block (block, &ctx->ir->block_list) {
4946 foreach_instr (instr, &block->instr_list) {
4947 if (is_input(instr)) {
4948 unsigned inloc = instr->srcs[0]->iim_val;
4949 unsigned i = inloc / 4;
4950 unsigned j = inloc % 4;
4951
4952 compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
4953 compile_assert(ctx, i < so->inputs_count);
4954
4955 used_components[i] |= 1 << j;
4956 } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4957 for (int n = 0; n < 2; n++) {
4958 unsigned inloc = instr->prefetch.input_offset + n;
4959 unsigned i = inloc / 4;
4960 unsigned j = inloc % 4;
4961
4962 compile_assert(ctx, i < so->inputs_count);
4963
4964 used_components[i] |= 1 << j;
4965 }
4966 }
4967 }
4968 }
4969
4970 /*
4971 * Second Step: reassign varying inloc/slots:
4972 */
4973
4974 unsigned inloc = 0;
4975
4976 /* for clip+cull distances, unused components can't be eliminated because
4977 * they're read by fixed-function, even if there's a hole. Note that
4978 * clip/cull distance arrays must be declared in the FS, so we can just
4979 * use the NIR clip/cull distances to avoid reading ucp_enables in the
4980 * shader key.
4981 */
4982 unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4983
4984 so->varying_in = 0;
4985
4986 for (unsigned i = 0; i < so->inputs_count; i++) {
4987 unsigned compmask = 0, maxcomp = 0;
4988
4989 so->inputs[i].inloc = inloc;
4990 so->inputs[i].bary = false;
4991
4992 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
4993 so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
4994 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
4995 compmask = clip_cull_mask & 0xf;
4996 else
4997 compmask = clip_cull_mask >> 4;
4998 used_components[i] = compmask;
4999 }
5000
5001 for (unsigned j = 0; j < 4; j++) {
5002 if (!(used_components[i] & (1 << j)))
5003 continue;
5004
5005 compmask |= (1 << j);
5006 maxcomp = j + 1;
5007
5008 /* at this point, since the used_components[i] mask only
5009 * considers varyings (ie. not sysvals), we know this
5010 * is a varying:
5011 */
5012 so->inputs[i].bary = true;
5013 }
5014
5015 if (so->inputs[i].bary) {
5016 so->varying_in++;
5017 so->inputs[i].compmask = (1 << maxcomp) - 1;
5018 inloc += maxcomp;
5019 }
5020 }
5021
5022 /*
5023 * Third Step: reassign packed inloc's:
5024 */
5025
5026 foreach_block (block, &ctx->ir->block_list) {
5027 foreach_instr (instr, &block->instr_list) {
5028 if (is_input(instr)) {
5029 unsigned inloc = instr->srcs[0]->iim_val;
5030 unsigned i = inloc / 4;
5031 unsigned j = inloc % 4;
5032
5033 instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
5034 if (instr->opc == OPC_FLAT_B)
5035 instr->srcs[1]->iim_val = instr->srcs[0]->iim_val;
5036 } else if (instr->opc == OPC_META_TEX_PREFETCH) {
5037 unsigned i = instr->prefetch.input_offset / 4;
5038 unsigned j = instr->prefetch.input_offset % 4;
5039 instr->prefetch.input_offset = so->inputs[i].inloc + j;
5040 }
5041 }
5042 }
5043 }
5044
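/* Handle a store_output (or store_per_view_output) intrinsic: record the
* output slot info in the shader variant's outputs table and stash the
* stored values in ctx->outputs, to be collected into the shader's
* end/chmask instruction later.
*/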
5045 static void
5046 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
5047 {
5048 struct ir3_shader_variant *so = ctx->so;
5049 nir_io_semantics io = nir_intrinsic_io_semantics(intr);
5050
5051 nir_src offset_src = *nir_get_io_offset_src(intr);
5052 compile_assert(ctx, nir_src_is_const(offset_src));
5053
5054 unsigned offset = nir_src_as_uint(offset_src);
5055 unsigned frac = nir_intrinsic_component(intr);
5056 unsigned ncomp = nir_intrinsic_src_components(intr, 0);
5057 unsigned slot = io.location + offset;
5058
5059 /* For per-view variables, each user-facing slot corresponds to multiple
5060 * views, each with a corresponding driver_location, and the view index
5061 * offsets the driver_location. */
5062 unsigned view_index = intr->intrinsic == nir_intrinsic_store_per_view_output
5063 ? nir_src_as_uint(intr->src[1])
5064 : 0;
5065 unsigned n = nir_intrinsic_base(intr) + offset + view_index;
5066
5067 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5068 switch (slot) {
5069 case FRAG_RESULT_DEPTH:
5070 so->writes_pos = true;
5071 break;
5072 case FRAG_RESULT_COLOR:
5073 if (!ctx->s->info.fs.color_is_dual_source) {
5074 so->color0_mrt = 1;
5075 } else {
5076 slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
5077 if (io.dual_source_blend_index > 0)
5078 so->dual_src_blend = true;
5079 }
5080 break;
5081 case FRAG_RESULT_SAMPLE_MASK:
5082 so->writes_smask = true;
5083 break;
5084 case FRAG_RESULT_STENCIL:
5085 so->writes_stencilref = true;
5086 break;
5087 default:
5088 slot += io.dual_source_blend_index; /* For dual-src blend */
5089 if (io.dual_source_blend_index > 0)
5090 so->dual_src_blend = true;
5091 if (slot >= FRAG_RESULT_DATA0)
5092 break;
5093 ir3_context_error(ctx, "unknown FS output name: %s\n",
5094 gl_frag_result_name(slot));
5095 }
5096 } else if (ctx->so->type == MESA_SHADER_VERTEX ||
5097 ctx->so->type == MESA_SHADER_TESS_EVAL ||
5098 ctx->so->type == MESA_SHADER_GEOMETRY) {
5099 switch (slot) {
5100 case VARYING_SLOT_POS:
5101 so->writes_pos = true;
5102 break;
5103 case VARYING_SLOT_PSIZ:
5104 so->writes_psize = true;
5105 break;
5106 case VARYING_SLOT_VIEWPORT:
5107 so->writes_viewport = true;
5108 break;
5109 case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
5110 so->writes_shading_rate = true;
5111 break;
5112 case VARYING_SLOT_PRIMITIVE_ID:
5113 case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
5114 assert(ctx->so->type == MESA_SHADER_GEOMETRY);
5115 FALLTHROUGH;
5116 case VARYING_SLOT_COL0:
5117 case VARYING_SLOT_COL1:
5118 case VARYING_SLOT_BFC0:
5119 case VARYING_SLOT_BFC1:
5120 case VARYING_SLOT_FOGC:
5121 case VARYING_SLOT_CLIP_DIST0:
5122 case VARYING_SLOT_CLIP_DIST1:
5123 case VARYING_SLOT_CLIP_VERTEX:
5124 case VARYING_SLOT_LAYER:
5125 break;
5126 default:
5127 if (slot >= VARYING_SLOT_VAR0)
5128 break;
5129 if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
5130 break;
5131 ir3_context_error(ctx, "unknown %s shader output name: %s\n",
5132 _mesa_shader_stage_to_string(ctx->so->type),
5133 gl_varying_slot_name_for_stage(slot, ctx->so->type));
5134 }
5135 } else {
5136 ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
5137 }
5138
5139 so->outputs_count = MAX2(so->outputs_count, n + 1);
5140 compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs));
5141
5142 so->outputs[n].slot = slot;
5143 if (view_index > 0)
5144 so->multi_pos_output = true;
5145 so->outputs[n].view = view_index;
5146
5147 for (int i = 0; i < ncomp; i++) {
5148 unsigned idx = (n * 4) + i + frac;
5149 compile_assert(ctx, idx < ctx->noutputs);
5150 ctx->outputs[idx] = create_immed(&ctx->build, fui(0.0));
5151 }
5152
5153 /* if varying packing doesn't happen, we could end up in a situation
5154 * with "holes" in the output, and since the per-generation code that
5155 * sets up varying linkage registers doesn't expect to have more than
5156 * one varying per vec4 slot, pad the holes.
5157 *
5158 * Note that this should probably generate a performance warning of
5159 * some sort.
5160 */
5161 for (int i = 0; i < frac; i++) {
5162 unsigned idx = (n * 4) + i;
5163 if (!ctx->outputs[idx]) {
5164 ctx->outputs[idx] = create_immed(&ctx->build, fui(0.0));
5165 }
5166 }
5167
5168 struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
5169 for (int i = 0; i < ncomp; i++) {
5170 unsigned idx = (n * 4) + i + frac;
5171 ctx->outputs[idx] = src[i];
5172 }
5173 }
5174
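/* Whether this stage reads its inputs via load_input intrinsics (only VS
* and FS do; the other stages have their input loads lowered differently).
*/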
5175 static bool
5176 uses_load_input(struct ir3_shader_variant *so)
5177 {
5178 return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
5179 }
5180
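/* Whether this stage writes its outputs via store_output intrinsics.  VS/DS
* feeding tessellation or geometry stages, and TCS, instead have their
* outputs lowered to memory writes; compute stages have no outputs.
*/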
5181 static bool
5182 uses_store_output(struct ir3_shader_variant *so)
5183 {
5184 switch (so->type) {
5185 case MESA_SHADER_VERTEX:
5186 return !so->key.has_gs && !so->key.tessellation;
5187 case MESA_SHADER_TESS_EVAL:
5188 return !so->key.has_gs;
5189 case MESA_SHADER_GEOMETRY:
5190 case MESA_SHADER_FRAGMENT:
5191 return true;
5192 case MESA_SHADER_TESS_CTRL:
5193 case MESA_SHADER_COMPUTE:
5194 case MESA_SHADER_KERNEL:
5195 return false;
5196 default:
5197 unreachable("unknown stage");
5198 }
5199 }
5200
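/* Top-level NIR -> ir3 emission: set up the variant's input/output tables,
* create the sysval inputs needed by this stage, and emit the body of the
* shader's entrypoint function.
*/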
5201 static void
5202 emit_instructions(struct ir3_context *ctx)
5203 {
5204 MESA_TRACE_FUNC();
5205
5206 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
5207
5208 /* some varying setup which can't be done in setup_input(): */
5209 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5210 nir_foreach_shader_in_variable (var, ctx->s) {
5211 /* set rasterflat flag for front/back color */
5212 if (var->data.interpolation == INTERP_MODE_NONE) {
5213 switch (var->data.location) {
5214 case VARYING_SLOT_COL0:
5215 case VARYING_SLOT_COL1:
5216 case VARYING_SLOT_BFC0:
5217 case VARYING_SLOT_BFC1:
5218 ctx->so->inputs[var->data.driver_location].rasterflat = true;
5219 break;
5220 default:
5221 break;
5222 }
5223 }
5224 }
5225 }
5226
5227 if (uses_load_input(ctx->so)) {
5228 ctx->so->inputs_count = ctx->s->num_inputs;
5229 compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
5230 ctx->ninputs = ctx->s->num_inputs * 4;
5231 ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
5232 } else {
5233 ctx->ninputs = 0;
5234 ctx->so->inputs_count = 0;
5235 }
5236
5237 if (uses_store_output(ctx->so)) {
5238 ctx->noutputs = ctx->s->num_outputs * 4;
5239 ctx->outputs =
5240 rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
5241 } else {
5242 ctx->noutputs = 0;
5243 }
5244
5245 ctx->ir = ir3_create(ctx->compiler, ctx->so);
5246
5247 /* Create inputs in first block: */
5248 ir3_context_set_block(ctx, get_block(ctx, nir_start_block(fxn)));
5249 ctx->in_block = ctx->block;
5250
5251 /* for the fragment shader, the vcoord input register is used as the
5252 * base for bary.f varying fetch instrs:
5253 *
5254 * TODO defer creating ctx->ij_pixel and corresponding sysvals
5255 * until emit_intrinsic when we know they are actually needed.
5256 * For now, we defer creating ctx->ij_centroid, etc, since we
5257 * only need ij_pixel for "old style" varying inputs (ie.
5258 * tgsi_to_nir)
5259 */
5260 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5261 ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
5262 }
5263
5264 /* Defer add_sysval_input() stuff until after setup_inputs(),
5265 * because sysvals need to be appended after varyings:
5266 */
5267 if (ctx->ij[IJ_PERSP_PIXEL]) {
5268 add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
5269 ctx->ij[IJ_PERSP_PIXEL]);
5270 }
5271
5272 /* Tessellation shaders always need the primitive ID for indexing the
5273 * BO. Geometry shaders don't always need it, but when they do it has to
5274 * be delivered and kept unclobbered in the VS. To make things easy, we always
5275 * make room for it in VS/DS.
5276 */
5277 bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
5278 bool has_gs = ctx->so->key.has_gs;
5279 switch (ctx->so->type) {
5280 case MESA_SHADER_VERTEX:
5281 if (has_tess) {
5282 ctx->tcs_header =
5283 create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5284 ctx->rel_patch_id =
5285 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5286 ctx->primitive_id =
5287 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5288 } else if (has_gs) {
5289 ctx->gs_header =
5290 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5291 ctx->primitive_id =
5292 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5293 }
5294 break;
5295 case MESA_SHADER_TESS_CTRL:
5296 ctx->tcs_header =
5297 create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5298 ctx->rel_patch_id =
5299 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5300 break;
5301 case MESA_SHADER_TESS_EVAL:
5302 if (has_gs) {
5303 ctx->gs_header =
5304 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5305 ctx->primitive_id =
5306 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5307 }
5308 ctx->rel_patch_id =
5309 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5310 break;
5311 case MESA_SHADER_GEOMETRY:
5312 ctx->gs_header =
5313 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5314 break;
5315 default:
5316 break;
5317 }
5318
5319 /* Find # of samplers. Just assume that we'll be reading from images.. if
5320 * it is write-only we don't have to count it, but after lowering derefs
5321 * it is too late to compact indices for that.
5322 */
5323 ctx->so->num_samp =
5324 BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
5325
5326 /* Save off clip+cull information. Note that in OpenGL clip planes may
5327 * be individually enabled/disabled, and some gens handle the lowering in
5328 * the backend, so we also need to consider the shader key:
5329 */
5330 ctx->so->clip_mask = ctx->so->key.ucp_enables |
5331 MASK(ctx->s->info.clip_distance_array_size);
5332 ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
5333 << ctx->s->info.clip_distance_array_size;
5334
5335 ctx->so->pvtmem_size = ctx->s->scratch_size;
5336 ctx->so->shared_size = ctx->s->info.shared_size;
5337
5338 /* NOTE: need to do something more clever when we support >1 fxn */
5339 nir_foreach_reg_decl (decl, fxn) {
5340 ir3_declare_array(ctx, decl);
5341 }
5342
5343 /* And emit the body: */
5344 ctx->impl = fxn;
5345 emit_function(ctx, fxn);
5346
5347 if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
5348 ctx->compiler->tess_use_shared) {
5349 /* Anything before shpe seems to be ignored in the main shader when early
5350 * preamble is enabled on a7xx, so we have to put the barrier after.
5351 */
5352 struct ir3_block *block = ir3_after_preamble(ctx->ir);
5353 struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
5354
5355 struct ir3_instruction *barrier = ir3_BAR(&build);
5356 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
5357 barrier->barrier_class = IR3_BARRIER_EVERYTHING;
5358 array_insert(block, block->keeps, barrier);
5359 ctx->so->has_barrier = true;
5360
5361 /* Move the barrier to the beginning of the block but after any phi/input
5362 * meta instructions that must be at the beginning. It must be before we
5363 * load VS outputs.
5364 */
5365 foreach_instr (instr, &block->instr_list) {
5366 if (instr->opc != OPC_META_INPUT &&
5367 instr->opc != OPC_META_TEX_PREFETCH &&
5368 instr->opc != OPC_META_PHI) {
5369 ir3_instr_move_before(barrier, instr);
5370 break;
5371 }
5372 }
5373 }
5374 }
5375
5376 /* Fixup tex sampler state for astc/srgb workaround instructions. We
5377 * need to assign the tex state indexes for these after we know the
5378 * max tex index.
5379 */
5380 static void
5381 fixup_astc_srgb(struct ir3_context *ctx)
5382 {
5383 struct ir3_shader_variant *so = ctx->so;
5384 /* indexed by original tex idx, value is newly assigned alpha sampler
5385 * state tex idx. Zero is invalid since there is at least one sampler
5386 * if we get here.
5387 */
5388 unsigned alt_tex_state[16] = {0};
5389 unsigned tex_idx = ctx->max_texture_index + 1;
5390 unsigned idx = 0;
5391
5392 so->astc_srgb.base = tex_idx;
5393
5394 for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
5395 struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
5396
5397 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5398
5399 if (alt_tex_state[sam->cat5.tex] == 0) {
5400 /* assign new alternate/alpha tex state slot: */
5401 alt_tex_state[sam->cat5.tex] = tex_idx++;
5402 so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
5403 so->astc_srgb.count++;
5404 }
5405
5406 sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5407 }
5408 }
5409
5410 /* Fixup tex sampler state for tg4 workaround instructions. We
5411 * need to assign the tex state indexes for these after we know the
5412 * max tex index.
5413 */
5414 static void
5415 fixup_tg4(struct ir3_context *ctx)
5416 {
5417 struct ir3_shader_variant *so = ctx->so;
5418 /* indexed by original tex idx, value is newly assigned alpha sampler
5419 * state tex idx. Zero is invalid since there is at least one sampler
5420 * if we get here.
5421 */
5422 unsigned alt_tex_state[16] = {0};
5423 unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1;
5424 unsigned idx = 0;
5425
5426 so->tg4.base = tex_idx;
5427
5428 for (unsigned i = 0; i < ctx->ir->tg4_count; i++) {
5429 struct ir3_instruction *sam = ctx->ir->tg4[i];
5430
5431 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5432
5433 if (alt_tex_state[sam->cat5.tex] == 0) {
5434 /* assign new alternate/alpha tex state slot: */
5435 alt_tex_state[sam->cat5.tex] = tex_idx++;
5436 so->tg4.orig_idx[idx++] = sam->cat5.tex;
5437 so->tg4.count++;
5438 }
5439
5440 sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5441 }
5442 }
5443
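/* Find the instruction that terminates the shader: OPC_END, or OPC_CHMASK
* for stages that chain to a following stage.
*/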
5444 static struct ir3_instruction *
5445 find_end(struct ir3 *ir)
5446 {
5447 foreach_block_rev (block, &ir->block_list) {
5448 foreach_instr_rev (instr, &block->instr_list) {
5449 if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
5450 return instr;
5451 }
5452 }
5453 unreachable("couldn't find end instruction");
5454 }
5455
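/* Turn OPC_META_TEX_PREFETCH placeholders into entries in the variant's
* sampler_prefetch[] table (used for FS pre-dispatch sampling) and remove
* the placeholder instructions from the IR.
*/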
5456 static void
5457 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
5458 {
5459 unsigned idx = 0;
5460
5461 /* Collect sampling instructions eligible for pre-dispatch. */
5462 foreach_block (block, &ir->block_list) {
5463 foreach_instr_safe (instr, &block->instr_list) {
5464 if (instr->opc == OPC_META_TEX_PREFETCH) {
5465 assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
5466 struct ir3_sampler_prefetch *fetch =
5467 &ctx->so->sampler_prefetch[idx];
5468 idx++;
5469
5470 fetch->bindless = instr->flags & IR3_INSTR_B;
5471 if (fetch->bindless) {
5472 /* In bindless mode, the index is actually the base */
5473 fetch->tex_id = instr->prefetch.tex_base;
5474 fetch->samp_id = instr->prefetch.samp_base;
5475 fetch->tex_bindless_id = instr->prefetch.tex;
5476 fetch->samp_bindless_id = instr->prefetch.samp;
5477 } else {
5478 fetch->tex_id = instr->prefetch.tex;
5479 fetch->samp_id = instr->prefetch.samp;
5480 }
5481 fetch->tex_opc = OPC_SAM;
5482 fetch->wrmask = instr->dsts[0]->wrmask;
5483 fetch->dst = instr->dsts[0]->num;
5484 fetch->src = instr->prefetch.input_offset;
5485
5486 /* These are the limits on a5xx/a6xx, we might need to
5487 * revisit if SP_FS_PREFETCH[n] changes on later gens:
5488 */
5489 assert(fetch->dst <= 0x3f);
5490 assert(fetch->tex_id <= 0x1f);
5491 assert(fetch->samp_id <= 0xf);
5492
5493 ctx->so->total_in =
5494 MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
5495
5496 fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
5497
5498 /* Remove the prefetch placeholder instruction: */
5499 list_delinit(&instr->node);
5500 }
5501 }
5502 }
5503 }
5504
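/* Main NIR -> ir3 compilation entrypoint: emits the ir3 IR for the variant,
* runs the optimization, scheduling, RA and legalization passes, and fills
* in the variant's input/output/sysval register info.
*/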
5505 int
5506 ir3_compile_shader_nir(struct ir3_compiler *compiler,
5507 struct ir3_shader *shader,
5508 struct ir3_shader_variant *so)
5509 {
5510 struct ir3_context *ctx;
5511 struct ir3 *ir;
5512 int ret = 0, max_bary;
5513 bool progress;
5514
5515 MESA_TRACE_FUNC();
5516
5517 assert(!so->ir);
5518
5519 ctx = ir3_context_init(compiler, shader, so);
5520 if (!ctx) {
5521 DBG("INIT failed!");
5522 ret = -1;
5523 goto out;
5524 }
5525
5526 emit_instructions(ctx);
5527
5528 if (ctx->error) {
5529 DBG("EMIT failed!");
5530 ret = -1;
5531 goto out;
5532 }
5533
5534 ir = so->ir = ctx->ir;
5535
5536 if (gl_shader_stage_is_compute(so->type)) {
5537 so->local_size[0] = ctx->s->info.workgroup_size[0];
5538 so->local_size[1] = ctx->s->info.workgroup_size[1];
5539 so->local_size[2] = ctx->s->info.workgroup_size[2];
5540 so->local_size_variable = ctx->s->info.workgroup_size_variable;
5541 }
5542
5543 if (so->type == MESA_SHADER_FRAGMENT && so->reads_shading_rate &&
5544 !so->reads_smask &&
5545 compiler->reading_shading_rate_requires_smask_quirk) {
5546 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
5547 }
5548
5549 /* Vertex shaders in a tessellation or geometry pipeline treat END as a
5550 * NOP and have an epilogue that writes the VS outputs to local storage, to
5551 * be read by the HS. They then reset the execution mask (chmask) and chain
5552 * to the next shader (chsh). There are also a few output values which we
5553 * must send to the next stage via registers, and in order for both stages
5554 * to agree on the register used we must force these to be in specific
5555 * registers.
5556 */
5557 if ((so->type == MESA_SHADER_VERTEX &&
5558 (so->key.has_gs || so->key.tessellation)) ||
5559 (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
5560 struct ir3_instruction *outputs[3];
5561 unsigned outidxs[3];
5562 unsigned regids[3];
5563 unsigned outputs_count = 0;
5564
5565 if (ctx->primitive_id) {
5566 unsigned n = so->outputs_count++;
5567 so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
5568
5569 struct ir3_instruction *out =
5570 ir3_collect(&ctx->build, ctx->primitive_id);
5571 outputs[outputs_count] = out;
5572 outidxs[outputs_count] = n;
5573 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
5574 regids[outputs_count] = regid(0, 2);
5575 else
5576 regids[outputs_count] = regid(0, 1);
5577 outputs_count++;
5578 }
5579
5580 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
5581 unsigned n = so->outputs_count++;
5582 so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
5583 struct ir3_instruction *out =
5584 ir3_collect(&ctx->build, ctx->rel_patch_id);
5585 outputs[outputs_count] = out;
5586 outidxs[outputs_count] = n;
5587 regids[outputs_count] = regid(0, 1);
5588 outputs_count++;
5589 }
5590
5591 if (ctx->gs_header) {
5592 unsigned n = so->outputs_count++;
5593 so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
5594 struct ir3_instruction *out = ir3_collect(&ctx->build, ctx->gs_header);
5595 outputs[outputs_count] = out;
5596 outidxs[outputs_count] = n;
5597 regids[outputs_count] = regid(0, 0);
5598 outputs_count++;
5599 }
5600
5601 if (ctx->tcs_header) {
5602 unsigned n = so->outputs_count++;
5603 so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
5604 struct ir3_instruction *out =
5605 ir3_collect(&ctx->build, ctx->tcs_header);
5606 outputs[outputs_count] = out;
5607 outidxs[outputs_count] = n;
5608 regids[outputs_count] = regid(0, 0);
5609 outputs_count++;
5610 }
5611
5612 struct ir3_instruction *chmask =
5613 ir3_build_instr(&ctx->build, OPC_CHMASK, 0, outputs_count);
5614 chmask->barrier_class = IR3_BARRIER_EVERYTHING;
5615 chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
5616
5617 for (unsigned i = 0; i < outputs_count; i++)
5618 __ssa_src(chmask, outputs[i], 0)->num = regids[i];
5619
5620 chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
5621 memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5622
5623 array_insert(ctx->block, ctx->block->keeps, chmask);
5624
5625 struct ir3_instruction *chsh = ir3_CHSH(&ctx->build);
5626 chsh->barrier_class = IR3_BARRIER_EVERYTHING;
5627 chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
5628 } else {
5629 assert((ctx->noutputs % 4) == 0);
5630 unsigned outidxs[ctx->noutputs / 4];
5631 struct ir3_instruction *outputs[ctx->noutputs / 4];
5632 unsigned outputs_count = 0;
5633
5634 struct ir3_block *b = ctx->block;
5635 /* Insert these collect's in the block before the end-block if
5636 * possible, so that any moves they generate can be shuffled around to
5637 * reduce nop's:
5638 */
5639 if (ctx->block->predecessors_count == 1)
5640 b = ctx->block->predecessors[0];
5641
5642 /* Setup IR level outputs, which are "collects" that gather
5643 * the scalar components of outputs.
5644 */
5645 for (unsigned i = 0; i < ctx->noutputs; i += 4) {
5646 unsigned ncomp = 0;
5647 /* figure out the # of components written:
5648 *
5649 * TODO do we need to handle holes, ie. if .x and .z
5650 * components written, but .y component not written?
5651 */
5652 for (unsigned j = 0; j < 4; j++) {
5653 if (!ctx->outputs[i + j])
5654 break;
5655 ncomp++;
5656 }
5657
5658 /* Note that in some stages, like TCS, store_output is
5659 * lowered to memory writes, so no components of the output
5660 * are "written" from the PoV of traditional store-
5661 * output instructions:
5662 */
5663 if (!ncomp)
5664 continue;
5665
5666 struct ir3_builder build = ir3_builder_at(ir3_before_terminator(b));
5667 struct ir3_instruction *out =
5668 ir3_create_collect(&build, &ctx->outputs[i], ncomp);
5669
5670 int outidx = i / 4;
5671 assert(outidx < so->outputs_count);
5672
5673 outidxs[outputs_count] = outidx;
5674 outputs[outputs_count] = out;
5675 outputs_count++;
5676 }
5677
5678 /* for a6xx+, binning and draw pass VS use the same VBO state, so we
5679 * need to make sure not to remove any inputs that are used by
5680 * the nonbinning VS.
5681 */
5682 if (ctx->compiler->gen >= 6 && so->binning_pass &&
5683 so->type == MESA_SHADER_VERTEX) {
5684 for (int i = 0; i < ctx->ninputs; i++) {
5685 struct ir3_instruction *in = ctx->inputs[i];
5686
5687 if (!in)
5688 continue;
5689
5690 unsigned n = i / 4;
5691 unsigned c = i % 4;
5692
5693 assert(n < so->nonbinning->inputs_count);
5694
5695 if (so->nonbinning->inputs[n].sysval)
5696 continue;
5697
5698 /* be sure to keep inputs, even if only used in VS */
5699 if (so->nonbinning->inputs[n].compmask & (1 << c))
5700 array_insert(in->block, in->block->keeps, in);
5701 }
5702 }
5703
5704 struct ir3_instruction *end =
5705 ir3_build_instr(&ctx->build, OPC_END, 0, outputs_count);
5706
5707 for (unsigned i = 0; i < outputs_count; i++) {
5708 __ssa_src(end, outputs[i], 0);
5709 }
5710
5711 end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
5712 memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5713
5714 array_insert(ctx->block, ctx->block->keeps, end);
5715 }
5716
5717 if (so->type == MESA_SHADER_FRAGMENT &&
5718 ctx->s->info.fs.needs_quad_helper_invocations) {
5719 so->need_pixlod = true;
5720 so->need_full_quad = true;
5721 }
5722
5723 ir3_debug_print(ir, "AFTER: nir->ir3");
5724 ir3_validate(ir);
5725
5726 IR3_PASS(ir, ir3_remove_unreachable);
5727
5728 IR3_PASS(ir, ir3_array_to_ssa);
5729
5730 ir3_calc_reconvergence(so);
5731
5732 IR3_PASS(ir, ir3_lower_shared_phis);
5733
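/* Iterate the main cleanup/optimization passes until they reach a fixed
* point:
*/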
5734 do {
5735 progress = false;
5736
5737 /* the folding doesn't seem to work reliably on a4xx */
5738 if (ctx->compiler->gen != 4)
5739 progress |= IR3_PASS(ir, ir3_cf);
5740 progress |= IR3_PASS(ir, ir3_cp, so);
5741 progress |= IR3_PASS(ir, ir3_cse);
5742 progress |= IR3_PASS(ir, ir3_dce, so);
5743 progress |= IR3_PASS(ir, ir3_opt_predicates, so);
5744 progress |= IR3_PASS(ir, ir3_shared_fold);
5745 } while (progress);
5746
5747 IR3_PASS(ir, ir3_sched_add_deps);
5748
5749 /* At this point, all the dead code should be long gone: */
5750 assert(!IR3_PASS(ir, ir3_dce, so));
5751
5752 ret = ir3_sched(ir);
5753 if (ret) {
5754 DBG("SCHED failed!");
5755 goto out;
5756 }
5757
5758 ir3_debug_print(ir, "AFTER: ir3_sched");
5759
5760 /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
5761 * with draw pass VS, so binning and draw pass can both use the
5762 * same VBO state.
5763 *
5764 * Note that VS inputs are expected to be full precision.
5765 */
5766 bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
5767 (ir->type == MESA_SHADER_VERTEX) &&
5768 so->binning_pass;
5769
5770 if (pre_assign_inputs) {
5771 foreach_input (in, ir) {
5772 assert(in->opc == OPC_META_INPUT);
5773 unsigned inidx = in->input.inidx;
5774
5775 in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
5776 }
5777 } else if (ctx->tcs_header) {
5778 /* We need to have these values in the same registers between VS and TCS
5779 * since the VS chains to TCS and doesn't get the sysvals redelivered.
5780 */
5781
5782 ctx->tcs_header->dsts[0]->num = regid(0, 0);
5783 ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
5784 if (ctx->primitive_id)
5785 ctx->primitive_id->dsts[0]->num = regid(0, 2);
5786 } else if (ctx->gs_header) {
5787 /* We need to have these values in the same registers between producer
5788 * (VS or DS) and GS since the producer chains to GS and doesn't get
5789 * the sysvals redelivered.
5790 */
5791
5792 ctx->gs_header->dsts[0]->num = regid(0, 0);
5793 if (ctx->primitive_id)
5794 ctx->primitive_id->dsts[0]->num = regid(0, 1);
5795 } else if (so->num_sampler_prefetch) {
5796 assert(so->type == MESA_SHADER_FRAGMENT);
5797 int idx = 0;
5798
5799 foreach_input (instr, ir) {
5800 if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
5801 continue;
5802
5803 assert(idx < 2);
5804 instr->dsts[0]->num = idx;
5805 idx++;
5806 }
5807 }
5808
5809 IR3_PASS(ir, ir3_cleanup_rpt, so);
5810 ret = ir3_ra(so);
5811
5812 if (ret) {
5813 mesa_loge("ir3_ra() failed!");
5814 goto out;
5815 }
5816
5817 IR3_PASS(ir, ir3_merge_rpt, so);
5818 IR3_PASS(ir, ir3_postsched, so);
5819
5820 IR3_PASS(ir, ir3_legalize_relative);
5821 IR3_PASS(ir, ir3_lower_subgroups);
5822
5823 /* This isn't valid to do when transform feedback is done in HW, which is
5824 * a4xx onward, because the VS may use components not read by the FS for
5825 * transform feedback. Ideally we'd delete this, but a5xx and earlier seem to
5826 * be broken without it.
5827 */
5828 if (so->type == MESA_SHADER_FRAGMENT && ctx->compiler->gen < 6)
5829 pack_inlocs(ctx);
5830
5831 /*
5832 * Fixup inputs/outputs to point to the actual registers assigned:
5833 *
5834 * 1) initialize to r63.x (invalid/unused)
5835 * 2) iterate IR level inputs/outputs and update the variant's
5836 * inputs/outputs table based on the assigned registers for
5837 * the remaining inputs/outputs.
5838 */
5839
5840 for (unsigned i = 0; i < so->inputs_count; i++)
5841 so->inputs[i].regid = INVALID_REG;
5842 for (unsigned i = 0; i < so->outputs_count; i++)
5843 so->outputs[i].regid = INVALID_REG;
5844
5845 struct ir3_instruction *end = find_end(so->ir);
5846
5847 for (unsigned i = 0; i < end->srcs_count; i++) {
5848 unsigned outidx = end->end.outidxs[i];
5849 struct ir3_register *reg = end->srcs[i];
5850
5851 so->outputs[outidx].regid = reg->num;
5852 so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
5853 }
5854
5855 foreach_input (in, ir) {
5856 assert(in->opc == OPC_META_INPUT);
5857 unsigned inidx = in->input.inidx;
5858
5859 if (pre_assign_inputs && !so->inputs[inidx].sysval) {
5860 if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
5861 compile_assert(
5862 ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
5863 compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
5864 so->nonbinning->inputs[inidx].half);
5865 }
5866 so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
5867 so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
5868 } else {
5869 so->inputs[inidx].regid = in->dsts[0]->num;
5870 so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
5871 }
5872 }
5873
5874 uint8_t clip_cull_mask = ctx->so->clip_mask | ctx->so->cull_mask;
5875 /* Having a non-zero clip/cull mask and not writing the corresponding regs
5876 * leads to a GPU fault on A7XX.
5877 */
5878 if (clip_cull_mask &&
5879 ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST0) == regid(63, 0)) {
5880 ctx->so->clip_mask &= 0xf0;
5881 ctx->so->cull_mask &= 0xf0;
5882 }
5883 if ((clip_cull_mask >> 4) &&
5884 ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST1) == regid(63, 0)) {
5885 ctx->so->clip_mask &= 0xf;
5886 ctx->so->cull_mask &= 0xf;
5887 }
5888
5889 if (ctx->astc_srgb)
5890 fixup_astc_srgb(ctx);
5891
5892 if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather)
5893 fixup_tg4(ctx);
5894
5895 /* We need to do legalize after the (frag shader's) "bary.f"
5896 * offsets (inloc) have been assigned.
5897 */
5898 IR3_PASS(ir, ir3_legalize, so, &max_bary);
5899
5900 /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
5901 * know what we might have to wait on when coming in from VS chsh.
5902 */
5903 if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
5904 foreach_block (block, &ir->block_list) {
5905 foreach_instr (instr, &block->instr_list) {
5906 instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
5907 break;
5908 }
5909 }
5910 }
5911
5912 if (ctx->compiler->gen >= 7 && so->type == MESA_SHADER_COMPUTE) {
5913 struct ir3_instruction *end = find_end(so->ir);
5914 struct ir3_instruction *lock =
5915 ir3_build_instr(&ctx->build, OPC_LOCK, 0, 0);
5916 /* TODO: These flags should be set by the scheduler only when needed */
5917 lock->flags = IR3_INSTR_SS | IR3_INSTR_SY | IR3_INSTR_JP;
5918 ir3_instr_move_before(lock, end);
5919 struct ir3_instruction *unlock =
5920 ir3_build_instr(&ctx->build, OPC_UNLOCK, 0, 0);
5921 ir3_instr_move_before(unlock, end);
5922 }
5923
5924 so->pvtmem_size = ALIGN(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
5925
5926 /* Note that max_bary counts inputs that are not bary.f'd for FS: */
5927 if (so->type == MESA_SHADER_FRAGMENT)
5928 so->total_in = max_bary + 1;
5929
5930 /* Collect sampling instructions eligible for pre-dispatch. */
5931 collect_tex_prefetches(ctx, ir);
5932
5933 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5934 !ctx->s->info.fs.early_fragment_tests)
5935 ctx->so->no_earlyz |= ctx->s->info.writes_memory;
5936
5937 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5938 ctx->s->info.fs.post_depth_coverage)
5939 so->post_depth_coverage = true;
5940
5941 ctx->so->per_samp = ctx->s->info.fs.uses_sample_shading;
5942
5943 if (ctx->has_relative_load_const_ir3) {
5944 /* NOTE: if relative addressing is used, we set
5945 * constlen in the compiler (to worst-case value)
5946 * since we don't know in the assembler what the max
5947 * addr reg value can be:
5948 */
5949 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
5950 const enum ir3_const_alloc_type rel_const_srcs[] = {
5951 IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, IR3_CONST_ALLOC_UBO_RANGES,
5952 IR3_CONST_ALLOC_PREAMBLE, IR3_CONST_ALLOC_GLOBAL};
5953 for (int i = 0; i < ARRAY_SIZE(rel_const_srcs); i++) {
5954 const struct ir3_const_allocation *const_alloc =
5955 &const_state->allocs.consts[rel_const_srcs[i]];
5956 if (const_alloc->size_vec4 > 0) {
5957 ctx->so->constlen =
5958 MAX2(ctx->so->constlen,
5959 const_alloc->offset_vec4 + const_alloc->size_vec4);
5960 }
5961 }
5962 }
5963
5964 if (ctx->so->type == MESA_SHADER_FRAGMENT &&
5965 compiler->fs_must_have_non_zero_constlen_quirk) {
5966 so->constlen = MAX2(so->constlen, 4);
5967 }
5968
5969 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->compiler->gen >= 6) {
5970 so->constlen = MAX2(so->constlen, 8);
5971 }
5972
5973 if (gl_shader_stage_is_compute(so->type)) {
5974 so->cs.local_invocation_id =
5975 ir3_find_sysval_regid(so, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
5976 so->cs.work_group_id =
5977 ir3_find_sysval_regid(so, SYSTEM_VALUE_WORKGROUP_ID);
5978 } else {
5979 so->vtxid_base = ir3_find_sysval_regid(so, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
5980 }
5981
5982 out:
5983 if (ret) {
5984 if (so->ir)
5985 ir3_destroy(so->ir);
5986 so->ir = NULL;
5987 }
5988 ir3_context_free(ctx);
5989
5990 return ret;
5991 }
5992