1 /*
2 * Copyright © 2015 Rob Clark <robclark@freedesktop.org>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #include <stdarg.h>
10
11 #include "util/u_math.h"
12 #include "util/u_memory.h"
13 #include "util/u_string.h"
14
15 #include "ir3_compiler.h"
16 #include "ir3_image.h"
17 #include "ir3_nir.h"
18 #include "ir3_shader.h"
19
20 #include "instr-a3xx.h"
21 #include "ir3.h"
22 #include "ir3_context.h"
23
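/*
 * The helpers below operate on ir3_instruction_rpt, which bundles one
 * instruction per repeated component so vectorized NIR ALU ops can be
 * emitted for all components at once. A minimal, illustrative sketch of
 * how they compose (names as used later in this file):
 *
 *    struct ir3_instruction_rpt one = create_immed_rpt(b, ncomp, 1);
 *    struct ir3_instruction_rpt sum = ir3_ADD_U_rpt(b, ncomp, src, 0, one, 0);
 */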
24 static struct ir3_instruction_rpt
25 rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
26 {
27 struct ir3_instruction_rpt dst = {{0}};
28
29 for (unsigned i = 0; i < nrpt; ++i)
30 dst.rpts[i] = instr;
31
32 return dst;
33 }
34
35 static void
36 cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
37 unsigned n)
38 {
39 for (unsigned i = 0; i < n; ++i)
40 dst[i] = instrs[i];
41 }
42
43 static struct ir3_instruction_rpt
44 create_immed_rpt(struct ir3_builder *build, unsigned nrpt, unsigned val)
45 {
46 return rpt_instr(create_immed(build, val), nrpt);
47 }
48
49 static struct ir3_instruction_rpt
50 create_immed_shared_rpt(struct ir3_builder *build, unsigned nrpt, uint32_t val,
51 bool shared)
52 {
53 return rpt_instr(create_immed_shared(build, val, shared), nrpt);
54 }
55
56 static struct ir3_instruction_rpt
57 create_immed_typed_rpt(struct ir3_builder *build, unsigned nrpt, unsigned val,
58 type_t type)
59 {
60 return rpt_instr(create_immed_typed(build, val, type), nrpt);
61 }
62
63 static inline struct ir3_instruction_rpt
64 create_immed_typed_shared_rpt(struct ir3_builder *build, unsigned nrpt,
65 uint32_t val, type_t type, bool shared)
66 {
67 return rpt_instr(create_immed_typed_shared(build, val, type, shared), nrpt);
68 }
69
70 static void
71 set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
72 ir3_instruction_flags flags)
73 {
74 for (unsigned i = 0; i < n; ++i)
75 instrs[i]->flags |= flags;
76 }
77
78 static void
79 set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
80 {
81 for (unsigned i = 0; i < n; ++i)
82 instrs[i]->cat1.round = round;
83 }
84
85 static void
86 set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
87 unsigned condition)
88 {
89 for (unsigned i = 0; i < n; ++i)
90 instrs[i]->cat2.condition = condition;
91 }
92
93 static void
94 set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
95 ir3_register_flags flags)
96 {
97 for (unsigned i = 0; i < n; ++i)
98 instrs[i]->dsts[0]->flags |= flags;
99 }
100
101 void
102 ir3_handle_nonuniform(struct ir3_instruction *instr,
103 nir_intrinsic_instr *intrin)
104 {
105 if (nir_intrinsic_has_access(intrin) &&
106 (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
107 instr->flags |= IR3_INSTR_NONUNIF;
108 }
109 }
110
111 void
112 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
113 {
114 nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
115 if (!intrin)
116 return;
117
118 instr->flags |= IR3_INSTR_B;
119 instr->cat6.base = nir_intrinsic_desc_set(intrin);
120 }
121
122 static struct ir3_instruction *
123 create_input(struct ir3_context *ctx, unsigned compmask)
124 {
125 struct ir3_instruction *in;
126
127 in = ir3_instr_create_at(ir3_before_terminator(ctx->in_block),
128 OPC_META_INPUT, 1, 0);
129 in->input.sysval = ~0;
130 __ssa_dst(in)->wrmask = compmask;
131
132 array_insert(ctx->ir, ctx->ir->inputs, in);
133
134 return in;
135 }
136
137 static struct ir3_instruction_rpt
138 create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
139 unsigned n, unsigned ncomp)
140 {
141 struct ir3_builder *build = &ctx->build;
142 struct ir3_instruction_rpt instr;
143 /* packed inloc is fixed up later: */
144 struct ir3_instruction_rpt inloc;
145
146 for (unsigned i = 0; i < ncomp; i++)
147 inloc.rpts[i] = create_immed(build, n + i);
148
149 if (coord) {
150 instr =
151 ir3_BARY_F_rpt(build, ncomp, inloc, 0, rpt_instr(coord, ncomp), 0);
152 } else if (ctx->compiler->flat_bypass) {
153 if (ctx->compiler->gen >= 6) {
154 instr = ir3_FLAT_B_rpt(build, ncomp, inloc, 0, inloc, 0);
155 } else {
156 for (unsigned i = 0; i < ncomp; i++) {
157 instr.rpts[i] =
158 ir3_LDLV(build, inloc.rpts[i], 0, create_immed(build, 1), 0);
159 instr.rpts[i]->cat6.type = TYPE_U32;
160 instr.rpts[i]->cat6.iim_val = 1;
161 }
162 }
163 } else {
164 instr = ir3_BARY_F_rpt(build, ncomp, inloc, 0,
165 rpt_instr(ctx->ij[IJ_PERSP_PIXEL], ncomp), 0);
166
167 for (unsigned i = 0; i < ncomp; i++)
168 instr.rpts[i]->srcs[1]->wrmask = 0x3;
169 }
170
171 return instr;
172 }
173
174 static struct ir3_instruction *
175 create_driver_param(struct ir3_context *ctx, uint32_t dp)
176 {
177 /* first four vec4 sysvals reserved for UBOs: */
178 /* NOTE: dp is in scalar, but there can be >4 dp components: */
179 unsigned r = ir3_const_reg(ir3_const_state(ctx->so),
180 IR3_CONST_ALLOC_DRIVER_PARAMS, dp);
181 return create_uniform(&ctx->build, r);
182 }
183
184 static struct ir3_instruction *
185 create_driver_param_indirect(struct ir3_context *ctx, uint32_t dp,
186 struct ir3_instruction *address)
187 {
188 /* first four vec4 sysvals reserved for UBOs: */
189 /* NOTE: dp is in scalar, but there can be >4 dp components: */
190 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
191 unsigned n =
192 const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
193 return create_uniform_indirect(&ctx->build, n * 4 + dp, TYPE_U32, address);
194 }
195
196 /*
197 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
198 * 32-bit registers. We use NIR's 1-bit integers to represent bools, and
199 * trust that we will only see and/or/xor on those 1-bit values, so we can
200 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
201 * 0.
202 */
203
204 /*
205 * alu/sfu instructions:
206 */
207
208 static struct ir3_instruction_rpt
209 create_cov(struct ir3_context *ctx, unsigned nrpt,
210 struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
211 {
212 type_t src_type, dst_type;
213
214 switch (op) {
215 case nir_op_f2f32:
216 case nir_op_f2f16_rtne:
217 case nir_op_f2f16_rtz:
218 case nir_op_f2f16:
219 case nir_op_f2i32:
220 case nir_op_f2i16:
221 case nir_op_f2i8:
222 case nir_op_f2u32:
223 case nir_op_f2u16:
224 case nir_op_f2u8:
225 switch (src_bitsize) {
226 case 32:
227 src_type = TYPE_F32;
228 break;
229 case 16:
230 src_type = TYPE_F16;
231 break;
232 default:
233 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
234 }
235 break;
236
237 case nir_op_i2f32:
238 case nir_op_i2f16:
239 case nir_op_i2i32:
240 case nir_op_i2i16:
241 case nir_op_i2i8:
242 switch (src_bitsize) {
243 case 32:
244 src_type = TYPE_S32;
245 break;
246 case 16:
247 src_type = TYPE_S16;
248 break;
249 case 8:
250 src_type = TYPE_U8;
251 break;
252 default:
253 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
254 }
255 break;
256
257 case nir_op_u2f32:
258 case nir_op_u2f16:
259 case nir_op_u2u32:
260 case nir_op_u2u16:
261 case nir_op_u2u8:
262 switch (src_bitsize) {
263 case 32:
264 src_type = TYPE_U32;
265 break;
266 case 16:
267 src_type = TYPE_U16;
268 break;
269 case 8:
270 src_type = TYPE_U8;
271 break;
272 default:
273 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
274 }
275 break;
276
277 case nir_op_b2f16:
278 case nir_op_b2f32:
279 case nir_op_b2i8:
280 case nir_op_b2i16:
281 case nir_op_b2i32:
282 src_type = ctx->compiler->bool_type;
283 break;
284
285 default:
286 ir3_context_error(ctx, "invalid conversion op: %u", op);
287 }
288
289 switch (op) {
290 case nir_op_f2f32:
291 case nir_op_i2f32:
292 case nir_op_u2f32:
293 case nir_op_b2f32:
294 dst_type = TYPE_F32;
295 break;
296
297 case nir_op_f2f16_rtne:
298 case nir_op_f2f16_rtz:
299 case nir_op_f2f16:
300 case nir_op_i2f16:
301 case nir_op_u2f16:
302 case nir_op_b2f16:
303 dst_type = TYPE_F16;
304 break;
305
306 case nir_op_f2i32:
307 case nir_op_i2i32:
308 case nir_op_b2i32:
309 dst_type = TYPE_S32;
310 break;
311
312 case nir_op_f2i16:
313 case nir_op_i2i16:
314 case nir_op_b2i16:
315 dst_type = TYPE_S16;
316 break;
317
318 case nir_op_f2i8:
319 case nir_op_i2i8:
320 case nir_op_b2i8:
321 dst_type = TYPE_U8;
322 break;
323
324 case nir_op_f2u32:
325 case nir_op_u2u32:
326 dst_type = TYPE_U32;
327 break;
328
329 case nir_op_f2u16:
330 case nir_op_u2u16:
331 dst_type = TYPE_U16;
332 break;
333
334 case nir_op_f2u8:
335 case nir_op_u2u8:
336 dst_type = TYPE_U8;
337 break;
338
339 default:
340 ir3_context_error(ctx, "invalid conversion op: %u", op);
341 }
342
343 if (src_type == dst_type)
344 return src;
345
346 /* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
347 * is used to achieve the result.
348 */
349 if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
350 struct ir3_instruction_rpt mask =
351 create_immed_typed_rpt(&ctx->build, nrpt, 0xff, TYPE_U8);
352 struct ir3_instruction_rpt cov =
353 ir3_AND_B_rpt(&ctx->build, nrpt, src, 0, mask, 0);
354 set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
355 return cov;
356 }
357
358 /* Conversion of 8-bit values into floating-point values doesn't work with
359 * a simple `cov`, instead the 8-bit values first have to be converted into
360 * corresponding 16-bit values and converted from there.
361 */
362 if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
363 assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
364 op == nir_op_u2f32 || op == nir_op_i2f32);
365
366 struct ir3_instruction_rpt cov;
367 if (op == nir_op_u2f16 || op == nir_op_u2f32) {
368 struct ir3_instruction_rpt mask =
369 create_immed_typed_rpt(&ctx->build, nrpt, 0xff, TYPE_U8);
370 cov = ir3_AND_B_rpt(&ctx->build, nrpt, src, 0, mask, 0);
371 set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
372 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, TYPE_U16, dst_type);
373 } else {
374 cov = ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U8, TYPE_S16);
375 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, TYPE_S16, dst_type);
376 }
377 return cov;
378 }
379
380 /* Conversion of floating-point values to 8-bit values also doesn't work
381 * through a single `cov`, instead the conversion has to go through the
382 * corresponding 16-bit type that's then truncated.
383 */
384 if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
385 assert(op == nir_op_f2u8 || op == nir_op_f2i8);
386
387 type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
388 struct ir3_instruction_rpt cov =
389 ir3_COV_rpt(&ctx->build, nrpt, src, src_type, intermediate_type);
390 cov = ir3_COV_rpt(&ctx->build, nrpt, cov, intermediate_type, TYPE_U8);
391 return cov;
392 }
393
394 struct ir3_instruction_rpt cov =
395 ir3_COV_rpt(&ctx->build, nrpt, src, src_type, dst_type);
396
397 if (op == nir_op_f2f16_rtne) {
398 set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
399 } else if (op == nir_op_f2f16_rtz) {
400 set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
401 } else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
402 unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
403 nir_alu_type type =
404 dst_type == TYPE_F16 ? nir_type_float16 : nir_type_float32;
405 nir_rounding_mode rounding_mode =
406 nir_get_rounding_mode_from_float_controls(execution_mode, type);
407 if (rounding_mode == nir_rounding_mode_rtne)
408 set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
409 else if (rounding_mode == nir_rounding_mode_rtz)
410 set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
411 }
412
413 return cov;
414 }
415
416 /* For shift instructions, NIR always has the shift amount as a 32-bit integer */
417 static struct ir3_instruction_rpt
418 resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
419 struct ir3_instruction_rpt src, unsigned bs)
420 {
421 if (bs == 16)
422 return ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U32, TYPE_U16);
423 else if (bs == 8)
424 return ir3_COV_rpt(&ctx->build, nrpt, src, TYPE_U32, TYPE_U8);
425 else
426 return src;
427 }
428
429 static void
430 emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
431 struct ir3_instruction **dst,
432 struct ir3_instruction **src)
433 {
434 if (ctx->compiler->has_compliant_dp4acc) {
435 dst[0] = ir3_DP4ACC(&ctx->build, src[0], 0, src[1], 0, src[2], 0);
436
437 /* This is actually the LHS signedness attribute.
438 * IR3_SRC_UNSIGNED ~ unsigned LHS (i.e. OpUDot and OpUDotAccSat).
439 */
440 if (alu->op == nir_op_udot_4x8_uadd ||
441 alu->op == nir_op_udot_4x8_uadd_sat) {
442 dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
443 } else {
444 dst[0]->cat3.signedness = IR3_SRC_MIXED;
445 }
446
447 /* This is actually the RHS signedness attribute.
448 * IR3_SRC_PACKED_HIGH ~ signed RHS (i.e. OpSDot and OpSDotAccSat).
449 */
450 if (alu->op == nir_op_sdot_4x8_iadd ||
451 alu->op == nir_op_sdot_4x8_iadd_sat) {
452 dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
453 } else {
454 dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
455 }
456
457 if (alu->op == nir_op_udot_4x8_uadd_sat ||
458 alu->op == nir_op_sdot_4x8_iadd_sat ||
459 alu->op == nir_op_sudot_4x8_iadd_sat) {
460 dst[0]->flags |= IR3_INSTR_SAT;
461 }
462 return;
463 }
464
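   /* Fallback for dp4acc implementations that don't follow the SPIR-V
    * integer dot product rules (!has_compliant_dp4acc): (sat) is emulated
    * for the unsigned variant by accumulating into 0 and doing a separate
    * saturating add of the real accumulator afterwards.
    */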
465 struct ir3_instruction *accumulator = NULL;
466 if (alu->op == nir_op_udot_4x8_uadd_sat) {
467 accumulator = create_immed(&ctx->build, 0);
468 } else {
469 accumulator = src[2];
470 }
471
472 dst[0] = ir3_DP4ACC(&ctx->build, src[0], 0, src[1], 0, accumulator, 0);
473
474 if (alu->op == nir_op_udot_4x8_uadd ||
475 alu->op == nir_op_udot_4x8_uadd_sat) {
476 dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
477 } else {
478 dst[0]->cat3.signedness = IR3_SRC_MIXED;
479 }
480
481 /* For some reason (sat) doesn't work in the unsigned case, so
482 * we have to emulate it.
483 */
484 if (alu->op == nir_op_udot_4x8_uadd_sat) {
485 dst[0] = ir3_ADD_U(&ctx->build, dst[0], 0, src[2], 0);
486 dst[0]->flags |= IR3_INSTR_SAT;
487 } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
488 dst[0]->flags |= IR3_INSTR_SAT;
489 }
490 }
491
492 static void
493 emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
494 struct ir3_instruction **dst,
495 struct ir3_instruction **src)
496 {
497 int signedness;
498 if (alu->op == nir_op_udot_4x8_uadd ||
499 alu->op == nir_op_udot_4x8_uadd_sat) {
500 signedness = IR3_SRC_UNSIGNED;
501 } else {
502 signedness = IR3_SRC_MIXED;
503 }
504
505 struct ir3_instruction *accumulator = NULL;
506 if (alu->op == nir_op_udot_4x8_uadd_sat ||
507 alu->op == nir_op_sudot_4x8_iadd_sat) {
508 accumulator = create_immed(&ctx->build, 0);
509 } else {
510 accumulator = src[2];
511 }
512
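   /* The 4x8 dot product is built from two chained dp2acc ops: the first
    * accumulates the two low 8-bit lanes (IR3_SRC_PACKED_LOW), the second
    * adds the two high lanes (IR3_SRC_PACKED_HIGH) on top of that result.
    */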
513 dst[0] = ir3_DP2ACC(&ctx->build, src[0], 0, src[1], 0, accumulator, 0);
514 dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
515 dst[0]->cat3.signedness = signedness;
516
517 dst[0] = ir3_DP2ACC(&ctx->build, src[0], 0, src[1], 0, dst[0], 0);
518 dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
519 dst[0]->cat3.signedness = signedness;
520
521 if (alu->op == nir_op_udot_4x8_uadd_sat) {
522 dst[0] = ir3_ADD_U(&ctx->build, dst[0], 0, src[2], 0);
523 dst[0]->flags |= IR3_INSTR_SAT;
524 } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
525 dst[0] = ir3_ADD_S(&ctx->build, dst[0], 0, src[2], 0);
526 dst[0]->flags |= IR3_INSTR_SAT;
527 }
528 }
529
530 static bool
531 all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
532 {
533 for (unsigned i = 0; i < n; i++) {
534 if (!is_sat_compatible(instrs[i]->opc))
535 return false;
536 }
537
538 return true;
539 }
540
541 /* Is src the only use of its def, taking components into account. */
542 static bool
543 is_unique_use(nir_src *src)
544 {
545 nir_def *def = src->ssa;
546
547 if (list_is_singular(&def->uses))
548 return true;
549
550 nir_component_mask_t src_read_mask = nir_src_components_read(src);
551
552 nir_foreach_use (use, def) {
553 if (use == src)
554 continue;
555
556 if (nir_src_components_read(use) & src_read_mask)
557 return false;
558 }
559
560 return true;
561 }
562
563 static void
564 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
565 {
566 const nir_op_info *info = &nir_op_infos[alu->op];
567 struct ir3_instruction_rpt dst, src[info->num_inputs];
568 unsigned bs[info->num_inputs]; /* bit size */
569 struct ir3_builder *b = &ctx->build;
570 unsigned dst_sz;
571 unsigned dst_bitsize = ir3_bitsize(ctx, alu->def.bit_size);
572 type_t dst_type = type_uint_size(dst_bitsize);
573
574 dst_sz = alu->def.num_components;
575 assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));
576
577 bool use_shared = !alu->def.divergent &&
578 ctx->compiler->has_scalar_alu &&
579 /* it probably isn't worth emulating these with scalar-only ops */
580 alu->op != nir_op_udot_4x8_uadd &&
581 alu->op != nir_op_udot_4x8_uadd_sat &&
582 alu->op != nir_op_sdot_4x8_iadd &&
583 alu->op != nir_op_sdot_4x8_iadd_sat &&
584 alu->op != nir_op_sudot_4x8_iadd &&
585 alu->op != nir_op_sudot_4x8_iadd_sat &&
586 /* not supported in HW, we have to fall back to normal registers */
587 alu->op != nir_op_ffma;
588
589 struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
590
591 /* Vectors are special in that they have non-scalarized writemasks,
592 * and just take the first swizzle channel for each argument in
593 * order into each writemask channel.
594 */
595 if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
596 (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
597 (alu->op == nir_op_vec16)) {
598 for (int i = 0; i < info->num_inputs; i++) {
599 nir_alu_src *asrc = &alu->src[i];
600 struct ir3_instruction *src =
601 ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
602 compile_assert(ctx, src);
603 def[i] = ir3_MOV(b, src, dst_type);
604 }
605
606 ir3_instr_create_rpt(def, info->num_inputs);
607 ir3_put_def(ctx, &alu->def);
608 return;
609 }
610
611 assert(dst_sz <= ARRAY_SIZE(src[0].rpts));
612
613 for (int i = 0; i < info->num_inputs; i++) {
614 nir_alu_src *asrc = &alu->src[i];
615 struct ir3_instruction *const *input_src =
616 ir3_get_src_shared(ctx, &asrc->src, use_shared);
617 bs[i] = nir_src_bit_size(asrc->src);
618
619 for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
620 src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
621 compile_assert(ctx, src[i].rpts[rpt]);
622 }
623 }
624
625 switch (alu->op) {
626 case nir_op_mov:
627 dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
628 break;
629
630 case nir_op_f2f32:
631 case nir_op_f2f16_rtne:
632 case nir_op_f2f16_rtz:
633 case nir_op_f2f16:
634 case nir_op_f2i32:
635 case nir_op_f2i16:
636 case nir_op_f2i8:
637 case nir_op_f2u32:
638 case nir_op_f2u16:
639 case nir_op_f2u8:
640 case nir_op_i2f32:
641 case nir_op_i2f16:
642 case nir_op_i2i32:
643 case nir_op_i2i16:
644 case nir_op_i2i8:
645 case nir_op_u2f32:
646 case nir_op_u2f16:
647 case nir_op_u2u32:
648 case nir_op_u2u16:
649 case nir_op_u2u8:
650 case nir_op_b2f16:
651 case nir_op_b2f32:
652 case nir_op_b2i8:
653 case nir_op_b2i16:
654 case nir_op_b2i32:
655 dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
656 break;
657
658 case nir_op_fquantize2f16:
659 dst = create_cov(ctx, dst_sz,
660 create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
661 16, nir_op_f2f32);
662 break;
663
664 case nir_op_b2b1:
665 /* b2b1 will appear when translating from
666 *
667 * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
668 * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
669 *
670 * A negate can turn those into a 1 or 0 for us.
671 */
672 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
673 break;
674
675 case nir_op_b2b32:
676 /* b2b32 will appear when converting our 1-bit bools to a store_shared
677 * argument.
678 *
679 * A negate can turn those into a ~0 for us.
680 */
681 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
682 break;
683
684 case nir_op_fneg:
685 dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
686 break;
687 case nir_op_fabs:
688 dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
689 break;
690 case nir_op_fmax:
691 dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
692 break;
693 case nir_op_fmin:
694 dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
695 break;
696 case nir_op_fsat:
697 /* if there is just a single use of the src, and it supports the
698 * (sat) bit, we can just fold the (sat) flag back to the
699 * src instruction and create a mov. This is easier for cp
700 * to eliminate.
701 */
702 if (all_sat_compatible(src[0].rpts, dst_sz) &&
703 is_unique_use(&alu->src[0].src)) {
704 set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
705 dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
706 } else {
707 /* otherwise generate a max.f that saturates.. blob does
708 * similar (generating a cat2 mov using max.f)
709 */
710 dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
711 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
712 }
713 break;
714 case nir_op_fmul:
715 dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
716 break;
717 case nir_op_fadd:
718 dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
719 break;
720 case nir_op_fsub:
721 dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
722 break;
723 case nir_op_ffma:
724 dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
725 break;
726 case nir_op_flt:
727 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
728 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
729 break;
730 case nir_op_fge:
731 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
732 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
733 break;
734 case nir_op_feq:
735 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
736 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
737 break;
738 case nir_op_fneu:
739 dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
740 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
741 break;
742 case nir_op_fceil:
743 dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
744 break;
745 case nir_op_ffloor:
746 dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
747 break;
748 case nir_op_ftrunc:
749 dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
750 break;
751 case nir_op_fround_even:
752 dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
753 break;
754 case nir_op_fsign:
755 dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
756 break;
757
758 case nir_op_fsin:
759 dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
760 break;
761 case nir_op_fcos:
762 dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
763 break;
764 case nir_op_frsq:
765 dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
766 break;
767 case nir_op_frcp:
768 assert(dst_sz == 1);
769 dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
770 break;
771 case nir_op_flog2:
772 dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
773 break;
774 case nir_op_fexp2:
775 dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
776 break;
777 case nir_op_fsqrt:
778 dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
779 break;
780
781 case nir_op_iabs:
782 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
783 break;
784 case nir_op_iadd:
785 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
786 break;
787 case nir_op_iadd3:
788 if (use_shared) {
789 /* sad doesn't support the scalar ALU so expand to two adds so that we
790 * don't unnecessarily fall back to non-earlypreamble.
791 */
792 struct ir3_instruction_rpt add01 =
793 ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
794
795 if (is_half(src[0].rpts[0])) {
796 set_dst_flags(add01.rpts, dst_sz, IR3_REG_HALF);
797 }
798
799 dst = ir3_ADD_U_rpt(b, dst_sz, add01, 0, src[2], 0);
800 } else {
801 if (is_half(src[0].rpts[0])) {
802 dst = ir3_SAD_S16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
803 } else {
804 dst = ir3_SAD_S32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
805 }
806 }
807 break;
808 case nir_op_ihadd:
809 dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
810 set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
811 break;
812 case nir_op_uhadd:
813 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
814 set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
815 break;
816 case nir_op_iand:
817 dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
818 break;
819 case nir_op_imax:
820 dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
821 break;
822 case nir_op_umax:
823 dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
824 break;
825 case nir_op_imin:
826 dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
827 break;
828 case nir_op_umin:
829 dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
830 break;
831 case nir_op_umul_low:
832 dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
833 break;
834 case nir_op_imadsh_mix16:
835 if (use_shared) {
836 struct ir3_instruction_rpt sixteen =
837 create_immed_shared_rpt(b, dst_sz, 16, true);
838 struct ir3_instruction_rpt src1 =
839 ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
840 struct ir3_instruction_rpt mul =
841 ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
842 dst = ir3_ADD_U_rpt(b, dst_sz,
843 ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
844 src[2], 0);
845 } else {
846 dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
847 }
848 break;
849 case nir_op_imad24_ir3:
850 if (use_shared) {
851 dst = ir3_ADD_U_rpt(b, dst_sz,
852 ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
853 0, src[2], 0);
854 } else {
855 dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
856 }
857 break;
858 case nir_op_imul:
859 compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
860 dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
861 break;
862 case nir_op_imul24:
863 dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
864 break;
865 case nir_op_ineg:
866 dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
867 break;
868 case nir_op_inot:
869 if (bs[0] == 1) {
870 struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
871 b, dst_sz, 1, ctx->compiler->bool_type, use_shared);
872 dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
873 } else {
874 dst = ir3_NOT_B_rpt(b, dst_sz, src[0], 0);
875 }
876 break;
877 case nir_op_ior:
878 dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
879 break;
880 case nir_op_ishl:
881 dst = ir3_SHL_B_rpt(b, dst_sz, src[0], 0,
882 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
883 break;
884 case nir_op_ishr:
885 dst = ir3_ASHR_B_rpt(b, dst_sz, src[0], 0,
886 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
887 break;
888 case nir_op_isub:
889 dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
890 break;
891 case nir_op_ixor:
892 dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
893 break;
894 case nir_op_ushr:
895 dst = ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
896 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
897 break;
898 case nir_op_shrm_ir3:
899 dst = ir3_SHRM_rpt(b, dst_sz,
900 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
901 src[0], 0, src[2], 0);
902 break;
903 case nir_op_shlm_ir3:
904 dst = ir3_SHLM_rpt(b, dst_sz,
905 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
906 src[0], 0, src[2], 0);
907 break;
908 case nir_op_shrg_ir3:
909 dst = ir3_SHRG_rpt(b, dst_sz,
910 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
911 src[0], 0, src[2], 0);
912 break;
913 case nir_op_shlg_ir3:
914 dst = ir3_SHLG_rpt(b, dst_sz,
915 resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0,
916 src[0], 0, src[2], 0);
917 break;
918 case nir_op_andg_ir3:
919 dst = ir3_ANDG_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
920 break;
921 case nir_op_ilt:
922 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
923 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
924 break;
925 case nir_op_ige:
926 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
927 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
928 break;
929 case nir_op_ieq:
930 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
931 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
932 break;
933 case nir_op_ine:
934 dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
935 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
936 break;
937 case nir_op_ult:
938 dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
939 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
940 break;
941 case nir_op_uge:
942 dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
943 set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
944 break;
945
946 case nir_op_icsel_eqz:
947 case nir_op_bcsel: {
948 struct ir3_instruction_rpt conds;
949
950 compile_assert(ctx, bs[1] == bs[2]);
951
952 /* TODO: repeat the covs when possible. */
953 for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
954 struct ir3_instruction *cond =
955 ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);
956
957 /* The condition's size has to match the other two arguments' size, so
958 * convert down if necessary.
959 *
960 * Single hashtable is fine, because the conversion will either be
961 * 16->32 or 32->16, but never both
962 */
963 if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
964 struct hash_entry *prev_entry = _mesa_hash_table_search(
965 ctx->sel_cond_conversions, src[0].rpts[rpt]);
966 if (prev_entry) {
967 cond = prev_entry->data;
968 } else {
969 if (is_half(cond)) {
970 if (bs[0] == 8) {
971 /* Zero-extension of an 8-bit value has to be done through
972 * masking, as in create_cov.
973 */
974 struct ir3_instruction *mask =
975 create_immed_typed(b, 0xff, TYPE_U8);
976 cond = ir3_AND_B(b, cond, 0, mask, 0);
977 } else {
978 cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
979 }
980 } else {
981 cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
982 }
983 _mesa_hash_table_insert(ctx->sel_cond_conversions,
984 src[0].rpts[rpt], cond);
985 }
986 }
987 conds.rpts[rpt] = cond;
988 }
989
990 if (alu->op == nir_op_icsel_eqz) {
991 struct ir3_instruction_rpt tmp = src[1];
992 src[1] = src[2];
993 src[2] = tmp;
994 }
995
996 if (is_half(src[1].rpts[0]))
997 dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
998 else
999 dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
1000 break;
1001 }
1002
1003 case nir_op_bit_count: {
1004 if (ctx->compiler->gen < 5 ||
1005 (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
1006 dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
1007 break;
1008 }
1009
1010 // We need to do this 16b at a time on a5xx+a6xx. Once half-precision
1011 // support is in place, this should probably move to a NIR lowering pass:
1012 struct ir3_instruction_rpt hi, lo;
1013
1014 hi = ir3_COV_rpt(
1015 b, dst_sz,
1016 ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
1017 create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
1018 TYPE_U32, TYPE_U16);
1019 lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);
1020
1021 hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
1022 lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);
1023
1024 // TODO maybe the builders should default to making dst half-precision
1025 // if the src's were half precision, to make this less awkward.. otoh
1026 // we should probably just do this lowering in NIR.
1027 set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
1028 set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);
1029
1030 dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
1031 set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
1032 dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
1033 break;
1034 }
1035 case nir_op_ifind_msb: {
1036 struct ir3_instruction_rpt cmp;
1037 dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
1038 cmp =
1039 ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
1040 create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
1041 set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
1042 dst = ir3_SEL_B32_rpt(
1043 b, dst_sz,
1044 ir3_SUB_U_rpt(b, dst_sz,
1045 create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
1046 dst, 0),
1047 0, cmp, 0, dst, 0);
1048 break;
1049 }
1050 case nir_op_ufind_msb:
1051 dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
1052 dst = ir3_SEL_B32_rpt(
1053 b, dst_sz,
1054 ir3_SUB_U_rpt(b, dst_sz,
1055 create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
1056 dst, 0),
1057 0, src[0], 0, dst, 0);
1058 break;
1059 case nir_op_find_lsb:
1060 dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
1061 dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
1062 break;
1063 case nir_op_bitfield_reverse:
1064 dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
1065 break;
1066
1067 case nir_op_uadd_sat:
1068 dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
1069 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1070 break;
1071 case nir_op_iadd_sat:
1072 dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
1073 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1074 break;
1075 case nir_op_usub_sat:
1076 dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
1077 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1078 break;
1079 case nir_op_isub_sat:
1080 dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
1081 set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
1082 break;
1083 case nir_op_pack_64_2x32_split: {
1084 struct ir3_instruction *r0 = ir3_MOV(b, src[0].rpts[0], TYPE_U32);
1085 struct ir3_instruction *r1 = ir3_MOV(b, src[1].rpts[0], TYPE_U32);
1086 dst.rpts[0] = r0;
1087 dst.rpts[1] = r1;
1088 dst_sz = 2;
1089 break;
1090 }
1091 case nir_op_unpack_64_2x32_split_x: {
1092 ir3_split_dest(b, &dst.rpts[0], src[0].rpts[0], 0, 1);
1093 break;
1094 }
1095 case nir_op_unpack_64_2x32_split_y: {
1096 ir3_split_dest(b, &dst.rpts[0], src[0].rpts[0], 1, 1);
1097 break;
1098 }
1099 case nir_op_udot_4x8_uadd:
1100 case nir_op_udot_4x8_uadd_sat:
1101 case nir_op_sdot_4x8_iadd:
1102 case nir_op_sdot_4x8_iadd_sat:
1103 case nir_op_sudot_4x8_iadd:
1104 case nir_op_sudot_4x8_iadd_sat: {
1105 assert(dst_sz == 1);
1106
1107 struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
1108 src[2].rpts[0]};
1109
1110 if (ctx->compiler->has_dp4acc) {
1111 emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
1112 } else if (ctx->compiler->has_dp2acc) {
1113 emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
1114 } else {
1115 ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
1116 nir_op_infos[alu->op].name);
1117 }
1118
1119 break;
1120 }
1121
1122 default:
1123 ir3_context_error(ctx, "Unhandled ALU op: %s\n",
1124 nir_op_infos[alu->op].name);
1125 break;
1126 }
1127
1128 if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
1129 assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
1130 } else {
1131 /* 1-bit values stored in 32-bit registers are only valid for certain
1132 * ALU ops.
1133 */
1134 switch (alu->op) {
1135 case nir_op_mov:
1136 case nir_op_iand:
1137 case nir_op_ior:
1138 case nir_op_ixor:
1139 case nir_op_inot:
1140 case nir_op_bcsel:
1141 case nir_op_andg_ir3:
1142 break;
1143 default:
1144 compile_assert(ctx, alu->def.bit_size != 1);
1145 }
1146 }
1147
1148 cp_instrs(def, dst.rpts, dst_sz);
1149 ir3_put_def(ctx, &alu->def);
1150 }
1151
1152 static void
1153 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1154 struct ir3_instruction **dst)
1155 {
1156 struct ir3_builder *b = &ctx->build;
1157
1158 /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
1159 * 0.
1160 */
1161 assert(nir_intrinsic_base(intr) == 0);
1162
1163 unsigned ncomp = intr->num_components;
1164 struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
1165 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
1166 struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
1167 ldc->dsts[0]->wrmask = MASK(ncomp);
1168 ldc->cat6.iim_val = ncomp;
1169 ldc->cat6.d = nir_intrinsic_component(intr);
1170 ldc->cat6.type = utype_def(&intr->def);
1171
1172 ir3_handle_bindless_cat6(ldc, intr->src[0]);
1173 if (ldc->flags & IR3_INSTR_B)
1174 ctx->so->bindless_ubo = true;
1175 ir3_handle_nonuniform(ldc, intr);
1176
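   /* When the loaded value is non-divergent and the GPU has a scalar ALU,
    * the ldc result can live in shared registers; IR3_INSTR_U marks the
    * load as uniform for that purpose.
    */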
1177 if (!intr->def.divergent &&
1178 ctx->compiler->has_scalar_alu) {
1179 ldc->dsts[0]->flags |= IR3_REG_SHARED;
1180 ldc->flags |= IR3_INSTR_U;
1181 }
1182
1183 ir3_split_dest(b, dst, ldc, 0, ncomp);
1184 }
1185
1186 static void
1187 emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
1188 nir_intrinsic_instr *intr)
1189 {
1190 struct ir3_builder *b = &ctx->build;
1191
1192 unsigned base = nir_intrinsic_base(intr);
1193 unsigned size = nir_intrinsic_range(intr);
1194
1195 struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);
1196
1197 struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
1198 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
1199 struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
1200 ldc->cat6.iim_val = size;
1201 ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;
1202
1203 ir3_handle_bindless_cat6(ldc, intr->src[0]);
1204 if (ldc->flags & IR3_INSTR_B)
1205 ctx->so->bindless_ubo = true;
1206
1207 ir3_instr_set_address(ldc, addr1);
1208
1209 /* The assembler isn't aware of what value a1.x has, so make sure that
1210 * constlen includes the ldc.k here.
1211 */
1212 ctx->so->constlen =
1213 MAX2(ctx->so->constlen, DIV_ROUND_UP(base + size * 4, 4));
1214
1215 array_insert(ctx->block, ctx->block->keeps, ldc);
1216 }
1217
1218 static void
1219 emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
1220 nir_intrinsic_instr *intr)
1221 {
1222 struct ir3_builder *b = &ctx->build;
1223
1224 unsigned size = nir_intrinsic_range(intr);
1225 unsigned dst = nir_intrinsic_range_base(intr);
1226 unsigned addr_offset = nir_intrinsic_base(intr);
1227 unsigned dst_lo = dst & 0xff;
1228 unsigned dst_hi = dst >> 8;
1229
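   /* The destination const-file offset is split: the low 8 bits are encoded
    * directly in the ldg.k destination, while anything above that has to be
    * supplied through a1.x.
    */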
1230 struct ir3_instruction *a1 = NULL;
1231 if (dst_hi)
1232 a1 = ir3_get_addr1(ctx, dst_hi << 8);
1233
1234 struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
1235 struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
1236 struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
1237 struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
1238 create_immed(b, addr_offset), 0,
1239 create_immed(b, size), 0);
1240 ldg->barrier_class = ldg->barrier_conflict = IR3_BARRIER_CONST_W;
1241 ldg->cat6.type = TYPE_U32;
1242
1243 if (a1) {
1244 ir3_instr_set_address(ldg, a1);
1245 ldg->flags |= IR3_INSTR_A1EN;
1246 }
1247
1248 /* The assembler isn't aware of what value a1.x has, so make sure that
1249 * constlen includes the ldg.k here.
1250 */
1251 ctx->so->constlen =
1252 MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + size * 4, 4));
1253
1254 array_insert(ctx->block, ctx->block->keeps, ldg);
1255 }
1256
1257
1258 /* handles direct/indirect UBO reads: */
1259 static void
1260 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1261 struct ir3_instruction **dst)
1262 {
1263 struct ir3_builder *b = &ctx->build;
1264 struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
1265 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1266 unsigned ubo = ir3_const_reg(const_state, IR3_CONST_ALLOC_UBO_PTRS, 0);
1267 const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
1268
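   /* The UBO contents are read through a pointer stored in the const file:
    * fetch the (lo, hi) base address, add the byte offset, then issue one
    * 32-bit ldg per component.
    */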
1269 int off = 0;
1270
1271 /* First src is ubo index, which could either be an immed or not: */
1272 src0 = ir3_get_src(ctx, &intr->src[0])[0];
1273 if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1274 base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
1275 base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
1276 } else {
1277 base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
1278 ir3_get_addr0(ctx, src0, ptrsz));
1279 base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
1280 ir3_get_addr0(ctx, src0, ptrsz));
1281
1282 /* NOTE: since relative addressing is used, make sure constlen is
1283 * at least big enough to cover all the UBO addresses, since the
1284 * assembler won't know what the max address reg is.
1285 */
1286 ctx->so->constlen = MAX2(
1287 ctx->so->constlen,
1288 const_state->allocs.consts[IR3_CONST_ALLOC_UBO_PTRS].offset_vec4 +
1289 (ctx->s->info.num_ubos * ptrsz));
1290 }
1291
1292 /* note: on 32-bit GPUs base_hi is ignored and DCE'd */
1293 addr = base_lo;
1294
1295 if (nir_src_is_const(intr->src[1])) {
1296 off += nir_src_as_uint(intr->src[1]);
1297 } else {
1298 /* For load_ubo_indirect, second src is indirect offset: */
1299 src1 = ir3_get_src(ctx, &intr->src[1])[0];
1300
1301 /* and add offset to addr: */
1302 addr = ir3_ADD_S(b, addr, 0, src1, 0);
1303 }
1304
1305 /* if offset is too large to encode in the ldg, split it out: */
1306 if ((off + (intr->num_components * 4)) > 1024) {
1307 /* split out the minimal amount to improve the odds that
1308 * cp can fit the immediate in the add.s instruction:
1309 */
1310 unsigned off2 = off + (intr->num_components * 4) - 1024;
1311 addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1312 off -= off2;
1313 }
1314
1315 if (ptrsz == 2) {
1316 struct ir3_instruction *carry;
1317
1318 /* handle 32b rollover, ie:
1319 * if (addr < base_lo)
1320 * base_hi++
1321 */
1322 carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
1323 carry->cat2.condition = IR3_COND_LT;
1324 base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
1325
1326 addr = ir3_collect(b, addr, base_hi);
1327 }
1328
1329 for (int i = 0; i < intr->num_components; i++) {
1330 struct ir3_instruction *load =
1331 ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
1332 create_immed(b, 1), 0); /* num components */
1333 load->cat6.type = TYPE_U32;
1334 dst[i] = load;
1335 }
1336 }
1337
1338 /* Load a kernel param: src[] = { address }. */
1339 static void
1340 emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
1341 nir_intrinsic_instr *intr,
1342 struct ir3_instruction **dst)
1343 {
1344 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1345 struct ir3_builder *b = &ctx->build;
1346 unsigned offset = nir_intrinsic_base(intr);
1347 unsigned p = ir3_const_reg(const_state, IR3_CONST_ALLOC_KERNEL_PARAMS, 0);
1348
1349 struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
1350
1351 if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1352 offset += src0->srcs[0]->iim_val;
1353
1354 /* kernel param position is in bytes, but constant space is 32b registers: */
1355 compile_assert(ctx, !(offset & 0x3));
1356
1357 dst[0] = create_uniform(b, p + (offset / 4));
1358 } else {
1359 /* kernel param position is in bytes, but constant space is 32b registers: */
1360 compile_assert(ctx, !(offset & 0x3));
1361
1362 /* TODO we should probably be lowering this in nir, and also handling
1363 * non-32b inputs.. Also we probably don't want to be using
1364 * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
1365 */
1366 src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);
1367
1368 dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
1369 ir3_get_addr0(ctx, src0, 1));
1370 }
1371 }
1372
1373 /* src[] = { block_index } */
1374 static void
1375 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1376 struct ir3_instruction **dst)
1377 {
1378 struct ir3_builder *b = &ctx->build;
1379 struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
1380 struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
1381 resinfo->cat6.iim_val = 1;
1382 resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
1383 resinfo->cat6.type = TYPE_U32;
1384 resinfo->cat6.typed = false;
1385 /* resinfo has no writemask and always writes out 3 components */
1386 resinfo->dsts[0]->wrmask = MASK(3);
1387 ir3_handle_bindless_cat6(resinfo, intr->src[0]);
1388 ir3_handle_nonuniform(resinfo, intr);
1389
1390 if (ctx->compiler->gen >= 6) {
1391 ir3_split_dest(b, dst, resinfo, 0, 1);
1392 } else {
1393 /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
1394 struct ir3_instruction *resinfo_dst[2];
1395 ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
1396 *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
1397 }
1398 }
1399
1400 /* src[] = { offset }. const_index[] = { base } */
1401 static void
1402 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1403 struct ir3_instruction **dst)
1404 {
1405 struct ir3_builder *b = &ctx->build;
1406 struct ir3_instruction *ldl, *offset;
1407 unsigned base;
1408
1409 offset = ir3_get_src(ctx, &intr->src[0])[0];
1410 base = nir_intrinsic_base(intr);
1411
1412 ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
1413 create_immed(b, intr->num_components), 0);
1414
1415 ldl->cat6.type = utype_def(&intr->def);
1416 ldl->dsts[0]->wrmask = MASK(intr->num_components);
1417
1418 ldl->barrier_class = IR3_BARRIER_SHARED_R;
1419 ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
1420
1421 ir3_split_dest(b, dst, ldl, 0, intr->num_components);
1422 }
1423
1424 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
1425 static void
1426 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1427 {
1428 struct ir3_builder *b = &ctx->build;
1429 struct ir3_instruction *stl, *offset;
1430 struct ir3_instruction *const *value;
1431 unsigned base, wrmask, ncomp;
1432
1433 value = ir3_get_src(ctx, &intr->src[0]);
1434 offset = ir3_get_src(ctx, &intr->src[1])[0];
1435
1436 base = nir_intrinsic_base(intr);
1437 wrmask = nir_intrinsic_write_mask(intr);
1438 ncomp = ffs(~wrmask) - 1;
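   /* The writemask is required to be a dense prefix (asserted below), so
    * ffs(~wrmask) - 1 is simply the number of components written.
    */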
1439
1440 assert(wrmask == BITFIELD_MASK(intr->num_components));
1441
1442 stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1443 create_immed(b, ncomp), 0);
1444 stl->cat6.dst_offset = base;
1445 stl->cat6.type = utype_src(intr->src[0]);
1446 stl->barrier_class = IR3_BARRIER_SHARED_W;
1447 stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1448
1449 array_insert(ctx->block, ctx->block->keeps, stl);
1450 }
1451
1452 /* src[] = { offset }. const_index[] = { base } */
1453 static void
1454 emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
1455 nir_intrinsic_instr *intr,
1456 struct ir3_instruction **dst)
1457 {
1458 struct ir3_builder *b = &ctx->build;
1459 struct ir3_instruction *load, *offset;
1460 unsigned base;
1461
1462 offset = ir3_get_src(ctx, &intr->src[0])[0];
1463 base = nir_intrinsic_base(intr);
1464
1465 load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
1466 create_immed(b, intr->num_components), 0);
1467
1468 /* for a650, use LDL for tess ctrl inputs: */
1469 if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
1470 load->opc = OPC_LDL;
1471
1472 load->cat6.type = utype_def(&intr->def);
1473 load->dsts[0]->wrmask = MASK(intr->num_components);
1474
1475 load->barrier_class = IR3_BARRIER_SHARED_R;
1476 load->barrier_conflict = IR3_BARRIER_SHARED_W;
1477
1478 ir3_split_dest(b, dst, load, 0, intr->num_components);
1479 }
1480
1481 /* src[] = { value, offset }. const_index[] = { base } */
1482 static void
1483 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
1484 nir_intrinsic_instr *intr)
1485 {
1486 struct ir3_builder *b = &ctx->build;
1487 struct ir3_instruction *store, *offset;
1488 struct ir3_instruction *const *value;
1489
1490 value = ir3_get_src(ctx, &intr->src[0]);
1491 offset = ir3_get_src(ctx, &intr->src[1])[0];
1492
1493 store = ir3_STLW(b, offset, 0,
1494 ir3_create_collect(b, value, intr->num_components), 0,
1495 create_immed(b, intr->num_components), 0);
1496
1497 /* for a650, use STL for vertex outputs used by tess ctrl shader: */
1498 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
1499 ctx->compiler->tess_use_shared)
1500 store->opc = OPC_STL;
1501
1502 store->cat6.dst_offset = nir_intrinsic_base(intr);
1503 store->cat6.type = utype_src(intr->src[0]);
1504 store->barrier_class = IR3_BARRIER_SHARED_W;
1505 store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1506
1507 array_insert(ctx->block, ctx->block->keeps, store);
1508 }
1509
1510 /*
1511 * CS shared variable atomic intrinsics
1512 *
1513 * All of the shared variable atomic memory operations read a value from
1514 * memory, compute a new value using one of the operations below, write the
1515 * new value to memory, and return the original value read.
1516 *
1517 * All operations take 2 sources except CompSwap that takes 3. These
1518 * sources represent:
1519 *
1520 * 0: The offset into the shared variable storage region that the atomic
1521 * operation will operate on.
1522 * 1: The data parameter to the atomic function (i.e. the value to add
1523 * in, etc).
1524 * 2: For CompSwap only: the second data parameter.
1525 */
1526 static struct ir3_instruction *
1527 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1528 {
1529 struct ir3_builder *b = &ctx->build;
1530 struct ir3_instruction *atomic, *src0, *src1;
1531 type_t type = TYPE_U32;
1532
1533 src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1534 src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1535
1536 switch (nir_intrinsic_atomic_op(intr)) {
1537 case nir_atomic_op_iadd:
1538 atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1539 break;
1540 case nir_atomic_op_imin:
1541 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1542 type = TYPE_S32;
1543 break;
1544 case nir_atomic_op_umin:
1545 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1546 break;
1547 case nir_atomic_op_imax:
1548 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1549 type = TYPE_S32;
1550 break;
1551 case nir_atomic_op_umax:
1552 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1553 break;
1554 case nir_atomic_op_iand:
1555 atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1556 break;
1557 case nir_atomic_op_ior:
1558 atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1559 break;
1560 case nir_atomic_op_ixor:
1561 atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1562 break;
1563 case nir_atomic_op_xchg:
1564 atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1565 break;
1566 case nir_atomic_op_cmpxchg:
1567 /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1568 src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1569 atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1570 break;
1571 default:
1572 unreachable("boo");
1573 }
1574
1575 atomic->cat6.iim_val = 1;
1576 atomic->cat6.d = 1;
1577 atomic->cat6.type = type;
1578 atomic->barrier_class = IR3_BARRIER_SHARED_W;
1579 atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1580
1581 /* even if nothing consumes the result, we can't DCE the instruction: */
1582 array_insert(ctx->block, ctx->block->keeps, atomic);
1583
1584 return atomic;
1585 }
1586
1587 static void
1588 stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
1589 struct ir3_instruction **offset, int32_t *base)
1590 {
1591 struct ir3_builder *b = &ctx->build;
1592
1593 if (nir_src_is_const(*src)) {
1594 unsigned src_offset = nir_src_as_uint(*src);
1595 /* The base offset field is only 13 bits, and it's signed. Try to make the
1596 * offset constant whenever the original offsets are similar, to avoid
1597 * creating too many constants in the final shader.
1598 */
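      /* base gets the sign-extended low 13 bits of src_offset; the remaining
       * upper bits become the immediate offset, so accesses whose offsets
       * agree in those upper bits share the same immediate constant.
       */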
1599 *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
1600 uint32_t offset_val = src_offset - *base;
1601 *offset = create_immed(b, offset_val);
1602 } else {
1603 /* TODO: match on nir_iadd with a constant that fits */
1604 *base = 0;
1605 *offset = ir3_get_src(ctx, src)[0];
1606 }
1607 }
1608
1609 /* src[] = { offset }. */
1610 static void
1611 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1612 struct ir3_instruction **dst)
1613 {
1614 struct ir3_builder *b = &ctx->build;
1615 struct ir3_instruction *ldp, *offset;
1616 int32_t base;
1617
1618 stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
1619
1620 ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
1621 create_immed(b, intr->num_components), 0);
1622
1623 ldp->cat6.type = utype_def(&intr->def);
1624 ldp->dsts[0]->wrmask = MASK(intr->num_components);
1625
1626 ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1627 ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1628
1629 ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1630 }
1631
1632 /* src[] = { value, offset }. const_index[] = { write_mask } */
1633 static void
1634 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1635 {
1636 struct ir3_builder *b = &ctx->build;
1637 struct ir3_instruction *stp, *offset;
1638 struct ir3_instruction *const *value;
1639 unsigned wrmask, ncomp;
1640 int32_t base;
1641
1642 value = ir3_get_src(ctx, &intr->src[0]);
1643
1644 stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
1645
1646 wrmask = nir_intrinsic_write_mask(intr);
1647 ncomp = ffs(~wrmask) - 1;
1648
1649 assert(wrmask == BITFIELD_MASK(intr->num_components));
1650
1651 stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1652 create_immed(b, ncomp), 0);
1653 stp->cat6.dst_offset = base;
1654 stp->cat6.type = utype_src(intr->src[0]);
1655 stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1656 stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1657
1658 array_insert(ctx->block, ctx->block->keeps, stp);
1659 }
1660
1661 struct tex_src_info {
1662 /* For prefetch */
1663 unsigned tex_base, samp_base, tex_idx, samp_idx;
1664 /* For normal tex instructions */
1665 unsigned base, a1_val, flags;
1666 struct ir3_instruction *samp_tex;
1667 };
1668
1669 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1670 * to handle with the image_mapping table..
1671 */
1672 static struct tex_src_info
1673 get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src, bool image)
1674 {
1675 struct ir3_builder *b = &ctx->build;
1676 struct tex_src_info info = {0};
1677 nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);
1678
1679 if (bindless_tex) {
1680 /* Bindless case */
1681 ctx->so->bindless_tex = true;
1682 info.flags |= IR3_INSTR_B;
1683
1684 /* Gather information required to determine which encoding to
1685 * choose as well as for prefetch.
1686 */
1687 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1688 bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1689 if (tex_const)
1690 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1691 info.samp_idx = 0;
1692
1693 /* Choose encoding. */
1694 if (tex_const && info.tex_idx < 256) {
1695 if (info.tex_idx < 16) {
1696 /* Everything fits within the instruction */
1697 info.base = info.tex_base;
1698 } else {
1699 info.base = info.tex_base;
1700 if (ctx->compiler->gen <= 6) {
1701 info.a1_val = info.tex_idx << 3;
1702 } else {
1703 info.a1_val = info.samp_idx << 3;
1704 }
1705 info.flags |= IR3_INSTR_A1EN;
1706 }
1707 info.samp_tex = NULL;
1708 } else {
1709 info.flags |= IR3_INSTR_S2EN;
1710 info.base = info.tex_base;
1711
1712 /* Note: the indirect source is now a vec2 instead of hvec2 */
1713 struct ir3_instruction *texture, *sampler;
1714
1715 texture = ir3_get_src(ctx, src)[0];
1716 sampler = create_immed(b, 0);
1717 info.samp_tex = ir3_collect(b, texture, sampler);
1718 }
1719 } else {
1720 info.flags |= IR3_INSTR_S2EN;
1721 unsigned slot = nir_src_as_uint(*src);
1722 unsigned tex_idx = image ?
1723 ir3_image_to_tex(&ctx->so->image_mapping, slot) :
1724 ir3_ssbo_to_tex(&ctx->so->image_mapping, slot);
1725 struct ir3_instruction *texture, *sampler;
1726
1727 ctx->so->num_samp = MAX2(ctx->so->num_samp, tex_idx + 1);
1728
1729 texture = create_immed_typed(b, tex_idx, TYPE_U16);
1730 sampler = create_immed_typed(b, tex_idx, TYPE_U16);
1731
1732 info.samp_tex = ir3_collect(b, texture, sampler);
1733 }
1734
1735 return info;
1736 }
1737
1738 static struct ir3_instruction *
1739 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1740 type_t type, unsigned wrmask, struct ir3_instruction *src0,
1741 struct ir3_instruction *src1)
1742 {
1743 struct ir3_instruction *sam, *addr;
1744 if (info.flags & IR3_INSTR_A1EN) {
1745 addr = ir3_get_addr1(ctx, info.a1_val);
1746 }
1747 sam = ir3_SAM(&ctx->build, opc, type, wrmask, info.flags, info.samp_tex,
1748 src0, src1);
1749 if (info.flags & IR3_INSTR_A1EN) {
1750 ir3_instr_set_address(sam, addr);
1751 }
1752 if (info.flags & IR3_INSTR_B) {
1753 sam->cat5.tex_base = info.base;
1754 sam->cat5.samp = info.samp_idx;
1755 sam->cat5.tex = info.tex_idx;
1756 }
1757 return sam;
1758 }
1759
1760 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1761 static void
1762 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1763 struct ir3_instruction **dst)
1764 {
1765 /* If the image can be written, we must use LDIB to retrieve data rather than
1766 * going through ISAM (which uses the texture cache and won't see previous writes).
1767 */
1768 if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
1769 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1770 return;
1771 }
1772
1773 /* The sparse set of texture descriptors for non-coherent load_images means we can't do indirection, so
1774 * fall back to coherent load.
1775 */
1776 if (ctx->compiler->gen >= 5 &&
1777 !ir3_bindless_resource(intr->src[0]) &&
1778 !nir_src_is_const(intr->src[0])) {
1779 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1780 return;
1781 }
1782
1783 struct ir3_builder *b = &ctx->build;
1784 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1785 struct ir3_instruction *sam;
1786 struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1787 struct ir3_instruction *coords[4];
1788 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1789 type_t type = ir3_get_type_for_image_intrinsic(intr);
1790
1791 info.flags |= flags;
1792
1793 /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
1794 * y coord. Note that the array index must come after the fake y coord.
1795 */
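   /* For example, a 1D-array load whose NIR coords are (x, layer) is emitted
    * below as (x, 0, layer).
    */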
1796 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1797 if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
1798 coords[0] = src0[0];
1799 coords[1] = create_immed(b, 0);
1800 for (unsigned i = 1; i < ncoords; i++)
1801 coords[i + 1] = src0[i];
1802 ncoords++;
1803 } else {
1804 for (unsigned i = 0; i < ncoords; i++)
1805 coords[i] = src0[i];
1806 }
1807
1808 sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1809 ir3_create_collect(b, coords, ncoords), NULL);
1810
1811 ir3_handle_nonuniform(sam, intr);
1812
1813 sam->barrier_class = IR3_BARRIER_IMAGE_R;
1814 sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1815
1816 ir3_split_dest(b, dst, sam, 0, 4);
1817 }
1818
1819 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1820 void
1821 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1822 nir_intrinsic_instr *intr,
1823 struct ir3_instruction **dst)
1824 {
1825 struct ir3_builder *b = &ctx->build;
1826 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1827 struct ir3_instruction *sam, *lod;
1828 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1829 type_t dst_type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
1830
1831 info.flags |= flags;
1832 assert(nir_src_as_uint(intr->src[1]) == 0);
1833 lod = create_immed(b, 0);
1834 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1835
1836 /* Array size actually ends up in .w rather than .z. This doesn't
1837 * matter for miplevel 0, but for higher mips the value in z is
1838 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1839 * returned, which means that we have to add 1 to it for arrays for
1840 * a3xx.
1841 *
1842 * Note: use a temporary dst and then copy, since the size of the dst
1843 * array that is passed in is based on nir's understanding of the
1844 * result size, not the hardware's.
1845 */
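   /* E.g. for a 2D-array image ncoords is 3, so the array size taken from
    * tmp[3] (.w) replaces dst[2] below, with +1 applied on generations where
    * levels_add_one is set.
    */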
1846 struct ir3_instruction *tmp[4];
1847
1848 ir3_split_dest(b, tmp, sam, 0, 4);
1849
1850 for (unsigned i = 0; i < ncoords; i++)
1851 dst[i] = tmp[i];
1852
1853 if (flags & IR3_INSTR_A) {
1854 if (ctx->compiler->levels_add_one) {
1855 dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1856 } else {
1857 dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1858 }
1859 }
1860 }
1861
1862 static struct tex_src_info
1863 get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
1864 nir_src *samp)
1865 {
1866 struct ir3_builder *b = &ctx->build;
1867 struct tex_src_info info = {0};
1868
1869 info.flags |= IR3_INSTR_B;
1870
1871 /* Gather information required to determine which encoding to
1872 * choose as well as for prefetch.
1873 */
1874 nir_intrinsic_instr *bindless_tex = NULL;
1875 bool tex_const;
1876 if (tex) {
1877 ctx->so->bindless_tex = true;
1878 bindless_tex = ir3_bindless_resource(*tex);
1879 assert(bindless_tex);
1880 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1881 tex_const = nir_src_is_const(bindless_tex->src[0]);
1882 if (tex_const)
1883 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1884 } else {
1885 /* To simplify some of the logic below, assume the index is
1886 * constant 0 when it's not enabled.
1887 */
1888 tex_const = true;
1889 info.tex_idx = 0;
1890 }
1891 nir_intrinsic_instr *bindless_samp = NULL;
1892 bool samp_const;
1893 if (samp) {
1894 ctx->so->bindless_samp = true;
1895 bindless_samp = ir3_bindless_resource(*samp);
1896 assert(bindless_samp);
1897 info.samp_base = nir_intrinsic_desc_set(bindless_samp);
1898 samp_const = nir_src_is_const(bindless_samp->src[0]);
1899 if (samp_const)
1900 info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
1901 } else {
1902 samp_const = true;
1903 info.samp_idx = 0;
1904 }
1905
1906 /* Choose encoding. */
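   /* In short: constant indices below 16 with a shared base fit directly in
    * the instruction; constant indices below 256 go through a1.x (A1EN);
    * anything else takes the indirect S2EN path with a (texture, sampler)
    * collect.
    */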
1907 if (tex_const && samp_const && info.tex_idx < 256 &&
1908 info.samp_idx < 256) {
1909 if (info.tex_idx < 16 && info.samp_idx < 16 &&
1910 (!bindless_tex || !bindless_samp ||
1911 info.tex_base == info.samp_base)) {
1912 /* Everything fits within the instruction */
1913 info.base = info.tex_base;
1914 } else {
1915 info.base = info.tex_base;
1916 if (ctx->compiler->gen <= 6) {
1917 info.a1_val = info.tex_idx << 3 | info.samp_base;
1918 } else {
1919 info.a1_val = info.samp_idx << 3 | info.samp_base;
1920 }
1921
1922 info.flags |= IR3_INSTR_A1EN;
1923 }
1924 info.samp_tex = NULL;
1925 } else {
1926 info.flags |= IR3_INSTR_S2EN;
1927 /* In the indirect case, we only use a1.x to store the sampler
1928 * base if it differs from the texture base.
1929 */
1930 if (!bindless_tex || !bindless_samp ||
1931 info.tex_base == info.samp_base) {
1932 info.base = info.tex_base;
1933 } else {
1934 info.base = info.tex_base;
1935 info.a1_val = info.samp_base;
1936 info.flags |= IR3_INSTR_A1EN;
1937 }
1938
1939 /* Note: the indirect source is now a vec2 instead of hvec2
1940 */
1941 struct ir3_instruction *texture, *sampler;
1942
1943 if (bindless_tex) {
1944 texture = ir3_get_src(ctx, tex)[0];
1945 } else {
1946 texture = create_immed(b, 0);
1947 }
1948
1949 if (bindless_samp) {
1950 sampler = ir3_get_src(ctx, samp)[0];
1951 } else {
1952 sampler = create_immed(b, 0);
1953 }
1954 info.samp_tex = ir3_collect(b, texture, sampler);
1955 }
1956
1957 return info;
1958 }
1959
1960 static void
1961 emit_readonly_load_uav(struct ir3_context *ctx,
1962 nir_intrinsic_instr *intr,
1963 nir_src *index,
1964 struct ir3_instruction *coords,
1965 unsigned imm_offset,
1966 bool uav_load,
1967 struct ir3_instruction **dst)
1968 {
1969 struct ir3_builder *b = &ctx->build;
1970 struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, index, false);
1971
1972 unsigned num_components = intr->def.num_components;
1973 struct ir3_instruction *sam =
1974 emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
1975 MASK(num_components), coords, create_immed(b, imm_offset));
1976
1977 ir3_handle_nonuniform(sam, intr);
1978
1979 sam->barrier_class = IR3_BARRIER_BUFFER_R;
1980 sam->barrier_conflict = IR3_BARRIER_BUFFER_W;
1981
1982 ir3_split_dest(b, dst, sam, 0, num_components);
1983
1984 if (ctx->compiler->has_isam_v && !uav_load) {
1985 sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
1986
1987 if (imm_offset) {
1988 sam->flags |= IR3_INSTR_IMM_OFFSET;
1989 }
1990 }
1991 }
1992
1993 /* src[] = { buffer_index, offset }. No const_index */
1994 static void
1995 emit_intrinsic_load_ssbo(struct ir3_context *ctx,
1996 nir_intrinsic_instr *intr,
1997 struct ir3_instruction **dst)
1998 {
1999 /* Note: we can only use isam for vectorized loads/stores if isam.v is
2000 * available.
2001 * Note: isam also can't handle 8-bit loads.
2002 */
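   /* As with image loads above, isam goes through the texture cache, so it is
    * only safe for reorderable (read-only) access; everything else falls back
    * to the generation-specific path below.
    */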
2003 if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
2004 (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
2005 (ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
2006 !ctx->compiler->has_isam_ssbo) {
2007 ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
2008 return;
2009 }
2010
2011 struct ir3_builder *b = &ctx->build;
2012 nir_src *offset_src = &intr->src[2];
2013 struct ir3_instruction *coords = NULL;
2014 unsigned imm_offset = 0;
2015
2016 if (ctx->compiler->has_isam_v) {
2017 ir3_lower_imm_offset(ctx, intr, offset_src, 8, &coords, &imm_offset);
2018 } else {
2019 coords =
2020 ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0));
2021 }
2022
2023 emit_readonly_load_uav(ctx, intr, &intr->src[0], coords, imm_offset, false, dst);
2024 }
2025
2026 static void
2027 emit_intrinsic_load_uav(struct ir3_context *ctx,
2028 nir_intrinsic_instr *intr,
2029 struct ir3_instruction **dst)
2030 {
2031 /* Note: isam currently can't handle vectorized loads/stores */
2032 if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
2033 intr->def.num_components > 1 ||
2034 !ctx->compiler->has_isam_ssbo) {
2035 ctx->funcs->emit_intrinsic_load_uav(ctx, intr, dst);
2036 return;
2037 }
2038
2039 struct ir3_builder *b = &ctx->build;
2040 struct ir3_instruction *coords =
2041 ir3_create_collect(b, ir3_get_src(ctx, &intr->src[1]), 2);
2042 emit_readonly_load_uav(ctx, intr, &intr->src[0], coords, 0, true, dst);
2043 }
2044
2045 static void
2046 emit_control_barrier(struct ir3_context *ctx)
2047 {
2048 /* Hull shaders dispatch 32 wide so an entire patch will always
2049 * fit in a single warp and execute in lock-step. Consequently,
2050 * we don't need to do anything for TCS barriers. Emitting a
2051 * barrier instruction would deadlock.
2052 */
2053 if (ctx->so->type == MESA_SHADER_TESS_CTRL)
2054 return;
2055
2056 struct ir3_builder *b = &ctx->build;
2057 struct ir3_instruction *barrier = ir3_BAR(b);
2058 barrier->cat7.g = true;
2059 if (ctx->compiler->gen < 6)
2060 barrier->cat7.l = true;
2061 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
2062 barrier->barrier_class = IR3_BARRIER_EVERYTHING;
2063 array_insert(ctx->block, ctx->block->keeps, barrier);
2064
2065 ctx->so->has_barrier = true;
2066 }
2067
2068 static void
2069 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2070 {
2071 struct ir3_builder *b = &ctx->build;
2072 struct ir3_instruction *barrier;
2073
2074 /* TODO: find out why there is a major difference of .l usage
2075 * between a5xx and a6xx.
2076 */
2077
2078 mesa_scope exec_scope = nir_intrinsic_execution_scope(intr);
2079 mesa_scope mem_scope = nir_intrinsic_memory_scope(intr);
2080 nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2081 /* loads/stores are always cache-coherent so we can filter out
2082 * available/visible.
2083 */
2084 nir_memory_semantics semantics =
2085 nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
2086 NIR_MEMORY_RELEASE);
2087
2088 if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
2089 /* Remove mode corresponding to TCS patch barriers because hull shaders
2090 * dispatch 32 wide so an entire patch will always fit in a single warp
2091 * and execute in lock-step.
2092 *
2093 * TODO: memory barrier also tells us not to reorder stores, this
2094 * information is lost here (backend doesn't reorder stores so we
2095 * are safe for now).
2096 */
2097 modes &= ~nir_var_shader_out;
2098 }
2099
2100 assert(!(modes & nir_var_shader_out));
2101
2102 if ((modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
2103 nir_var_image)) && semantics) {
2104 barrier = ir3_FENCE(b);
2105 barrier->cat7.r = true;
2106 barrier->cat7.w = true;
2107
2108 if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
2109 barrier->cat7.g = true;
2110 }
2111
2112 if (ctx->compiler->gen >= 6) {
2113 if (modes & (nir_var_mem_ssbo | nir_var_image)) {
2114 barrier->cat7.l = true;
2115 }
2116 } else {
2117 if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
2118 barrier->cat7.l = true;
2119 }
2120 }
2121
2122 barrier->barrier_class = 0;
2123 barrier->barrier_conflict = 0;
2124
2125 if (modes & nir_var_mem_shared) {
2126 barrier->barrier_class |= IR3_BARRIER_SHARED_W;
2127 barrier->barrier_conflict |=
2128 IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
2129 }
2130
2131 if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
2132 barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
2133 barrier->barrier_conflict |=
2134 IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
2135 }
2136
2137 if (modes & nir_var_image) {
2138 barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
2139 barrier->barrier_conflict |=
2140 IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
2141 }
2142
2143 /* make sure barrier doesn't get DCE'd */
2144 array_insert(ctx->block, ctx->block->keeps, barrier);
2145
2146 if (ctx->compiler->gen >= 7 && mem_scope > SCOPE_WORKGROUP &&
2147 modes & (nir_var_mem_ssbo | nir_var_image) &&
2148 semantics & NIR_MEMORY_ACQUIRE) {
2149 /* "r + l" is not enough to synchronize reads with writes from other
2150 * workgroups, so we can disable them since they are useless here.
2151 */
2152 barrier->cat7.r = false;
2153 barrier->cat7.l = false;
2154
2155 struct ir3_instruction *ccinv = ir3_CCINV(b);
2156 /* A7XX TODO: ccinv should just stick to the barrier,
2157 * the barrier class/conflict introduces unnecessary waits.
2158 */
2159 ccinv->barrier_class = barrier->barrier_class;
2160 ccinv->barrier_conflict = barrier->barrier_conflict;
2161 array_insert(ctx->block, ctx->block->keeps, ccinv);
2162 }
2163 }
2164
2165 if (exec_scope >= SCOPE_WORKGROUP) {
2166 emit_control_barrier(ctx);
2167 }
2168 }
2169
2170 static void
2171 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
2172 unsigned compmask, struct ir3_instruction *instr)
2173 {
2174 struct ir3_shader_variant *so = ctx->so;
2175 unsigned n = so->inputs_count++;
2176
2177 assert(instr->opc == OPC_META_INPUT);
2178 instr->input.inidx = n;
2179 instr->input.sysval = slot;
2180
2181 so->inputs[n].sysval = true;
2182 so->inputs[n].slot = slot;
2183 so->inputs[n].compmask = compmask;
2184 so->total_in++;
2185
2186 so->sysval_in += util_last_bit(compmask);
2187 }
2188
2189 static struct ir3_instruction *
2190 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
2191 unsigned compmask)
2192 {
2193 assert(compmask);
2194 struct ir3_instruction *sysval = create_input(ctx, compmask);
2195 add_sysval_input_compmask(ctx, slot, compmask, sysval);
2196 return sysval;
2197 }
2198
2199 static struct ir3_instruction *
2200 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
2201 {
2202 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
2203 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
2204 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
2205 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
2206 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
2207 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
2208 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
2209 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
2210 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
2211 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
2212 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
2213 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
2214 STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
2215 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
2216
2217 if (!ctx->ij[bary]) {
2218 struct ir3_instruction *xy[2];
2219 struct ir3_instruction *ij;
2220 struct ir3_builder build =
2221 ir3_builder_at(ir3_before_terminator(ctx->in_block));
2222
2223 ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
2224 bary, 0x3);
2225 ir3_split_dest(&build, xy, ij, 0, 2);
2226
2227 ctx->ij[bary] = ir3_create_collect(&build, xy, 2);
2228 }
2229
2230 return ctx->ij[bary];
2231 }
2232
2233 /* TODO: make this a common NIR helper?
2234 * There is a nir_system_value_from_intrinsic, but it takes a nir_intrinsic_op so
2235 * it can't be extended to work with this.
2236 */
2237 static gl_system_value
2238 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
2239 {
2240 enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
2241 gl_system_value sysval;
2242
2243 switch (intr->intrinsic) {
2244 case nir_intrinsic_load_barycentric_pixel:
2245 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2246 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2247 else
2248 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2249 break;
2250 case nir_intrinsic_load_barycentric_centroid:
2251 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2252 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
2253 else
2254 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
2255 break;
2256 case nir_intrinsic_load_barycentric_sample:
2257 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2258 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
2259 else
2260 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
2261 break;
2262 default:
2263 unreachable("invalid barycentric intrinsic");
2264 }
2265
2266 return sysval;
2267 }
2268
2269 static void
2270 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
2271 struct ir3_instruction **dst)
2272 {
2273 gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
2274
2275 if (!ctx->so->key.msaa && ctx->compiler->gen < 6) {
2276 switch (sysval) {
2277 case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
2278 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2279 break;
2280 case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
2281 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2282 break;
2283 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
2284 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2285 break;
2286 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
2287 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2288 break;
2289 default:
2290 break;
2291 }
2292 }
2293
2294 enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2295
2296 struct ir3_instruction *ij = get_barycentric(ctx, bary);
2297 ir3_split_dest(&ctx->build, dst, ij, 0, 2);
2298 }
2299
2300 static struct ir3_instruction *
2301 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2302 {
2303 if (!ctx->frag_coord) {
2304 struct ir3_block *block = ir3_after_preamble(ctx->ir);
2305 struct ir3_builder b = ir3_builder_at(ir3_before_terminator(block));
2306 struct ir3_instruction_rpt xyzw;
2307 struct ir3_instruction *hw_frag_coord;
2308
2309 hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
2310 ir3_split_dest(&b, xyzw.rpts, hw_frag_coord, 0, 4);
2311
2312 /* for frag_coord.xy, we get unsigned values.. we need
2313 * to subtract (integer) 8 and divide by 16 (right-
2314 * shift by 4) then convert to float:
2315 *
2316 * sub.s tmp, src, 8
2317 * shr.b tmp, tmp, 4
2318 * mov.u32f32 dst, tmp
2319 *
2320 */
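      /* In the sequence emitted below, the divide-by-16 is realized as a
       * float multiply by 1.0/16.0 after the u32->f32 conversion; no separate
       * integer subtract/shift instructions are emitted here.
       */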
2321 struct ir3_instruction_rpt xy =
2322 ir3_COV_rpt(&b, 2, xyzw, TYPE_U32, TYPE_F32);
2323 xy = ir3_MUL_F_rpt(&b, 2, xy, 0, create_immed_rpt(&b, 2, fui(1.0 / 16.0)),
2324 0);
2325 cp_instrs(xyzw.rpts, xy.rpts, 2);
2326 ctx->frag_coord = ir3_create_collect(&b, xyzw.rpts, 4);
2327 }
2328
2329 ctx->so->fragcoord_compmask |= nir_def_components_read(&intr->def);
2330
2331 return ctx->frag_coord;
2332 }
2333
2334 /* This is a bit of a hack until ir3_context is converted to store SSA values
2335 * as ir3_register's instead of ir3_instruction's. Pick out a given destination
2336 * of an instruction with multiple destinations using a mov that will get folded
2337 * away by ir3_cp.
2338 */
2339 static struct ir3_instruction *
2340 create_multidst_mov(struct ir3_builder *build, struct ir3_register *dst)
2341 {
2342 struct ir3_instruction *mov = ir3_build_instr(build, OPC_MOV, 1, 1);
2343 unsigned dst_flags = dst->flags & IR3_REG_HALF;
2344 unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
2345
2346 __ssa_dst(mov)->flags |= dst_flags;
2347 struct ir3_register *src =
2348 ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
2349 src->wrmask = dst->wrmask;
2350 src->def = dst;
2351 assert(!(dst->flags & IR3_REG_RELATIV));
2352 mov->cat1.src_type = mov->cat1.dst_type =
2353 (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
2354 return mov;
2355 }
2356
2357 static reduce_op_t
2358 get_reduce_op(nir_op opc)
2359 {
2360 switch (opc) {
2361 case nir_op_iadd: return REDUCE_OP_ADD_U;
2362 case nir_op_fadd: return REDUCE_OP_ADD_F;
2363 case nir_op_imul: return REDUCE_OP_MUL_U;
2364 case nir_op_fmul: return REDUCE_OP_MUL_F;
2365 case nir_op_umin: return REDUCE_OP_MIN_U;
2366 case nir_op_imin: return REDUCE_OP_MIN_S;
2367 case nir_op_fmin: return REDUCE_OP_MIN_F;
2368 case nir_op_umax: return REDUCE_OP_MAX_U;
2369 case nir_op_imax: return REDUCE_OP_MAX_S;
2370 case nir_op_fmax: return REDUCE_OP_MAX_F;
2371 case nir_op_iand: return REDUCE_OP_AND_B;
2372 case nir_op_ior: return REDUCE_OP_OR_B;
2373 case nir_op_ixor: return REDUCE_OP_XOR_B;
2374 default:
2375 unreachable("unknown NIR reduce op");
2376 }
2377 }
2378
2379 static uint32_t
2380 get_reduce_identity(nir_op opc, unsigned size)
2381 {
2382 switch (opc) {
2383 case nir_op_iadd:
2384 return 0;
2385 case nir_op_fadd:
2386 return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
2387 case nir_op_imul:
2388 return 1;
2389 case nir_op_fmul:
2390 return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
2391 case nir_op_umax:
2392 return 0;
2393 case nir_op_imax:
2394 return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
2395 case nir_op_fmax:
2396 return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
2397 case nir_op_umin:
2398 return size == 32 ? UINT32_MAX : UINT16_MAX;
2399 case nir_op_imin:
2400 return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
2401 case nir_op_fmin:
2402 return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
2403 case nir_op_iand:
2404 return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
2405 case nir_op_ior:
2406 return 0;
2407 case nir_op_ixor:
2408 return 0;
2409 default:
2410 unreachable("unknown NIR reduce op");
2411 }
2412 }
2413
2414 static struct ir3_instruction *
2415 emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2416 {
2417 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2418 nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
2419 reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2420 unsigned dst_size = intr->def.bit_size;
2421 unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
2422
2423 /* Note: the shared reg is initialized to the identity, so we need it to
2424 * always be 32-bit even when the source isn't because half shared regs are
2425 * not supported.
2426 */
2427 struct ir3_instruction *identity = create_immed_shared(
2428 &ctx->build, get_reduce_identity(nir_reduce_op, dst_size), true);
2429
2430 /* OPC_SCAN_MACRO has the following destinations:
2431 * - Exclusive scan result (interferes with source)
2432 * - Inclusive scan result
2433 * - Shared reg reduction result, must be initialized to the identity
2434 *
2435 * The loop computes all three results at the same time, we just have to
2436 * choose which destination to return.
2437 */
2438 struct ir3_instruction *scan =
2439 ir3_build_instr(&ctx->build, OPC_SCAN_MACRO, 3, 2);
2440 scan->cat1.reduce_op = reduce_op;
2441
2442 struct ir3_register *exclusive = __ssa_dst(scan);
2443 exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
2444 struct ir3_register *inclusive = __ssa_dst(scan);
2445 inclusive->flags |= flags;
2446 struct ir3_register *reduce = __ssa_dst(scan);
2447 reduce->flags |= IR3_REG_SHARED;
2448
2449 /* The 32-bit multiply macro reads its sources after writing a partial result
2450 * to the destination, therefore inclusive also interferes with the source.
2451 */
2452 if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
2453 inclusive->flags |= IR3_REG_EARLY_CLOBBER;
2454
2455 /* Normal source */
2456 __ssa_src(scan, src, 0);
2457
2458 /* shared reg tied source */
2459 struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2460 ir3_reg_tie(reduce, reduce_init);
2461
2462 struct ir3_register *dst;
2463 switch (intr->intrinsic) {
2464 case nir_intrinsic_reduce: dst = reduce; break;
2465 case nir_intrinsic_inclusive_scan: dst = inclusive; break;
2466 case nir_intrinsic_exclusive_scan: dst = exclusive; break;
2467 default:
2468 unreachable("unknown reduce intrinsic");
2469 }
2470
2471 return create_multidst_mov(&ctx->build, dst);
2472 }
2473
2474 static struct ir3_instruction *
2475 emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
2476 nir_intrinsic_instr *intr)
2477 {
2478 nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
2479 reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2480 unsigned dst_size = intr->def.bit_size;
2481
2482 bool need_exclusive =
2483 intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
2484 bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
2485
2486 /* Note: the shared reg is initialized to the identity, so we need it to
2487 * always be 32-bit even when the source isn't because half shared regs are
2488 * not supported.
2489 */
2490 struct ir3_instruction *identity = create_immed_shared(
2491 &ctx->build, get_reduce_identity(nir_reduce_op, dst_size), true);
2492
2493 struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
2494
2495 struct ir3_instruction *exclusive_src = NULL;
2496 if (need_exclusive)
2497 exclusive_src = ir3_get_src(ctx, &intr->src[1])[0];
2498
2499 /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
2500 * - Shared reg reduction result, must be initialized to the identity
2501 * - Inclusive scan result
2502 * - (iff exclusive) Exclusive scan result. Conditionally added because
2503 * calculating the exclusive value is optional (i.e., not a side-effect of
2504 * calculating the inclusive value) and won't be DCE'd anymore at this
2505 * point.
2506 * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
2507 * most ops but this isn't possible for the 32b mul_u macro since its
2508 * destination is clobbered. So conditionally allocate an extra
2509 * register in that case.
2510 *
2511 * Note that the getlast loop this macro expands to iterates over all
2512 * clusters. However, for each iteration, not only the fibers in the current
2513 * cluster are active but all later ones as well. Since they still need their
2514 * sources when their cluster is handled, all destinations interfere with
2515 * the sources.
2516 */
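   /* So, depending on need_exclusive/need_scratch, the macro below is created
    * with 2-4 destinations and 2-3 sources.
    */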
2517 unsigned ndst = 2 + need_exclusive + need_scratch;
2518 unsigned nsrc = 2 + need_exclusive;
2519 struct ir3_instruction *scan =
2520 ir3_build_instr(&ctx->build, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
2521 scan->cat1.reduce_op = reduce_op;
2522
2523 unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
2524 if (ir3_bitsize(ctx, dst_size) == 16)
2525 dst_flags |= IR3_REG_HALF;
2526
2527 struct ir3_register *reduce = __ssa_dst(scan);
2528 reduce->flags |= IR3_REG_SHARED;
2529 struct ir3_register *inclusive = __ssa_dst(scan);
2530 inclusive->flags |= dst_flags;
2531
2532 struct ir3_register *exclusive = NULL;
2533 if (need_exclusive) {
2534 exclusive = __ssa_dst(scan);
2535 exclusive->flags |= dst_flags;
2536 }
2537
2538 if (need_scratch) {
2539 struct ir3_register *scratch = __ssa_dst(scan);
2540 scratch->flags |= dst_flags;
2541 }
2542
2543 struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2544 ir3_reg_tie(reduce, reduce_init);
2545
2546 __ssa_src(scan, inclusive_src, 0);
2547
2548 if (need_exclusive)
2549 __ssa_src(scan, exclusive_src, 0);
2550
2551 struct ir3_register *dst;
2552 switch (intr->intrinsic) {
2553 case nir_intrinsic_reduce_clusters_ir3:
2554 dst = reduce;
2555 break;
2556 case nir_intrinsic_inclusive_scan_clusters_ir3:
2557 dst = inclusive;
2558 break;
2559 case nir_intrinsic_exclusive_scan_clusters_ir3: {
2560 assert(exclusive != NULL);
2561 dst = exclusive;
2562 break;
2563 }
2564 default:
2565 unreachable("unknown reduce intrinsic");
2566 }
2567
2568 return create_multidst_mov(&ctx->build, dst);
2569 }
2570
2571 static struct ir3_instruction *
2572 emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2573 {
2574 struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
2575 struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
2576 return ir3_BRCST_ACTIVE(&ctx->build, nir_intrinsic_cluster_size(intr),
2577 brcst_val, default_src);
2578 }
2579
2580 static ir3_shfl_mode
2581 shfl_mode(nir_intrinsic_instr *intr)
2582 {
2583 switch (intr->intrinsic) {
2584 case nir_intrinsic_rotate:
2585 return SHFL_RDOWN;
2586 case nir_intrinsic_shuffle_up_uniform_ir3:
2587 return SHFL_RUP;
2588 case nir_intrinsic_shuffle_down_uniform_ir3:
2589 return SHFL_RDOWN;
2590 case nir_intrinsic_shuffle_xor_uniform_ir3:
2591 return SHFL_XOR;
2592 default:
2593 unreachable("unsupported shfl");
2594 }
2595 }
2596
2597 static struct ir3_instruction *
2598 emit_shfl(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2599 {
2600 assert(ctx->compiler->has_shfl);
2601
2602 struct ir3_instruction *val = ir3_get_src(ctx, &intr->src[0])[0];
2603 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
2604
2605 struct ir3_instruction *shfl = ir3_SHFL(&ctx->build, val, 0, idx, 0);
2606 shfl->cat6.shfl_mode = shfl_mode(intr);
2607 shfl->cat6.type = is_half(val) ? TYPE_U16 : TYPE_U32;
2608
2609 return shfl;
2610 }
2611
2612 static void
2613 emit_ray_intersection(struct ir3_context *ctx, nir_intrinsic_instr *intr,
2614 struct ir3_instruction **dst)
2615 {
2616 struct ir3_builder *b = &ctx->build;
2617
2618 ctx->so->info.uses_ray_intersection = true;
2619
2620 struct ir3_instruction *bvh_base =
2621 ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), 2);
2622 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
2623
2624 struct ir3_instruction *ray_info =
2625 ir3_create_collect(b, ir3_get_src(ctx, &intr->src[2]), 8);
2626 struct ir3_instruction *flags = ir3_get_src(ctx, &intr->src[3])[0];
2627
2628 struct ir3_instruction *dst_init =
2629 ir3_collect(b, NULL, NULL, NULL, create_immed(b, 0), NULL);
2630
2631 struct ir3_instruction *ray_intersection =
2632 ir3_RAY_INTERSECTION(b, bvh_base, 0, idx, 0, ray_info, 0, flags, 0,
2633 dst_init, 0);
2634 ray_intersection->dsts[0]->wrmask = MASK(5);
2635 ir3_reg_tie(ray_intersection->dsts[0], ray_intersection->srcs[4]);
2636
2637 ir3_split_dest(b, dst, ray_intersection, 0, 5);
2638 }
2639
2640 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2641 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2642
2643 static void
2644 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2645 {
2646 const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
2647 struct ir3_instruction **dst;
2648 struct ir3_instruction *const *src;
2649 struct ir3_builder *b = &ctx->build;
2650 unsigned dest_components = nir_intrinsic_dest_components(intr);
2651 int idx;
2652 bool create_rpt = false;
2653
2654 if (info->has_dest) {
2655 dst = ir3_get_def(ctx, &intr->def, dest_components);
2656 } else {
2657 dst = NULL;
2658 }
2659
2660 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
2661 const unsigned primitive_param =
2662 const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4 * 4;
2663 const unsigned primitive_map =
2664 const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4 * 4;
2665
2666 switch (intr->intrinsic) {
2667 case nir_intrinsic_decl_reg:
2668 /* There's logically nothing to do, but this has a destination in NIR so
2669 * plug in something... It will get DCE'd.
2670 */
2671 dst[0] = create_immed(b, 0);
2672 break;
2673
2674 case nir_intrinsic_load_reg:
2675 case nir_intrinsic_load_reg_indirect: {
2676 struct ir3_array *arr = ir3_get_array(ctx, intr->src[0].ssa);
2677 struct ir3_instruction *addr = NULL;
2678
2679 if (intr->intrinsic == nir_intrinsic_load_reg_indirect) {
2680 addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[1])[0],
2681 dest_components);
2682 }
2683
2684 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[0].ssa);
2685 assert(dest_components == nir_intrinsic_num_components(decl));
2686
2687 for (unsigned i = 0; i < dest_components; i++) {
2688 unsigned n = nir_intrinsic_base(intr) * dest_components + i;
2689 compile_assert(ctx, n < arr->length);
2690 dst[i] = ir3_create_array_load(ctx, arr, n, addr);
2691 }
2692
2693 break;
2694 }
2695
2696 case nir_intrinsic_store_reg:
2697 case nir_intrinsic_store_reg_indirect: {
2698 struct ir3_array *arr = ir3_get_array(ctx, intr->src[1].ssa);
2699 unsigned num_components = nir_src_num_components(intr->src[0]);
2700 struct ir3_instruction *addr = NULL;
2701
2702 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[1].ssa);
2703 assert(num_components == nir_intrinsic_num_components(decl));
2704
2705 struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[0]);
2706
2707 if (intr->intrinsic == nir_intrinsic_store_reg_indirect) {
2708 addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[2])[0],
2709 num_components);
2710 }
2711
2712 u_foreach_bit(i, nir_intrinsic_write_mask(intr)) {
2713 assert(i < num_components);
2714
2715 unsigned n = nir_intrinsic_base(intr) * num_components + i;
2716 compile_assert(ctx, n < arr->length);
2717 if (value[i])
2718 ir3_create_array_store(ctx, arr, n, value[i], addr);
2719 }
2720
2721 break;
2722 }
2723
2724 case nir_intrinsic_load_const_ir3:
2725 idx = nir_intrinsic_base(intr);
2726 if (nir_src_is_const(intr->src[0])) {
2727 idx += nir_src_as_uint(intr->src[0]);
2728 for (int i = 0; i < dest_components; i++) {
2729 dst[i] = create_uniform_typed(
2730 b, idx + i,
2731 intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
2732 }
2733 create_rpt = true;
2734 } else {
2735 src = ctx->compiler->has_scalar_alu ?
2736 ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
2737 ir3_get_src(ctx, &intr->src[0]);
2738 for (int i = 0; i < dest_components; i++) {
2739 dst[i] = create_uniform_indirect(
2740 b, idx + i,
2741 intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
2742 ir3_get_addr0(ctx, src[0], 1));
2743 /* Since this may not be foldable into conversions into shared
2744 * registers, manually make it shared. Optimizations can undo this if
2745 * the user can't use shared regs.
2746 */
2747 if (ctx->compiler->has_scalar_alu && !intr->def.divergent)
2748 dst[i]->dsts[0]->flags |= IR3_REG_SHARED;
2749 }
2750
2751 ctx->has_relative_load_const_ir3 = true;
2752 }
2753 break;
2754
2755 case nir_intrinsic_load_vs_primitive_stride_ir3:
2756 dst[0] = create_uniform(b, primitive_param + 0);
2757 break;
2758 case nir_intrinsic_load_vs_vertex_stride_ir3:
2759 dst[0] = create_uniform(b, primitive_param + 1);
2760 break;
2761 case nir_intrinsic_load_hs_patch_stride_ir3:
2762 dst[0] = create_uniform(b, primitive_param + 2);
2763 break;
2764 case nir_intrinsic_load_patch_vertices_in:
2765 dst[0] = create_uniform(b, primitive_param + 3);
2766 break;
2767 case nir_intrinsic_load_tess_param_base_ir3:
2768 dst[0] = create_uniform(b, primitive_param + 4);
2769 dst[1] = create_uniform(b, primitive_param + 5);
2770 break;
2771 case nir_intrinsic_load_tess_factor_base_ir3:
2772 dst[0] = create_uniform(b, primitive_param + 6);
2773 dst[1] = create_uniform(b, primitive_param + 7);
2774 break;
2775
2776 case nir_intrinsic_load_primitive_location_ir3:
2777 idx = nir_intrinsic_driver_location(intr);
2778 dst[0] = create_uniform(b, primitive_map + idx);
2779 break;
2780
2781 case nir_intrinsic_load_gs_header_ir3:
2782 dst[0] = ctx->gs_header;
2783 break;
2784 case nir_intrinsic_load_tcs_header_ir3:
2785 dst[0] = ctx->tcs_header;
2786 break;
2787
2788 case nir_intrinsic_load_rel_patch_id_ir3:
2789 dst[0] = ctx->rel_patch_id;
2790 break;
2791
2792 case nir_intrinsic_load_primitive_id:
2793 if (!ctx->primitive_id) {
2794 ctx->primitive_id =
2795 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
2796 }
2797 dst[0] = ctx->primitive_id;
2798 break;
2799
2800 case nir_intrinsic_load_tess_coord_xy:
2801 if (!ctx->tess_coord) {
2802 ctx->tess_coord =
2803 create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
2804 }
2805 ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
2806 break;
2807
2808 case nir_intrinsic_store_global_ir3:
2809 ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
2810 break;
2811 case nir_intrinsic_load_global_ir3:
2812 ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
2813 break;
2814
2815 case nir_intrinsic_load_ubo:
2816 emit_intrinsic_load_ubo(ctx, intr, dst);
2817 break;
2818 case nir_intrinsic_load_ubo_vec4:
2819 emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
2820 break;
2821 case nir_intrinsic_copy_ubo_to_uniform_ir3:
2822 emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
2823 break;
2824 case nir_intrinsic_copy_global_to_uniform_ir3:
2825 emit_intrinsic_copy_global_to_uniform(ctx, intr);
2826 break;
2827 case nir_intrinsic_load_frag_coord:
2828 case nir_intrinsic_load_frag_coord_unscaled_ir3:
2829 ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
2830 break;
2831 case nir_intrinsic_load_sample_pos_from_id: {
2832 /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
2833 * but that doesn't seem necessary.
2834 */
2835 struct ir3_instruction *offset =
2836 ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
2837 offset->dsts[0]->wrmask = 0x3;
2838 offset->cat5.type = TYPE_F32;
2839
2840 ir3_split_dest(b, dst, offset, 0, 2);
2841
2842 break;
2843 }
2844 case nir_intrinsic_load_persp_center_rhw_ir3:
2845 if (!ctx->ij[IJ_PERSP_CENTER_RHW]) {
2846 ctx->ij[IJ_PERSP_CENTER_RHW] =
2847 create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1);
2848 }
2849 dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW];
2850 break;
2851 case nir_intrinsic_load_barycentric_centroid:
2852 case nir_intrinsic_load_barycentric_sample:
2853 case nir_intrinsic_load_barycentric_pixel:
2854 emit_intrinsic_barycentric(ctx, intr, dst);
2855 break;
2856 case nir_intrinsic_load_interpolated_input:
2857 case nir_intrinsic_load_input:
2858 setup_input(ctx, intr);
2859 break;
2860 case nir_intrinsic_load_kernel_input:
2861 emit_intrinsic_load_kernel_input(ctx, intr, dst);
2862 break;
2863 /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
2864 * pass and replaced by an ir3-specific version that adds the
2865 * dword-offset in the last source.
2866 */
2867 case nir_intrinsic_load_ssbo_ir3:
2868 emit_intrinsic_load_ssbo(ctx, intr, dst);
2869 break;
2870 case nir_intrinsic_load_uav_ir3:
2871 emit_intrinsic_load_uav(ctx, intr, dst);
2872 break;
2873 case nir_intrinsic_store_ssbo_ir3:
2874 ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
2875 break;
2876 case nir_intrinsic_get_ssbo_size:
2877 emit_intrinsic_ssbo_size(ctx, intr, dst);
2878 break;
2879 case nir_intrinsic_ssbo_atomic_ir3:
2880 case nir_intrinsic_ssbo_atomic_swap_ir3:
2881 dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
2882 break;
2883 case nir_intrinsic_load_shared:
2884 emit_intrinsic_load_shared(ctx, intr, dst);
2885 break;
2886 case nir_intrinsic_store_shared:
2887 emit_intrinsic_store_shared(ctx, intr);
2888 break;
2889 case nir_intrinsic_shared_atomic:
2890 case nir_intrinsic_shared_atomic_swap:
2891 dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
2892 break;
2893 case nir_intrinsic_load_scratch:
2894 emit_intrinsic_load_scratch(ctx, intr, dst);
2895 break;
2896 case nir_intrinsic_store_scratch:
2897 emit_intrinsic_store_scratch(ctx, intr);
2898 break;
2899 case nir_intrinsic_image_load:
2900 case nir_intrinsic_bindless_image_load:
2901 emit_intrinsic_load_image(ctx, intr, dst);
2902 break;
2903 case nir_intrinsic_image_store:
2904 case nir_intrinsic_bindless_image_store:
2905 ctx->funcs->emit_intrinsic_store_image(ctx, intr);
2906 break;
2907 case nir_intrinsic_image_size:
2908 case nir_intrinsic_bindless_image_size:
2909 ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
2910 break;
2911 case nir_intrinsic_image_atomic:
2912 case nir_intrinsic_bindless_image_atomic:
2913 case nir_intrinsic_image_atomic_swap:
2914 case nir_intrinsic_bindless_image_atomic_swap:
2915 dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
2916 break;
2917 case nir_intrinsic_barrier:
2918 emit_intrinsic_barrier(ctx, intr);
2919 /* note that the blk ptr is no longer valid, make that obvious: */
2920 b = NULL;
2921 break;
2922 case nir_intrinsic_store_output:
2923 case nir_intrinsic_store_per_view_output:
2924 setup_output(ctx, intr);
2925 break;
2926 case nir_intrinsic_load_base_vertex:
2927 case nir_intrinsic_load_first_vertex:
2928 if (!ctx->basevertex) {
2929 ctx->basevertex = create_driver_param(ctx, IR3_DP_VS(vtxid_base));
2930 }
2931 dst[0] = ctx->basevertex;
2932 break;
2933 case nir_intrinsic_load_is_indexed_draw:
2934 if (!ctx->is_indexed_draw) {
2935 ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_VS(is_indexed_draw));
2936 }
2937 dst[0] = ctx->is_indexed_draw;
2938 break;
2939 case nir_intrinsic_load_draw_id:
2940 if (!ctx->draw_id) {
2941 ctx->draw_id = create_driver_param(ctx, IR3_DP_VS(draw_id));
2942 }
2943 dst[0] = ctx->draw_id;
2944 break;
2945 case nir_intrinsic_load_base_instance:
2946 if (!ctx->base_instance) {
2947 ctx->base_instance = create_driver_param(ctx, IR3_DP_VS(instid_base));
2948 }
2949 dst[0] = ctx->base_instance;
2950 break;
2951 case nir_intrinsic_load_view_index:
2952 if (!ctx->view_index) {
2953 ctx->view_index =
2954 create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
2955 }
2956 dst[0] = ctx->view_index;
2957 break;
2958 case nir_intrinsic_load_vertex_id_zero_base:
2959 case nir_intrinsic_load_vertex_id:
2960 if (!ctx->vertex_id) {
2961 gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
2962 ? SYSTEM_VALUE_VERTEX_ID
2963 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
2964 ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
2965 }
2966 dst[0] = ctx->vertex_id;
2967 break;
2968 case nir_intrinsic_load_instance_id:
2969 if (!ctx->instance_id) {
2970 ctx->instance_id =
2971 create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
2972 }
2973 dst[0] = ctx->instance_id;
2974 break;
2975 case nir_intrinsic_load_sample_id:
2976 case nir_intrinsic_load_sample_id_no_per_sample:
2977 if (!ctx->samp_id) {
2978 ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
2979 ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
2980 }
2981 dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
2982 break;
2983 case nir_intrinsic_load_sample_mask_in:
2984 if (!ctx->samp_mask_in) {
2985 ctx->so->reads_smask = true;
2986 ctx->samp_mask_in =
2987 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
2988 }
2989 dst[0] = ctx->samp_mask_in;
2990 break;
2991 case nir_intrinsic_load_user_clip_plane:
2992 idx = nir_intrinsic_ucp_id(intr);
2993 for (int i = 0; i < dest_components; i++) {
2994 unsigned n = idx * 4 + i;
2995 dst[i] = create_driver_param(ctx, IR3_DP_VS(ucp[0].x) + n);
2996 }
2997 create_rpt = true;
2998 break;
2999 case nir_intrinsic_load_front_face:
3000 if (!ctx->frag_face) {
3001 ctx->so->frag_face = true;
3002 ctx->frag_face =
3003 create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
3004 ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
3005 }
3006 /* for fragface, we get -1 for back and 0 for front. However this is
3007 * the inverse of what nir expects (where ~0 is true).
3008 */
3009 dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
3010 create_immed_typed(b, 0, TYPE_U16), 0);
3011 dst[0]->cat2.condition = IR3_COND_EQ;
3012 break;
3013 case nir_intrinsic_load_local_invocation_id:
3014 if (!ctx->local_invocation_id) {
3015 ctx->local_invocation_id =
3016 create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
3017 }
3018 ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
3019 break;
3020 case nir_intrinsic_load_workgroup_id:
3021 if (ctx->compiler->has_shared_regfile) {
3022 if (!ctx->work_group_id) {
3023 ctx->work_group_id =
3024 create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
3025 ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
3026 }
3027 ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
3028 } else {
3029 /* For a3xx/a4xx, this comes in via const injection by the hw */
3030 for (int i = 0; i < dest_components; i++) {
3031 dst[i] = create_driver_param(ctx, IR3_DP_CS(workgroup_id_x) + i);
3032 }
3033 }
3034 break;
3035 case nir_intrinsic_load_frag_shading_rate: {
3036 if (!ctx->frag_shading_rate) {
3037 ctx->so->reads_shading_rate = true;
3038 ctx->frag_shading_rate =
3039 create_sysval_input(ctx, SYSTEM_VALUE_FRAG_SHADING_RATE, 0x1);
3040 }
3041 dst[0] = ctx->frag_shading_rate;
3042 break;
3043 }
3044 case nir_intrinsic_load_base_workgroup_id:
3045 for (int i = 0; i < dest_components; i++) {
3046 dst[i] = create_driver_param(ctx, IR3_DP_CS(base_group_x) + i);
3047 }
3048 create_rpt = true;
3049 break;
3050 case nir_intrinsic_load_num_workgroups:
3051 for (int i = 0; i < dest_components; i++) {
3052 dst[i] = create_driver_param(ctx, IR3_DP_CS(num_work_groups_x) + i);
3053 }
3054 create_rpt = true;
3055 break;
3056 case nir_intrinsic_load_workgroup_size:
3057 for (int i = 0; i < dest_components; i++) {
3058 dst[i] = create_driver_param(ctx, IR3_DP_CS(local_group_size_x) + i);
3059 }
3060 create_rpt = true;
3061 break;
3062 case nir_intrinsic_load_subgroup_size: {
3063 assert(ctx->so->type == MESA_SHADER_COMPUTE ||
3064 ctx->so->type == MESA_SHADER_FRAGMENT);
3065 unsigned size = ctx->so->type == MESA_SHADER_COMPUTE ?
3066 IR3_DP_CS(subgroup_size) : IR3_DP_FS(subgroup_size);
3067 dst[0] = create_driver_param(ctx, size);
3068 break;
3069 }
3070 case nir_intrinsic_load_subgroup_id_shift_ir3:
3071 dst[0] = create_driver_param(ctx, IR3_DP_CS(subgroup_id_shift));
3072 break;
3073 case nir_intrinsic_load_work_dim:
3074 dst[0] = create_driver_param(ctx, IR3_DP_CS(work_dim));
3075 break;
3076 case nir_intrinsic_load_subgroup_invocation:
3077 assert(ctx->compiler->has_getfiberid);
3078 dst[0] = ir3_GETFIBERID(b);
3079 dst[0]->cat6.type = TYPE_U32;
3080 __ssa_dst(dst[0]);
3081 break;
3082 case nir_intrinsic_load_tess_level_outer_default:
3083 for (int i = 0; i < dest_components; i++) {
3084 dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_outer_level_x) + i);
3085 }
3086 create_rpt = true;
3087 break;
3088 case nir_intrinsic_load_tess_level_inner_default:
3089 for (int i = 0; i < dest_components; i++) {
3090 dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_inner_level_x) + i);
3091 }
3092 create_rpt = true;
3093 break;
3094 case nir_intrinsic_load_frag_invocation_count:
3095 dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
3096 break;
3097 case nir_intrinsic_load_frag_size_ir3:
3098 case nir_intrinsic_load_frag_offset_ir3: {
3099 unsigned param =
3100 intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
3101 IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
3102 if (nir_src_is_const(intr->src[0])) {
3103 uint32_t view = nir_src_as_uint(intr->src[0]);
3104 for (int i = 0; i < dest_components; i++) {
3105 dst[i] = create_driver_param(ctx, param + 4 * view + i);
3106 }
3107 create_rpt = true;
3108 } else {
3109 struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
3110 for (int i = 0; i < dest_components; i++) {
3111 dst[i] = create_driver_param_indirect(ctx, param + i,
3112 ir3_get_addr0(ctx, view, 4));
3113 }
3114 ctx->so->constlen =
3115 MAX2(ctx->so->constlen,
3116 const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 +
3117 param / 4 + nir_intrinsic_range(intr));
3118 }
3119 break;
3120 }
3121 case nir_intrinsic_demote:
3122 case nir_intrinsic_demote_if:
3123 case nir_intrinsic_terminate:
3124 case nir_intrinsic_terminate_if: {
3125 struct ir3_instruction *cond, *kill;
3126
3127 if (intr->intrinsic == nir_intrinsic_demote_if ||
3128 intr->intrinsic == nir_intrinsic_terminate_if) {
3129 /* conditional discard: */
3130 src = ir3_get_src(ctx, &intr->src[0]);
3131 cond = src[0];
3132 } else {
3133 /* unconditional discard: */
3134 cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
3135 }
3136
3137 /* NOTE: only cmps.*.* can write p0.x: */
3138 struct ir3_instruction *zero =
3139 create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
3140 cond = ir3_CMPS_S(b, cond, 0, zero, 0);
3141 cond->cat2.condition = IR3_COND_NE;
3142
3143 /* condition always goes in predicate register: */
3144 cond->dsts[0]->flags |= IR3_REG_PREDICATE;
3145
3146 if (intr->intrinsic == nir_intrinsic_demote ||
3147 intr->intrinsic == nir_intrinsic_demote_if) {
3148 kill = ir3_DEMOTE(b, cond, 0);
3149 } else {
3150 kill = ir3_KILL(b, cond, 0);
3151 }
3152
3153 /* - Side-effects should not be moved on a different side of the kill
3154 * - Instructions that depend on active fibers should not be reordered
3155 */
3156 kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
3157 IR3_BARRIER_ACTIVE_FIBERS_W;
3158 kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
3159 IR3_BARRIER_ACTIVE_FIBERS_R;
3160 kill->srcs[0]->flags |= IR3_REG_PREDICATE;
3161
3162 array_insert(ctx->block, ctx->block->keeps, kill);
3163 ctx->so->has_kill = true;
3164
3165 break;
3166 }
3167
3168 case nir_intrinsic_vote_any:
3169 case nir_intrinsic_vote_all: {
3170 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3171 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3172 if (intr->intrinsic == nir_intrinsic_vote_any)
3173 dst[0] = ir3_ANY_MACRO(b, pred, 0);
3174 else
3175 dst[0] = ir3_ALL_MACRO(b, pred, 0);
3176 dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3177 break;
3178 }
3179 case nir_intrinsic_elect:
3180 dst[0] = ir3_ELECT_MACRO(b);
3181 dst[0]->flags |= IR3_INSTR_NEEDS_HELPERS;
3182 break;
3183 case nir_intrinsic_elect_any_ir3:
3184 dst[0] = ir3_ELECT_MACRO(b);
3185 break;
3186 case nir_intrinsic_preamble_start_ir3:
3187 dst[0] = ir3_SHPS_MACRO(b);
3188 break;
3189
3190 case nir_intrinsic_read_invocation_cond_ir3: {
3191 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3192 struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
3193 dst[0] = ir3_READ_COND_MACRO(b, ir3_get_predicate(ctx, cond), 0, src, 0);
3194 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3195 dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3196 /* Work around a bug with half-register shared -> non-shared moves by
3197 * adding an extra mov here so that the original destination stays full.
3198 */
3199 if (src->dsts[0]->flags & IR3_REG_HALF) {
3200 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3201 if (!ctx->compiler->has_scalar_alu)
3202 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3203 }
3204 break;
3205 }
3206
3207 case nir_intrinsic_read_first_invocation: {
3208 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3209 dst[0] = ir3_READ_FIRST_MACRO(b, src, 0);
3210 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3211 /* See above. */
3212 if (src->dsts[0]->flags & IR3_REG_HALF) {
3213 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3214 if (!ctx->compiler->has_scalar_alu)
3215 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3216 }
3217 break;
3218 }
3219
3220 case nir_intrinsic_read_getlast_ir3: {
3221 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3222 dst[0] = ir3_READ_GETLAST_MACRO(b, src, 0);
3223 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3224 /* See above. */
3225 if (src->dsts[0]->flags & IR3_REG_HALF) {
3226 dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3227 if (!ctx->compiler->has_scalar_alu)
3228 dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3229 }
3230 break;
3231 }
3232
3233 case nir_intrinsic_ballot: {
3234 struct ir3_instruction *ballot;
3235 unsigned components = intr->def.num_components;
3236 if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
3237 /* ballot(true) is just MOVMSK */
3238 ballot = ir3_MOVMSK(b, components);
3239 } else {
3240 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3241 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3242 ballot = ir3_BALLOT_MACRO(b, pred, components);
3243 ballot->srcs[0]->flags |= IR3_REG_PREDICATE;
3244 }
3245
3246 ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
3247 ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W;
3248
3249 ir3_split_dest(b, dst, ballot, 0, components);
3250 break;
3251 }
3252
3253 case nir_intrinsic_quad_broadcast: {
3254 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3255 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
3256
3257 type_t dst_type = type_uint_size(intr->def.bit_size);
3258
3259 if (dst_type != TYPE_U32)
3260 idx = ir3_COV(b, idx, TYPE_U32, dst_type);
3261
3262 dst[0] = ir3_QUAD_SHUFFLE_BRCST(b, src, 0, idx, 0);
3263 dst[0]->cat5.type = dst_type;
3264 break;
3265 }
3266
3267 case nir_intrinsic_quad_swap_horizontal: {
3268 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3269 dst[0] = ir3_QUAD_SHUFFLE_HORIZ(b, src, 0);
3270 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3271 break;
3272 }
3273
3274 case nir_intrinsic_quad_swap_vertical: {
3275 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3276 dst[0] = ir3_QUAD_SHUFFLE_VERT(b, src, 0);
3277 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3278 break;
3279 }
3280
3281 case nir_intrinsic_quad_swap_diagonal: {
3282 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3283 dst[0] = ir3_QUAD_SHUFFLE_DIAG(b, src, 0);
3284 dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3285 break;
3286 }
3287 case nir_intrinsic_ddx:
3288 case nir_intrinsic_ddx_coarse: {
3289 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3290 dst[0] = ir3_DSX(b, src, 0);
3291 dst[0]->cat5.type = TYPE_F32;
3292 break;
3293 }
3294 case nir_intrinsic_ddx_fine: {
3295 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3296 dst[0] = ir3_DSXPP_MACRO(b, src, 0);
3297 dst[0]->cat5.type = TYPE_F32;
3298 break;
3299 }
3300 case nir_intrinsic_ddy:
3301 case nir_intrinsic_ddy_coarse: {
3302 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3303 dst[0] = ir3_DSY(b, src, 0);
3304 dst[0]->cat5.type = TYPE_F32;
3305 break;
3306 }
3307 case nir_intrinsic_ddy_fine: {
3308 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3309 dst[0] = ir3_DSYPP_MACRO(b, src, 0);
3310 dst[0]->cat5.type = TYPE_F32;
3311 break;
3312 }
3313 case nir_intrinsic_load_shared_ir3:
3314 emit_intrinsic_load_shared_ir3(ctx, intr, dst);
3315 break;
3316 case nir_intrinsic_store_shared_ir3:
3317 emit_intrinsic_store_shared_ir3(ctx, intr);
3318 break;
3319 case nir_intrinsic_bindless_resource_ir3:
3320 dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
3321 break;
3322 case nir_intrinsic_global_atomic_ir3:
3323 case nir_intrinsic_global_atomic_swap_ir3: {
3324 dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
3325 break;
3326 }
3327
3328 case nir_intrinsic_reduce:
3329 case nir_intrinsic_inclusive_scan:
3330 case nir_intrinsic_exclusive_scan:
3331 dst[0] = emit_intrinsic_reduce(ctx, intr);
3332 break;
3333
3334 case nir_intrinsic_reduce_clusters_ir3:
3335 case nir_intrinsic_inclusive_scan_clusters_ir3:
3336 case nir_intrinsic_exclusive_scan_clusters_ir3:
3337 dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
3338 break;
3339
3340 case nir_intrinsic_brcst_active_ir3:
3341 dst[0] = emit_intrinsic_brcst_active(ctx, intr);
3342 break;
3343
3344 case nir_intrinsic_preamble_end_ir3: {
3345 struct ir3_instruction *instr = ir3_SHPE(b);
3346 instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
3347 array_insert(ctx->block, ctx->block->keeps, instr);
3348 break;
3349 }
3350 case nir_intrinsic_store_const_ir3: {
3351 unsigned components = nir_src_num_components(intr->src[0]);
3352 unsigned dst = nir_intrinsic_base(intr);
3353 unsigned dst_lo = dst & 0xff;
3354 unsigned dst_hi = dst >> 8;
3355
3356 struct ir3_instruction *src =
3357 ir3_create_collect(b, ir3_get_src_shared(ctx, &intr->src[0],
3358 ctx->compiler->has_scalar_alu),
3359 components);
3360 struct ir3_instruction *a1 = NULL;
3361 if (dst_hi) {
3362 /* Encode only the high part of the destination in a1.x to increase the
3363 * chance that we can reuse the a1.x value in subsequent stc
3364 * instructions.
3365 */
3366 a1 = ir3_get_addr1(ctx, dst_hi << 8);
3367 }
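/* A hedged illustration (hypothetical numbers): for base dst = 0x1a4 the split
 * above gives dst_lo = 0xa4 and dst_hi = 0x1, so a1.x is loaded with 0x100 and
 * the stc below encodes only the remaining 0xa4 as its immediate destination,
 * letting later stc instructions with the same upper byte reuse that a1.x value.
 */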
3368
3369 struct ir3_instruction *stc =
3370 ir3_STC(b, create_immed(b, dst_lo), 0, src, 0);
3371 stc->cat6.iim_val = components;
3372 stc->cat6.type = TYPE_U32;
3373 stc->barrier_conflict = IR3_BARRIER_CONST_W;
3374 if (a1) {
3375 ir3_instr_set_address(stc, a1);
3376 stc->flags |= IR3_INSTR_A1EN;
3377 }
3378 /* The assembler isn't aware of what value a1.x has, so make sure that
3379 * constlen includes the stc here.
3380 */
3381 ctx->so->constlen =
3382 MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + components, 4));
3383 array_insert(ctx->block, ctx->block->keeps, stc);
3384 break;
3385 }
3386 case nir_intrinsic_copy_push_const_to_uniform_ir3: {
3387 struct ir3_instruction *load =
3388 ir3_build_instr(b, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
3389 array_insert(ctx->block, ctx->block->keeps, load);
3390
3391 load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
3392 load->push_consts.src_base = nir_intrinsic_base(intr);
3393 load->push_consts.src_size = nir_intrinsic_range(intr);
3394
3395 ctx->so->constlen =
3396 MAX2(ctx->so->constlen,
3397 DIV_ROUND_UP(
3398 load->push_consts.dst_base + load->push_consts.src_size, 4));
3399 break;
3400 }
3401 case nir_intrinsic_prefetch_sam_ir3: {
3402 struct tex_src_info info =
3403 get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1]);
3404 struct ir3_instruction *sam =
3405 emit_sam(ctx, OPC_SAM, info, TYPE_F32, 0b1111, NULL, NULL);
3406
3407 sam->dsts_count = 0;
3408 array_insert(ctx->block, ctx->block->keeps, sam);
3409 break;
3410 }
3411 case nir_intrinsic_prefetch_tex_ir3: {
3412 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3413 struct ir3_instruction *resinfo = ir3_RESINFO(b, idx, 0);
3414 resinfo->cat6.iim_val = 1;
3415 resinfo->cat6.d = 1;
3416 resinfo->cat6.type = TYPE_U32;
3417 resinfo->cat6.typed = false;
3418
3419 ir3_handle_bindless_cat6(resinfo, intr->src[0]);
3420 if (resinfo->flags & IR3_INSTR_B)
3421 ctx->so->bindless_tex = true;
3422
3423 resinfo->dsts_count = 0;
3424 array_insert(ctx->block, ctx->block->keeps, resinfo);
3425 break;
3426 }
3427 case nir_intrinsic_prefetch_ubo_ir3: {
3428 struct ir3_instruction *offset = create_immed(b, 0);
3429 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3430 struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
3431 ldc->cat6.iim_val = 1;
3432 ldc->cat6.type = TYPE_U32;
3433
3434 ir3_handle_bindless_cat6(ldc, intr->src[0]);
3435 if (ldc->flags & IR3_INSTR_B)
3436 ctx->so->bindless_ubo = true;
3437
3438 ldc->dsts_count = 0;
3439 array_insert(ctx->block, ctx->block->keeps, ldc);
3440 break;
3441 }
3442 case nir_intrinsic_rotate:
3443 case nir_intrinsic_shuffle_up_uniform_ir3:
3444 case nir_intrinsic_shuffle_down_uniform_ir3:
3445 case nir_intrinsic_shuffle_xor_uniform_ir3:
3446 dst[0] = emit_shfl(ctx, intr);
3447 break;
3448 case nir_intrinsic_ray_intersection_ir3:
3449 emit_ray_intersection(ctx, intr, dst);
3450 break;
3451 default:
3452 ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
3453 nir_intrinsic_infos[intr->intrinsic].name);
3454 break;
3455 }
3456
3457 if (info->has_dest) {
3458 if (create_rpt)
3459 ir3_instr_create_rpt(dst, dest_components);
3460 ir3_put_def(ctx, &intr->def);
3461 }
3462 }
3463
3464 static void
3465 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
3466 {
3467 unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size);
3468 struct ir3_instruction **dst =
3469 ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components * ((bit_size == 64) ? 2 : 1));
3470
3471 if (bit_size <= 8) {
3472 for (int i = 0; i < instr->def.num_components; i++)
3473 dst[i] = create_immed_typed(&ctx->build, instr->value[i].u8, TYPE_U8);
3474 } else if (bit_size <= 16) {
3475 for (int i = 0; i < instr->def.num_components; i++)
3476 dst[i] =
3477 create_immed_typed(&ctx->build, instr->value[i].u16, TYPE_U16);
3478 } else if (bit_size <= 32) {
3479 for (int i = 0; i < instr->def.num_components; i++)
3480 dst[i] =
3481 create_immed_typed(&ctx->build, instr->value[i].u32, TYPE_U32);
3482 } else {
3483 assert(instr->def.num_components == 1);
3484 for (int i = 0; i < instr->def.num_components; i++) {
3485 dst[2 * i] = create_immed_typed(
3486 &ctx->build, (uint32_t)(instr->value[i].u64), TYPE_U32);
3487 dst[2 * i + 1] = create_immed_typed(
3488 &ctx->build, (uint32_t)(instr->value[i].u64 >> 32), TYPE_U32);
3489 }
3490 }
3491 }
3492
3493 static void
3494 emit_undef(struct ir3_context *ctx, nir_undef_instr *undef)
3495 {
3496 struct ir3_instruction **dst =
3497 ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
3498 type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size));
3499
3500 /* the backend doesn't want undefined instructions, so just plug
3501 * in 0.0:
3502 */
3503 for (int i = 0; i < undef->def.num_components; i++)
3504 dst[i] = create_immed_typed(&ctx->build, fui(0.0), type);
3505 }
3506
3507 /*
3508 * texture fetch/sample instructions:
3509 */
3510
3511 static type_t
3512 get_tex_dest_type(nir_tex_instr *tex)
3513 {
3514 type_t type;
3515
3516 switch (tex->dest_type) {
3517 case nir_type_float32:
3518 return TYPE_F32;
3519 case nir_type_float16:
3520 return TYPE_F16;
3521 case nir_type_int32:
3522 return TYPE_S32;
3523 case nir_type_int16:
3524 return TYPE_S16;
3525 case nir_type_bool32:
3526 case nir_type_uint32:
3527 return TYPE_U32;
3528 case nir_type_bool16:
3529 case nir_type_uint16:
3530 return TYPE_U16;
3531 case nir_type_invalid:
3532 default:
3533 unreachable("bad dest_type");
3534 }
3535
3536 return type;
3537 }
3538
3539 static void
3540 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
3541 {
3542 unsigned coords =
3543 glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
3544 unsigned flags = 0;
3545
3546 /* note: we would use tex->coord_components, except for txs. Also,
3547 * since the array index goes after the shadow ref, we don't want to
3548 * count it:
3549 */
3550 if (coords == 3)
3551 flags |= IR3_INSTR_3D;
3552
3553 if (tex->is_shadow && tex->op != nir_texop_lod)
3554 flags |= IR3_INSTR_S;
3555
3556 if (tex->is_array && tex->op != nir_texop_lod)
3557 flags |= IR3_INSTR_A;
3558
3559 *flagsp = flags;
3560 *coordsp = coords;
3561 }
3562
3563 /* Gets the sampler/texture idx as an hvec2, which could either be dynamic
3564 * or immediate (in which case it will get lowered later to a non-.s2en
3565 * version of the tex instruction which encodes tex/samp as immediates):
3566 */
3567 static struct tex_src_info
3568 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
3569 {
3570 struct ir3_builder *b = &ctx->build;
3571 struct tex_src_info info = {0};
3572 int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3573 int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
3574 struct ir3_instruction *texture, *sampler;
3575
3576 if (texture_idx >= 0 || sampler_idx >= 0) {
3577 /* Bindless case */
3578 info = get_bindless_samp_src(ctx,
3579 texture_idx >= 0 ? &tex->src[texture_idx].src : NULL,
3580 sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL);
3581
3582 if (tex->texture_non_uniform || tex->sampler_non_uniform)
3583 info.flags |= IR3_INSTR_NONUNIF;
3584 } else {
3585 info.flags |= IR3_INSTR_S2EN;
3586 texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
3587 sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
3588 if (texture_idx >= 0) {
3589 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
3590 texture = ir3_COV(b, texture, TYPE_U32, TYPE_U16);
3591 } else {
3592 /* TODO what to do for dynamic case? I guess we only need the
3593 * max index for astc srgb workaround so maybe not a problem
3594 * to worry about if we don't enable indirect samplers for
3595 * a4xx?
3596 */
3597 ctx->max_texture_index =
3598 MAX2(ctx->max_texture_index, tex->texture_index);
3599 texture = create_immed_typed(b, tex->texture_index, TYPE_U16);
3600 info.tex_idx = tex->texture_index;
3601 }
3602
3603 if (sampler_idx >= 0) {
3604 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
3605 sampler = ir3_COV(b, sampler, TYPE_U32, TYPE_U16);
3606 } else {
3607 sampler = create_immed_typed(b, tex->sampler_index, TYPE_U16);
3608 info.samp_idx = tex->texture_index;
3609 }
3610
3611 info.samp_tex = ir3_collect(b, texture, sampler);
3612 }
3613
3614 return info;
3615 }
3616
3617 static void
3618 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
3619 {
3620 struct ir3_builder *b = &ctx->build;
3621 struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
3622 struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
3623 struct ir3_instruction *lod, *compare, *proj, *sample_index;
3624 struct tex_src_info info = {0};
3625 bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
3626 unsigned i, coords, flags, ncomp;
3627 unsigned nsrc0 = 0, nsrc1 = 0;
3628 type_t type;
3629 opc_t opc = 0;
3630
3631 ncomp = tex->def.num_components;
3632
3633 coord = off = ddx = ddy = NULL;
3634 lod = proj = compare = sample_index = NULL;
3635
3636 dst = ir3_get_def(ctx, &tex->def, ncomp);
3637
3638 for (unsigned i = 0; i < tex->num_srcs; i++) {
3639 switch (tex->src[i].src_type) {
3640 case nir_tex_src_coord:
3641 coord = ir3_get_src(ctx, &tex->src[i].src);
3642 break;
3643 case nir_tex_src_bias:
3644 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3645 has_bias = true;
3646 break;
3647 case nir_tex_src_lod:
3648 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3649 has_lod = true;
3650 break;
3651 case nir_tex_src_comparator: /* shadow comparator */
3652 compare = ir3_get_src(ctx, &tex->src[i].src)[0];
3653 break;
3654 case nir_tex_src_projector:
3655 proj = ir3_get_src(ctx, &tex->src[i].src)[0];
3656 has_proj = true;
3657 break;
3658 case nir_tex_src_offset:
3659 off = ir3_get_src(ctx, &tex->src[i].src);
3660 has_off = true;
3661 break;
3662 case nir_tex_src_ddx:
3663 ddx = ir3_get_src(ctx, &tex->src[i].src);
3664 break;
3665 case nir_tex_src_ddy:
3666 ddy = ir3_get_src(ctx, &tex->src[i].src);
3667 break;
3668 case nir_tex_src_ms_index:
3669 sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
3670 break;
3671 case nir_tex_src_texture_offset:
3672 case nir_tex_src_sampler_offset:
3673 case nir_tex_src_texture_handle:
3674 case nir_tex_src_sampler_handle:
3675 /* handled in get_tex_samp_tex_src() */
3676 break;
3677 default:
3678 ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
3679 tex->src[i].src_type);
3680 return;
3681 }
3682 }
3683
3684 switch (tex->op) {
3685 case nir_texop_tex_prefetch:
3686 compile_assert(ctx, !has_bias);
3687 compile_assert(ctx, !has_lod);
3688 compile_assert(ctx, !compare);
3689 compile_assert(ctx, !has_proj);
3690 compile_assert(ctx, !has_off);
3691 compile_assert(ctx, !ddx);
3692 compile_assert(ctx, !ddy);
3693 compile_assert(ctx, !sample_index);
3694 compile_assert(
3695 ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
3696 compile_assert(
3697 ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
3698
3699 if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
3700 opc = OPC_META_TEX_PREFETCH;
3701 ctx->so->num_sampler_prefetch++;
3702 break;
3703 }
3704 FALLTHROUGH;
3705 case nir_texop_tex:
3706 opc = has_lod ? OPC_SAML : OPC_SAM;
3707 break;
3708 case nir_texop_txb:
3709 opc = OPC_SAMB;
3710 break;
3711 case nir_texop_txl:
3712 opc = OPC_SAML;
3713 break;
3714 case nir_texop_txd:
3715 opc = OPC_SAMGQ;
3716 break;
3717 case nir_texop_txf:
3718 opc = OPC_ISAML;
3719 break;
3720 case nir_texop_lod:
3721 opc = OPC_GETLOD;
3722 break;
3723 case nir_texop_tg4:
3724 switch (tex->component) {
3725 case 0:
3726 opc = OPC_GATHER4R;
3727 break;
3728 case 1:
3729 opc = OPC_GATHER4G;
3730 break;
3731 case 2:
3732 opc = OPC_GATHER4B;
3733 break;
3734 case 3:
3735 opc = OPC_GATHER4A;
3736 break;
3737 }
3738 break;
3739 case nir_texop_txf_ms_fb:
3740 case nir_texop_txf_ms:
3741 opc = OPC_ISAMM;
3742 break;
3743 default:
3744 ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
3745 return;
3746 }
3747
3748 tex_info(tex, &flags, &coords);
3749
3750 /*
3751 * lay out the first argument in the proper order:
3752 * - actual coordinates first
3753 * - shadow reference
3754 * - array index
3755 * - projection w
3756 * - starting at offset 4, dpdx.xy, dpdy.xy
3757 *
3758 * bias/lod go into the second arg
3759 */
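/* As a sketch of the resulting layout (not exhaustive): a 2D array shadow
 * sample would give src0 = { x, y, compare, array_index }, while a plain 2D
 * txd gives src0 = { x, y, 0, 0, dpdx.x, dpdx.y, dpdy.x, dpdy.y }, the
 * padding placing the gradients at offset 4 as described above.
 */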
3760
3761 /* insert tex coords: */
3762 for (i = 0; i < coords; i++)
3763 src0[i] = coord[i];
3764
3765 nsrc0 = i;
3766
3767 type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32;
3768 /* scale up integer coords for TXF based on the LOD */
3769 if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
3770 assert(has_lod);
3771 for (i = 0; i < coords; i++)
3772 src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
3773 }
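/* e.g. lod = 2 shifts each coordinate left by 2 (multiplies it by 4);
 * presumably this compensates for the hw minifying the coordinates itself
 * on parts that have unminify_coords set.
 */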
3774
3775 if (coords == 1) {
3776 /* hw doesn't do 1d, so we treat it as 2d with
3777 * height of 1, and patch up the y coord.
3778 */
3779 if (is_isam(opc)) {
3780 src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type);
3781 } else if (is_half(coord[0])) {
3782 src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type);
3783 } else {
3784 src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type);
3785 }
3786 }
3787
3788 if (tex->is_shadow && tex->op != nir_texop_lod)
3789 src0[nsrc0++] = compare;
3790
3791 if (tex->is_array && tex->op != nir_texop_lod)
3792 src0[nsrc0++] = coord[coords];
3793
3794 if (has_proj) {
3795 src0[nsrc0++] = proj;
3796 flags |= IR3_INSTR_P;
3797 }
3798
3799 /* pad to 4, then ddx/ddy: */
3800 if (tex->op == nir_texop_txd) {
3801 while (nsrc0 < 4)
3802 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3803 for (i = 0; i < coords; i++)
3804 src0[nsrc0++] = ddx[i];
3805 if (coords < 2)
3806 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3807 for (i = 0; i < coords; i++)
3808 src0[nsrc0++] = ddy[i];
3809 if (coords < 2)
3810 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3811 }
3812
3813 /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
3814 * with scaled x coord according to requested sample:
3815 */
3816 if (opc == OPC_ISAMM) {
3817 if (ctx->compiler->txf_ms_with_isaml) {
3818 /* the samples are laid out in x dimension as
3819 * 0 1 2 3
3820 * x_ms = (x << ms) + sample_index;
3821 */
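/* Worked example: with ms = 2 (four samples), texel x = 5 and
 * sample_index = 3 gives x_ms = (5 << 2) + 3 = 23.
 */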
3822 struct ir3_instruction *ms;
3823 ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
3824
3825 src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
3826 src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
3827
3828 opc = OPC_ISAML;
3829 } else {
3830 src0[nsrc0++] = sample_index;
3831 }
3832 }
3833
3834 /*
3835 * second argument (if applicable):
3836 * - offsets
3837 * - lod
3838 * - bias
3839 */
3840 if (has_off | has_lod | has_bias) {
3841 if (has_off) {
3842 unsigned off_coords = coords;
3843 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3844 off_coords--;
3845 for (i = 0; i < off_coords; i++)
3846 src1[nsrc1++] = off[i];
3847 if (off_coords < 2)
3848 src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3849 flags |= IR3_INSTR_O;
3850 }
3851
3852 if (has_lod | has_bias)
3853 src1[nsrc1++] = lod;
3854 }
3855
3856 type = get_tex_dest_type(tex);
3857
3858 if (opc == OPC_GETLOD)
3859 type = TYPE_S32;
3860
3861 if (tex->op == nir_texop_txf_ms_fb) {
3862 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
3863
3864 ctx->so->fb_read = true;
3865 if (ctx->compiler->options.bindless_fb_read_descriptor >= 0) {
3866 ctx->so->bindless_tex = true;
3867 info.flags = IR3_INSTR_B;
3868 info.base = ctx->compiler->options.bindless_fb_read_descriptor;
3869 struct ir3_instruction *texture, *sampler;
3870
3871 int base_index =
3872 nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3873 nir_src tex_src = tex->src[base_index].src;
3874
3875 if (nir_src_is_const(tex_src)) {
3876 texture = create_immed_typed(b,
3877 nir_src_as_uint(tex_src) + ctx->compiler->options.bindless_fb_read_slot,
3878 TYPE_U32);
3879 } else {
3880 texture = create_immed_typed(
3881 b, ctx->compiler->options.bindless_fb_read_slot, TYPE_U32);
3882 struct ir3_instruction *base =
3883 ir3_get_src(ctx, &tex->src[base_index].src)[0];
3884 texture = ir3_ADD_U(b, texture, 0, base, 0);
3885 }
3886 sampler = create_immed_typed(b, 0, TYPE_U32);
3887 info.samp_tex = ir3_collect(b, texture, sampler);
3888 info.flags |= IR3_INSTR_S2EN;
3889 if (tex->texture_non_uniform) {
3890 info.flags |= IR3_INSTR_NONUNIF;
3891 }
3892 } else {
3893 /* Otherwise append a sampler to be patched into the texture
3894 * state:
3895 */
3896 info.samp_tex =
3897 ir3_collect(b, create_immed_typed(b, ctx->so->num_samp, TYPE_U16),
3898 create_immed_typed(b, ctx->so->num_samp, TYPE_U16));
3899 info.flags = IR3_INSTR_S2EN;
3900 }
3901
3902 ctx->so->num_samp++;
3903 } else {
3904 info = get_tex_samp_tex_src(ctx, tex);
3905 }
3906
3907 bool tg4_swizzle_fixup = false;
3908 if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 &&
3909 ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) {
3910 uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index];
3911 uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7;
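/* Note: 0x688 above is the identity swizzle x,y,z,w = 0,1,2,3 packed into
 * 3-bit fields (0b011010001000). Values above 3 select a constant
 * (presumably 4 -> 0 and 5 -> 1), which is why the (swizzle - 4) immediate
 * below can be returned without doing the gather at all.
 */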
3912 if (swizzle > 3) {
3913 /* this would mean that we can just return 0 / 1, no texturing
3914 * necessary
3915 */
3916 struct ir3_instruction *imm = create_immed(b,
3917 type_float(type) ? fui(swizzle - 4) : (swizzle - 4));
3918 for (int i = 0; i < 4; i++)
3919 dst[i] = imm;
3920 ir3_put_def(ctx, &tex->def);
3921 return;
3922 }
3923 opc = OPC_GATHER4R + swizzle;
3924 tg4_swizzle_fixup = true;
3925 }
3926
3927 struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
3928 struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
3929
3930 if (opc == OPC_META_TEX_PREFETCH) {
3931 int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3932
3933 struct ir3_builder build =
3934 ir3_builder_at(ir3_before_terminator(ctx->in_block));
3935 sam = ir3_SAM(&build, opc, type, MASK(ncomp), 0, NULL,
3936 get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
3937 sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
3938 /* make sure not to add irrelevant flags like S2EN */
3939 sam->flags = flags | (info.flags & IR3_INSTR_B);
3940 sam->prefetch.tex = info.tex_idx;
3941 sam->prefetch.samp = info.samp_idx;
3942 sam->prefetch.tex_base = info.tex_base;
3943 sam->prefetch.samp_base = info.samp_base;
3944 } else {
3945 info.flags |= flags;
3946 sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
3947 }
3948
3949 if (tg4_swizzle_fixup) {
3950 /* TODO: fix-up for ASTC when alpha is selected? */
3951 array_insert(ctx->ir, ctx->ir->tg4, sam);
3952
3953 ir3_split_dest(b, dst, sam, 0, 4);
3954
3955 uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12;
3956 if (!type_float(type) && tex_bits != 3 /* 32bpp */ &&
3957 tex_bits != 0 /* key unset */) {
3958 uint8_t bits = 0;
3959 switch (tex_bits) {
3960 case 1: /* 8bpp */
3961 bits = 8;
3962 break;
3963 case 2: /* 16bpp */
3964 bits = 16;
3965 break;
3966 case 4: /* 10bpp or 2bpp for alpha */
3967 if (opc == OPC_GATHER4A)
3968 bits = 2;
3969 else
3970 bits = 10;
3971 break;
3972 default:
3973 assert(0);
3974 }
3975
3976 sam->cat5.type = TYPE_F32;
3977 for (int i = 0; i < 4; i++) {
3978 /* scale and offset the unorm data */
3979 dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0);
3980 /* convert the scaled value to integer */
3981 dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32);
3982 /* sign extend for signed values */
3983 if (type == TYPE_S32) {
3984 dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3985 dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3986 }
3987 }
3988 }
3989 } else if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
3990 tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */
3991 !nir_tex_instr_is_query(tex)) {
3992 assert(opc != OPC_META_TEX_PREFETCH);
3993
3994 /* only need first 3 components: */
3995 sam->dsts[0]->wrmask = 0x7;
3996 ir3_split_dest(b, dst, sam, 0, 3);
3997
3998 /* we need to sample the alpha separately with a non-SRGB
3999 * texture state:
4000 */
4001 sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
4002 col0, col1);
4003
4004 array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
4005
4006 /* fixup .w component: */
4007 ir3_split_dest(b, &dst[3], sam, 3, 1);
4008 } else {
4009 /* normal (non-workaround) case: */
4010 ir3_split_dest(b, dst, sam, 0, ncomp);
4011 }
4012
4013 /* GETLOD returns results in 4.8 fixed point */
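/* e.g. a raw getlod value of 0x180 (384) represents 384 / 256 = 1.5,
 * hence the conversion to float and multiply by 1/256 below.
 */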
4014 if (opc == OPC_GETLOD) {
4015 bool half = tex->def.bit_size == 16;
4016 struct ir3_instruction *factor =
4017 half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16)
4018 : create_immed(b, fui(1.0 / 256));
4019
4020 for (i = 0; i < 2; i++) {
4021 dst[i] = ir3_MUL_F(
4022 b, ir3_COV(b, dst[i], TYPE_S32, half ? TYPE_F16 : TYPE_F32), 0,
4023 factor, 0);
4024 }
4025 }
4026
4027 ir3_put_def(ctx, &tex->def);
4028 }
4029
4030 static void
4031 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
4032 {
4033 struct ir3_builder *b = &ctx->build;
4034 struct ir3_instruction **dst, *sam;
4035 type_t dst_type = get_tex_dest_type(tex);
4036 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
4037
4038 dst = ir3_get_def(ctx, &tex->def, 1);
4039
4040 sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
4041
4042 /* even though there is only one component, since it ends
4043 * up in .y/.z/.w rather than .x, we need a split_dest()
4044 */
4045 ir3_split_dest(b, dst, sam, idx, 1);
4046
4047 /* The # of levels comes from getinfo.z. We need to add 1 to it, since
4048 * the value in TEX_CONST_0 is zero-based.
4049 */
4050 if (ctx->compiler->levels_add_one)
4051 dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
4052
4053 ir3_put_def(ctx, &tex->def);
4054 }
4055
4056 static void
4057 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
4058 {
4059 struct ir3_builder *b = &ctx->build;
4060 struct ir3_instruction **dst, *sam;
4061 struct ir3_instruction *lod;
4062 unsigned flags, coords;
4063 type_t dst_type = get_tex_dest_type(tex);
4064 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
4065
4066 tex_info(tex, &flags, &coords);
4067 info.flags |= flags;
4068
4069 /* Actually we want the number of dimensions, not coordinates. This
4070 * distinction only matters for cubes.
4071 */
4072 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
4073 coords = 2;
4074
4075 dst = ir3_get_def(ctx, &tex->def, 4);
4076
4077 int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
4078 compile_assert(ctx, lod_idx >= 0);
4079
4080 lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
4081
4082 if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
4083 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
4084 } else {
4085 /*
4086 * The maximum value which OPC_GETSIZE could return for one dimension
4087 * is 0x007ff0, however a sampler buffer could be much bigger.
4088 * The blob uses OPC_GETBUF for them.
4089 */
4090 sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
4091 }
4092
4093 ir3_split_dest(b, dst, sam, 0, 4);
4094
4095 /* Array size actually ends up in .w rather than .z. This doesn't
4096 * matter for miplevel 0, but for higher mips the value in z is
4097 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
4098 * returned, which means that we have to add 1 to it for arrays.
4099 */
4100 if (tex->is_array) {
4101 if (ctx->compiler->levels_add_one) {
4102 dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
4103 } else {
4104 dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
4105 }
4106 }
4107
4108 ir3_put_def(ctx, &tex->def);
4109 }
4110
4111 /* phi instructions are left partially constructed. We don't resolve
4112 * their srcs until the end of the shader, since (e.g. in loops) one of
4113 * the phi's srcs might be defined after the phi due to back edges in
4114 * the CFG.
4115 */
4116 static void
4117 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
4118 {
4119 struct ir3_instruction *phi, **dst;
4120
4121 unsigned num_components = nphi->def.num_components;
4122 dst = ir3_get_def(ctx, &nphi->def, num_components);
4123
4124 if (exec_list_is_singular(&nphi->srcs)) {
4125 nir_phi_src *src = list_entry(exec_list_get_head(&nphi->srcs),
4126 nir_phi_src, node);
4127 if (nphi->def.divergent == src->src.ssa->divergent) {
4128 struct ir3_instruction *const *srcs =
4129 ir3_get_src_maybe_shared(ctx, &src->src);
4130 memcpy(dst, srcs, num_components * sizeof(struct ir3_instruction *));
4131 ir3_put_def(ctx, &nphi->def);
4132 return;
4133 }
4134 }
4135
4136 for (unsigned i = 0; i < num_components; i++) {
4137 phi = ir3_build_instr(&ctx->build, OPC_META_PHI, 1,
4138 exec_list_length(&nphi->srcs));
4139 __ssa_dst(phi);
4140 phi->phi.nphi = nphi;
4141 phi->phi.comp = i;
4142
4143 if (ctx->compiler->has_scalar_alu && !nphi->def.divergent)
4144 phi->dsts[0]->flags |= IR3_REG_SHARED;
4145
4146 dst[i] = phi;
4147 }
4148
4149 ir3_put_def(ctx, &nphi->def);
4150 }
4151
4152 static struct ir3_block *get_block(struct ir3_context *ctx,
4153 const nir_block *nblock);
4154
4155 static struct ir3_instruction *
4156 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
4157 struct ir3_instruction *phi, nir_phi_instr *nphi)
4158 {
4159 if (!blk->nblock) {
4160 struct ir3_builder build = ir3_builder_at(ir3_before_terminator(blk));
4161 struct ir3_instruction *continue_phi =
4162 ir3_build_instr(&build, OPC_META_PHI, 1, blk->predecessors_count);
4163 __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
4164
4165 for (unsigned i = 0; i < blk->predecessors_count; i++) {
4166 struct ir3_instruction *src =
4167 read_phi_src(ctx, blk->predecessors[i], phi, nphi);
4168 if (src)
4169 __ssa_src(continue_phi, src, 0);
4170 else
4171 ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
4172 }
4173
4174 return continue_phi;
4175 }
4176
4177 nir_foreach_phi_src (nsrc, nphi) {
4178 if (blk->nblock == nsrc->pred) {
4179 if (nsrc->src.ssa->parent_instr->type == nir_instr_type_undef) {
4180 /* Create an ir3 undef */
4181 return NULL;
4182 } else {
4183 /* We need to insert the move at the end of the block */
4184 struct ir3_block *old_block = ctx->block;
4185 ir3_context_set_block(ctx, blk);
4186 struct ir3_instruction *src = ir3_get_src_shared(
4187 ctx, &nsrc->src,
4188 phi->dsts[0]->flags & IR3_REG_SHARED)[phi->phi.comp];
4189 ir3_context_set_block(ctx, old_block);
4190 return src;
4191 }
4192 }
4193 }
4194
4195 unreachable("couldn't find phi node ir3 block");
4196 return NULL;
4197 }
4198
4199 static void
4200 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
4201 {
4202 foreach_instr (phi, &block->instr_list) {
4203 if (phi->opc != OPC_META_PHI)
4204 break;
4205
4206 nir_phi_instr *nphi = phi->phi.nphi;
4207
4208 if (!nphi) /* skip continue phis created above */
4209 continue;
4210
4211 for (unsigned i = 0; i < block->predecessors_count; i++) {
4212 struct ir3_block *pred = block->predecessors[i];
4213 struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
4214 if (src) {
4215 __ssa_src(phi, src, 0);
4216 } else {
4217 /* Create an ir3 undef */
4218 ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
4219 }
4220 }
4221 }
4222 }
4223
4224 static void
4225 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
4226 {
4227 switch (jump->type) {
4228 case nir_jump_break:
4229 case nir_jump_continue:
4230 case nir_jump_return:
4231 /* I *think* we can simply just ignore this, and use the
4232 * successor block link to figure out where we need to
4233 * jump to for break/continue
4234 */
4235 break;
4236 default:
4237 ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
4238 break;
4239 }
4240 }
4241
4242 static void
4243 emit_instr(struct ir3_context *ctx, nir_instr *instr)
4244 {
4245 switch (instr->type) {
4246 case nir_instr_type_alu:
4247 emit_alu(ctx, nir_instr_as_alu(instr));
4248 break;
4249 case nir_instr_type_deref:
4250 /* ignored, handled as part of the intrinsic they are src to */
4251 break;
4252 case nir_instr_type_intrinsic:
4253 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
4254 break;
4255 case nir_instr_type_load_const:
4256 emit_load_const(ctx, nir_instr_as_load_const(instr));
4257 break;
4258 case nir_instr_type_undef:
4259 emit_undef(ctx, nir_instr_as_undef(instr));
4260 break;
4261 case nir_instr_type_tex: {
4262 nir_tex_instr *tex = nir_instr_as_tex(instr);
4263 /* a couple of tex instructions get special-cased:
4264 */
4265 switch (tex->op) {
4266 case nir_texop_txs:
4267 emit_tex_txs(ctx, tex);
4268 break;
4269 case nir_texop_query_levels:
4270 emit_tex_info(ctx, tex, 2);
4271 break;
4272 case nir_texop_texture_samples:
4273 emit_tex_info(ctx, tex, 3);
4274 break;
4275 default:
4276 emit_tex(ctx, tex);
4277 break;
4278 }
4279 break;
4280 }
4281 case nir_instr_type_jump:
4282 emit_jump(ctx, nir_instr_as_jump(instr));
4283 break;
4284 case nir_instr_type_phi:
4285 emit_phi(ctx, nir_instr_as_phi(instr));
4286 break;
4287 case nir_instr_type_call:
4288 case nir_instr_type_parallel_copy:
4289 case nir_instr_type_debug_info:
4290 ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
4291 instr->type);
4292 break;
4293 }
4294 }
4295
4296 static struct ir3_block *
4297 get_block(struct ir3_context *ctx, const nir_block *nblock)
4298 {
4299 struct ir3_block *block;
4300 struct hash_entry *hentry;
4301
4302 hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
4303 if (hentry)
4304 return hentry->data;
4305
4306 block = ir3_block_create(ctx->ir);
4307 block->nblock = nblock;
4308 _mesa_hash_table_insert(ctx->block_ht, nblock, block);
4309
4310 return block;
4311 }
4312
4313 static struct ir3_block *
4314 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
4315 {
4316 struct hash_entry *hentry;
4317
4318 hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
4319 if (hentry)
4320 return hentry->data;
4321
4322 return get_block(ctx, nblock);
4323 }
4324
4325 static struct ir3_block *
4326 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
4327 {
4328 struct ir3_block *block = ir3_block_create(ctx->ir);
4329 block->nblock = NULL;
4330 _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
4331 return block;
4332 }
4333
4334 static void
4335 emit_block(struct ir3_context *ctx, nir_block *nblock)
4336 {
4337 ir3_context_set_block(ctx, get_block(ctx, nblock));
4338
4339 list_addtail(&ctx->block->node, &ctx->ir->block_list);
4340
4341 ctx->block->loop_depth = ctx->loop_depth;
4342
4343 /* re-emit addr register in each block if needed: */
4344 for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
4345 _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
4346 ctx->addr0_ht[i] = NULL;
4347 }
4348
4349 _mesa_hash_table_u64_destroy(ctx->addr1_ht);
4350 ctx->addr1_ht = NULL;
4351
4352 nir_foreach_instr (instr, nblock) {
4353 ctx->cur_instr = instr;
4354 emit_instr(ctx, instr);
4355 ctx->cur_instr = NULL;
4356 if (ctx->error)
4357 return;
4358 }
4359
4360 for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
4361 if (nblock->successors[i]) {
4362 ctx->block->successors[i] =
4363 get_block_or_continue(ctx, nblock->successors[i]);
4364 }
4365 }
4366
4367 /* Emit unconditional branch if we only have one successor. Conditional
4368 * branches are emitted in emit_if.
4369 */
4370 if (ctx->block->successors[0] && !ctx->block->successors[1]) {
4371 if (!ir3_block_get_terminator(ctx->block))
4372 ir3_JUMP(&ctx->build);
4373 }
4374
4375 _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
4376 }
4377
4378 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
4379
4380 /* Get the ir3 branch condition for a given nir source. This will strip any inot
4381 * instructions and set *inv when the condition should be inverted. This
4382 * inversion can be directly folded into branches (in the inv1/inv2 fields)
4383 * instead of adding an explicit not.b/sub.u instruction.
4384 */
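/* For example, for "if (!(a < b))" the inot is stripped: the predicate for
 * (a < b) is returned with *inv set to true, and the caller folds that
 * inversion into the branch's inv1/inv2 bits rather than negating the value.
 */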
4385 static struct ir3_instruction *
4386 get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
4387 bool *inv)
4388 {
4389 struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
4390
4391 if (src->ssa->parent_instr->type == nir_instr_type_alu) {
4392 nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
4393
4394 if (nir_cond->op == nir_op_inot) {
4395 struct ir3_instruction *inv_cond = get_branch_condition(
4396 ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
4397 *inv = !*inv;
4398 return inv_cond;
4399 }
4400 }
4401
4402 *inv = false;
4403 return ir3_get_predicate(ctx, condition);
4404 }
4405
4406 /* Try to fold br (and/or cond1, cond2) into braa/brao cond1, cond2.
4407 */
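/* e.g. "if (a && b)" can become a single braa with two predicate sources
 * (any inot on a or b folded into inv1/inv2) instead of an and.b followed
 * by br, provided the and result has no other users (checked below).
 */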
4408 static struct ir3_instruction *
4409 fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
4410 {
4411 if (!ctx->compiler->has_branch_and_or)
4412 return NULL;
4413
4414 if (nir_cond->ssa->parent_instr->type != nir_instr_type_alu)
4415 return NULL;
4416
4417 nir_alu_instr *alu_cond = nir_instr_as_alu(nir_cond->ssa->parent_instr);
4418
4419 if ((alu_cond->op != nir_op_iand) && (alu_cond->op != nir_op_ior))
4420 return NULL;
4421
4422 /* If the result of the and/or is also used for something other than an if
4423 * condition, the and/or cannot be removed. In that case, we would end up with
4424 * extra predicate conversions for the conditions without actually removing
4425 * any instructions, resulting in a net increase of instructions. Let's not
4426 * fold the conditions into the branch in that case.
4427 */
4428 if (!nir_def_only_used_by_if(&alu_cond->def))
4429 return NULL;
4430
4431 bool inv1, inv2;
4432 struct ir3_instruction *cond1 = get_branch_condition(
4433 ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
4434 struct ir3_instruction *cond2 = get_branch_condition(
4435 ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
4436
4437 struct ir3_instruction *branch;
4438 if (alu_cond->op == nir_op_iand) {
4439 branch = ir3_BRAA(&ctx->build, cond1, IR3_REG_PREDICATE, cond2,
4440 IR3_REG_PREDICATE);
4441 } else {
4442 branch = ir3_BRAO(&ctx->build, cond1, IR3_REG_PREDICATE, cond2,
4443 IR3_REG_PREDICATE);
4444 }
4445
4446 branch->cat0.inv1 = inv1;
4447 branch->cat0.inv2 = inv2;
4448 return branch;
4449 }
4450
4451 static bool
4452 instr_can_be_predicated(nir_instr *instr)
4453 {
4454 /* Anything that doesn't expand to control-flow can be predicated. */
4455 switch (instr->type) {
4456 case nir_instr_type_alu:
4457 case nir_instr_type_deref:
4458 case nir_instr_type_tex:
4459 case nir_instr_type_load_const:
4460 case nir_instr_type_undef:
4461 case nir_instr_type_phi:
4462 case nir_instr_type_parallel_copy:
4463 return true;
4464 case nir_instr_type_call:
4465 case nir_instr_type_jump:
4466 case nir_instr_type_debug_info:
4467 return false;
4468 case nir_instr_type_intrinsic: {
4469 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4470 switch (intrin->intrinsic) {
4471 case nir_intrinsic_reduce:
4472 case nir_intrinsic_inclusive_scan:
4473 case nir_intrinsic_exclusive_scan:
4474 case nir_intrinsic_reduce_clusters_ir3:
4475 case nir_intrinsic_inclusive_scan_clusters_ir3:
4476 case nir_intrinsic_exclusive_scan_clusters_ir3:
4477 case nir_intrinsic_brcst_active_ir3:
4478 case nir_intrinsic_ballot:
4479 case nir_intrinsic_elect:
4480 case nir_intrinsic_elect_any_ir3:
4481 case nir_intrinsic_read_invocation_cond_ir3:
4482 case nir_intrinsic_demote:
4483 case nir_intrinsic_demote_if:
4484 case nir_intrinsic_terminate:
4485 case nir_intrinsic_terminate_if:
4486 return false;
4487 default:
4488 return true;
4489 }
4490 }
4491 }
4492
4493 unreachable("Checked all cases");
4494 }
4495
4496 static bool
4497 nif_can_be_predicated(nir_if *nif)
4498 {
4499 /* For non-divergent branches, predication is more expensive than a branch
4500 * because the latter can potentially skip all instructions.
4501 */
4502 if (!nir_src_is_divergent(&nif->condition))
4503 return false;
4504
4505 /* Although it could potentially be possible to allow a limited form of
4506 * nested predication (e.g., by resetting the predication mask after a nested
4507 * branch), let's avoid this for now and only use predication for leaf
4508 * branches. That is, for ifs that contain exactly one block in both branches
4509 * (note that they always contain at least one block).
4510 */
4511 if (!exec_list_is_singular(&nif->then_list) ||
4512 !exec_list_is_singular(&nif->else_list)) {
4513 return false;
4514 }
4515
4516 nir_foreach_instr (instr, nir_if_first_then_block(nif)) {
4517 if (!instr_can_be_predicated(instr))
4518 return false;
4519 }
4520
4521 nir_foreach_instr (instr, nir_if_first_else_block(nif)) {
4522 if (!instr_can_be_predicated(instr))
4523 return false;
4524 }
4525
4526 return true;
4527 }
4528
4529 /* A typical if-else block like this:
4530 * if (cond) {
4531 * tblock;
4532 * } else {
4533 * fblock;
4534 * }
4535 * Will be emitted as:
4536 * |-- i --|
4537 * | ... |
4538 * | predt |
4539 * |-------|
4540 * succ0 / \ succ1
4541 * |-- i+1 --| |-- i+2 --|
4542 * | tblock | | fblock |
4543 * | predf | | jump |
4544 * |---------| |---------|
4545 * succ0 \ / succ0
4546 * |-- j --|
4547 * | ... |
4548 * |-------|
4549 * Where the numbers at the top of blocks are their indices. That is, the true
4550 * block and false block are laid out contiguously after the current block. This
4551 * layout is verified during legalization in prede_sched which also inserts the
4552 * final prede instruction. Note that we don't insert prede right away to allow
4553 * opt_jump to optimize the jump in the false block.
4554 */
4555 static struct ir3_instruction *
4556 emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
4557 {
4558 if (!ctx->compiler->has_predication)
4559 return NULL;
4560 if (!nif_can_be_predicated(nif))
4561 return NULL;
4562
4563 struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif));
4564 struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif));
4565 assert(list_is_empty(&then_block->instr_list) &&
4566 list_is_empty(&else_block->instr_list));
4567
4568 bool inv;
4569 struct ir3_instruction *condition =
4570 get_branch_condition(ctx, &nif->condition, 0, &inv);
4571 struct ir3_builder then_build = ir3_builder_at(ir3_after_block(then_block));
4572 struct ir3_instruction *pred, *pred_inv;
4573
4574 if (!inv) {
4575 pred = ir3_PREDT(&ctx->build, condition, IR3_REG_PREDICATE);
4576 pred_inv = ir3_PREDF(&then_build, condition, IR3_REG_PREDICATE);
4577 } else {
4578 pred = ir3_PREDF(&ctx->build, condition, IR3_REG_PREDICATE);
4579 pred_inv = ir3_PREDT(&then_build, condition, IR3_REG_PREDICATE);
4580 }
4581
4582 pred->srcs[0]->num = REG_P0_X;
4583 pred_inv->srcs[0]->num = REG_P0_X;
4584 return pred;
4585 }
4586
4587 static struct ir3_instruction *
4588 emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
4589 {
4590 nir_src *nir_cond = &nif->condition;
4591 struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
4592 if (folded)
4593 return folded;
4594
4595 struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif);
4596 if (predicated)
4597 return predicated;
4598
4599 bool inv1;
4600 struct ir3_instruction *cond1 =
4601 get_branch_condition(ctx, nir_cond, 0, &inv1);
4602 struct ir3_instruction *branch =
4603 ir3_BR(&ctx->build, cond1, IR3_REG_PREDICATE);
4604 branch->cat0.inv1 = inv1;
4605 return branch;
4606 }
4607
4608 static void
4609 emit_if(struct ir3_context *ctx, nir_if *nif)
4610 {
4611 struct ir3_instruction *condition = ir3_get_src_maybe_shared(ctx, &nif->condition)[0];
4612
4613 if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
4614 struct ir3_instruction *pred = ssa(condition->srcs[0]);
4615 ir3_BANY(&ctx->build, pred, IR3_REG_PREDICATE);
4616 } else if (condition->opc == OPC_ALL_MACRO &&
4617 condition->block == ctx->block) {
4618 struct ir3_instruction *pred = ssa(condition->srcs[0]);
4619 ir3_BALL(&ctx->build, pred, IR3_REG_PREDICATE);
4620 } else if (condition->opc == OPC_ELECT_MACRO &&
4621 condition->block == ctx->block) {
4622 struct ir3_instruction *branch = ir3_GETONE(&ctx->build);
4623 branch->flags |= condition->flags & IR3_INSTR_NEEDS_HELPERS;
4624 } else if (condition->opc == OPC_SHPS_MACRO &&
4625 condition->block == ctx->block) {
4626 /* TODO: technically this only works if the block is the only user of the
4627 * shps, but we only use it in very constrained scenarios so this should
4628 * be ok.
4629 */
4630 ir3_SHPS(&ctx->build);
4631 } else {
4632 emit_conditional_branch(ctx, nif);
4633 }
4634
4635 ctx->block->divergent_condition = nir_src_is_divergent(&nif->condition);
4636
4637 emit_cf_list(ctx, &nif->then_list);
4638 emit_cf_list(ctx, &nif->else_list);
4639 }
4640
4641 static bool
4642 has_nontrivial_continue(nir_loop *nloop)
4643 {
4644 struct nir_block *nstart = nir_loop_first_block(nloop);
4645
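/* For instance, a body like "if (c) continue; foo();" produces a backedge
 * from inside the if rather than from the loop's last block, so control
 * flow has to reconverge at a dedicated continue block before looping back.
 */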
4646 /* There's always one incoming edge from outside the loop, and if there
4647 * is more than one backedge from inside the loop (so more than 2 total
4648 * edges) then one must be a nontrivial continue.
4649 */
4650 if (nstart->predecessors->entries > 2)
4651 return true;
4652
4653 /* Check whether the one backedge is a nontrivial continue. This can happen
4654 * if the loop ends with a break.
4655 */
4656 set_foreach (nstart->predecessors, entry) {
4657 nir_block *pred = (nir_block*)entry->key;
4658 if (pred == nir_loop_last_block(nloop) ||
4659 pred == nir_cf_node_as_block(nir_cf_node_prev(&nloop->cf_node)))
4660 continue;
4661 return true;
4662 }
4663
4664 return false;
4665 }
4666
4667 static void
4668 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
4669 {
4670 assert(!nir_loop_has_continue_construct(nloop));
4671 ctx->loop_depth++;
4672
4673 struct nir_block *nstart = nir_loop_first_block(nloop);
4674 struct ir3_block *continue_blk = NULL;
4675
4676 /* If the loop has a continue statement that isn't at the end, then we need to
4677 * create a continue block in order to let control flow reconverge before
4678 * entering the next iteration of the loop.
4679 */
4680 if (has_nontrivial_continue(nloop)) {
4681 continue_blk = create_continue_block(ctx, nstart);
4682 }
4683
4684 emit_cf_list(ctx, &nloop->body);
4685
4686 if (continue_blk) {
4687 struct ir3_block *start = get_block(ctx, nstart);
4688 struct ir3_builder build = ir3_builder_at(ir3_after_block(continue_blk));
4689 ir3_JUMP(&build);
4690 continue_blk->successors[0] = start;
4691 continue_blk->loop_depth = ctx->loop_depth;
4692 list_addtail(&continue_blk->node, &ctx->ir->block_list);
4693 }
4694
4695 ctx->so->loops++;
4696 ctx->loop_depth--;
4697 }
4698
4699 static void
4700 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
4701 {
4702 foreach_list_typed (nir_cf_node, node, node, list) {
4703 switch (node->type) {
4704 case nir_cf_node_block:
4705 emit_block(ctx, nir_cf_node_as_block(node));
4706 break;
4707 case nir_cf_node_if:
4708 emit_if(ctx, nir_cf_node_as_if(node));
4709 break;
4710 case nir_cf_node_loop:
4711 emit_loop(ctx, nir_cf_node_as_loop(node));
4712 break;
4713 case nir_cf_node_function:
4714 ir3_context_error(ctx, "TODO\n");
4715 break;
4716 }
4717 }
4718 }
4719
4720 /* emit stream-out code. At this point, the current block is the original
4721 * (nir) end block, and nir ensures that all flow control paths terminate
4722 * into the end block. We re-purpose the original end block to generate
4723 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
4724 * block holding stream-out write instructions, followed by the new end
4725 * block:
4726 *
4727 * blockOrigEnd {
4728 * p0.x = (vtxcnt < maxvtxcnt)
4729 * // succs: blockStreamOut, blockNewEnd
4730 * }
4731 * blockStreamOut {
4732 * // preds: blockOrigEnd
4733 * ... stream-out instructions ...
4734 * // succs: blockNewEnd
4735 * }
4736 * blockNewEnd {
4737 * // preds: blockOrigEnd, blockStreamOut
4738 * }
4739 */
4740 static void
4741 emit_stream_out(struct ir3_context *ctx)
4742 {
4743 struct ir3 *ir = ctx->ir;
4744 struct ir3_stream_output_info *strmout = &ctx->so->stream_output;
4745 struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
4746 struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
4747 struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
4748
4749 /* create vtxcnt input in input block at top of shader,
4750 * so that it is seen as live over the entire duration
4751 * of the shader:
4752 */
4753 vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
4754 maxvtxcnt = create_driver_param(ctx, IR3_DP_VS(vtxcnt_max));
4755
4756 /* at this point, we are at the original 'end' block,
4757 * re-purpose this block to stream-out condition, then
4758 * append stream-out block and new-end block
4759 */
4760 orig_end_block = ctx->block;
4761
4762 // maybe w/ store_global intrinsic, we could do this
4763 // stuff in nir->nir pass
4764
4765 stream_out_block = ir3_block_create(ir);
4766 list_addtail(&stream_out_block->node, &ir->block_list);
4767
4768 new_end_block = ir3_block_create(ir);
4769 list_addtail(&new_end_block->node, &ir->block_list);
4770
4771 orig_end_block->successors[0] = stream_out_block;
4772 orig_end_block->successors[1] = new_end_block;
4773
4774 stream_out_block->successors[0] = new_end_block;
4775
4776 /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
4777 cond = ir3_CMPS_S(&ctx->build, vtxcnt, 0, maxvtxcnt, 0);
4778 cond->dsts[0]->flags |= IR3_REG_PREDICATE;
4779 cond->cat2.condition = IR3_COND_LT;
4780
4781 /* the condition goes in the block prior to the conditional,
4782 * since it is used to pick which of the two successor
4783 * paths to take:
4784 */
4785 ir3_BR(&ctx->build, cond, IR3_REG_PREDICATE);
4786
4787 /* switch to stream_out_block to generate the stream-out
4788 * instructions:
4789 */
4790 ir3_context_set_block(ctx, stream_out_block);
4791
4792 /* Calculate base addresses based on vtxcnt. Instructions
4793 * generated for bases not used in following loop will be
4794 * stripped out in the backend.
4795 */
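/* In effect this computes bases[i] = (TFBO const for buffer i) +
 * vtxcnt * stride[i] * 4, in bytes; a 24-bit multiply is used since
 * that range is sufficient here.
 */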
4796 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
4797 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
4798 unsigned stride = strmout->stride[i];
4799 struct ir3_instruction *base, *off;
4800
4801 base = create_uniform(
4802 &ctx->build,
4803 ir3_const_reg(const_state, IR3_CONST_ALLOC_TFBO, i));
4804
4805 /* 24-bit should be enough: */
4806 off = ir3_MUL_U24(&ctx->build, vtxcnt, 0,
4807 create_immed(&ctx->build, stride * 4), 0);
4808
4809 bases[i] = ir3_ADD_S(&ctx->build, off, 0, base, 0);
4810 }
4811
4812 /* Generate the per-output store instructions: */
4813 for (unsigned i = 0; i < strmout->num_outputs; i++) {
4814 for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
4815 unsigned c = j + strmout->output[i].start_component;
4816 struct ir3_instruction *base, *out, *stg;
4817
4818 base = bases[strmout->output[i].output_buffer];
4819 out = ctx->outputs[regid(strmout->output[i].register_index, c)];
4820
4821 stg = ir3_STG(
4822 &ctx->build, base, 0,
4823 create_immed(&ctx->build, (strmout->output[i].dst_offset + j) * 4),
4824 0, out, 0, create_immed(&ctx->build, 1), 0);
4825 stg->cat6.type = TYPE_U32;
4826
4827 array_insert(ctx->block, ctx->block->keeps, stg);
4828 }
4829 }
4830
4831 ir3_JUMP(&ctx->build);
4832
4833 /* and finally switch to the new_end_block: */
4834 ir3_context_set_block(ctx, new_end_block);
4835 }
4836
4837 static void
4838 setup_predecessors(struct ir3 *ir)
4839 {
4840 foreach_block (block, &ir->block_list) {
4841 for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
4842 if (block->successors[i])
4843 ir3_block_add_predecessor(block->successors[i], block);
4844 }
4845 }
4846 }
4847
4848 static void
4849 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
4850 {
4851 nir_metadata_require(impl, nir_metadata_block_index);
4852
4853 emit_cf_list(ctx, &impl->body);
4854 emit_block(ctx, impl->end_block);
4855
4856 /* at this point, we should have a single empty block,
4857 * into which we emit the 'end' instruction.
4858 */
4859 compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
4860
4861 /* If stream-out (aka transform-feedback) enabled, emit the
4862 * stream-out instructions, followed by a new empty block (into
4863 * which the 'end' instruction lands).
4864 *
4865 * NOTE: it is done in this order, rather than inserting before
4866 * we emit end_block, because NIR guarantees that all blocks
4867 * flow into end_block, and that end_block has no successors.
4868 * So by re-purposing end_block as the first block of stream-
4869 * out, we guarantee that all exit paths flow into the stream-
4870 * out instructions.
4871 */
4872 if ((ctx->compiler->gen < 5) &&
4873 (ctx->so->stream_output.num_outputs > 0) &&
4874 !ctx->so->binning_pass) {
4875 assert(ctx->so->type == MESA_SHADER_VERTEX);
4876 emit_stream_out(ctx);
4877 }
4878
4879 setup_predecessors(ctx->ir);
4880 foreach_block (block, &ctx->ir->block_list) {
4881 resolve_phis(ctx, block);
4882 }
4883 }
4884
4885 static void
4886 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4887 {
4888 struct ir3_shader_variant *so = ctx->so;
4889 struct ir3_instruction *coord = NULL;
4890
4891 if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
4892 coord =
4893 ir3_create_collect(&ctx->build, ir3_get_src(ctx, &intr->src[0]), 2);
4894
4895 compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
4896
4897 unsigned frac = nir_intrinsic_component(intr);
4898 unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
4899 unsigned ncomp = nir_intrinsic_dest_components(intr);
4900 unsigned n = nir_intrinsic_base(intr) + offset;
4901 unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
4902 unsigned compmask = BITFIELD_MASK(ncomp + frac);
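/* A hypothetical example: base = 2, constant offset = 1, component = 1 and
 * ncomp = 2 gives n = 3, frac = 1 and compmask = 0b0111 -- the mask always
 * starts at component 0 even when frac is non-zero.
 */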
4903
4904 /* Inputs are loaded using ldlw or ldg for other stages. */
4905 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
4906 ctx->so->type == MESA_SHADER_VERTEX);
4907
4908 /* for clip+cull distances, unused components can't be eliminated because
4909 * they're read by fixed-function, even if there's a hole. Note that
4910 * clip/cull distance arrays must be declared in the FS, so we can just
4911 * use the NIR clip/cull distances to avoid reading ucp_enables in the
4912 * shader key.
4913 */
4914 if (ctx->so->type == MESA_SHADER_FRAGMENT &&
4915 (slot == VARYING_SLOT_CLIP_DIST0 ||
4916 slot == VARYING_SLOT_CLIP_DIST1)) {
4917 unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4918
4919 if (slot == VARYING_SLOT_CLIP_DIST0)
4920 compmask = clip_cull_mask & 0xf;
4921 else
4922 compmask = clip_cull_mask >> 4;
4923 }
4924
4925 /* for a4xx+ rasterflat */
4926 if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
4927 coord = NULL;
4928
4929 so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
4930
4931 so->inputs[n].slot = slot;
4932 so->inputs[n].compmask |= compmask;
4933 so->inputs_count = MAX2(so->inputs_count, n + 1);
4934 compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
4935 so->inputs[n].flat = !coord;
4936
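   /* ctx->inputs[]/ctx->last_dst[] are tracked per scalar component, four
    * slots per vec4 input, hence the (n * 4) + component indexing below.
    */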
4937 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4938 compile_assert(ctx, slot != VARYING_SLOT_POS);
4939
4940 so->inputs[n].bary = true;
4941 unsigned idx = (n * 4) + frac;
4942 struct ir3_instruction_rpt instr =
4943 create_frag_input(ctx, coord, idx, ncomp);
4944 cp_instrs(ctx->last_dst, instr.rpts, ncomp);
4945
4946 if (slot == VARYING_SLOT_PRIMITIVE_ID)
4947 so->reads_primid = true;
4948
4949 so->inputs[n].inloc = 4 * n;
4950 so->varying_in = MAX2(so->varying_in, 4 * n + 4);
4951 } else {
4952 struct ir3_instruction *input = NULL;
4953
4954 foreach_input (in, ctx->ir) {
4955 if (in->input.inidx == n) {
4956 input = in;
4957 break;
4958 }
4959 }
4960
4961 if (!input) {
4962 input = create_input(ctx, compmask);
4963 input->input.inidx = n;
4964 } else {
4965          /* For aliased inputs, just OR into the wrmask.. ie. if we
4966           * first see a vec2 at slot N, and then later a vec4,
4967           * the wrmask of the resulting overlapping vec2 and vec4 is 0xf
4968 */
4969 input->dsts[0]->wrmask |= compmask;
4970 }
4971
4972 for (int i = 0; i < ncomp + frac; i++) {
4973 unsigned idx = (n * 4) + i;
4974 compile_assert(ctx, idx < ctx->ninputs);
4975
4976 /* fixup the src wrmask to avoid validation fail */
4977 if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
4978 ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
4979 continue;
4980 }
4981
4982 ir3_split_dest(&ctx->build, &ctx->inputs[idx], input, i, 1);
4983 }
4984
4985 for (int i = 0; i < ncomp; i++) {
4986 unsigned idx = (n * 4) + i + frac;
4987 ctx->last_dst[i] = ctx->inputs[idx];
4988 }
4989 }
4990 }
4991
4992 /* Initially we assign non-packed inloc's for varyings, as we don't really
4993 * know up-front which components will be unused. After all the compilation
4994 * stages we scan the shader to see which components are actually used, and
4995 * re-pack the inlocs to eliminate unneeded varyings.
4996 */
4997 static void
4998 pack_inlocs(struct ir3_context *ctx)
4999 {
5000 struct ir3_shader_variant *so = ctx->so;
5001 uint8_t used_components[so->inputs_count];
5002
5003 memset(used_components, 0, sizeof(used_components));
5004
5005 /*
5006 * First Step: scan shader to find which bary.f/ldlv remain:
5007 */
5008
5009 foreach_block (block, &ctx->ir->block_list) {
5010 foreach_instr (instr, &block->instr_list) {
5011 if (is_input(instr)) {
5012 unsigned inloc = instr->srcs[0]->iim_val;
5013 unsigned i = inloc / 4;
5014 unsigned j = inloc % 4;
5015
5016 compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
5017 compile_assert(ctx, i < so->inputs_count);
5018
5019 used_components[i] |= 1 << j;
5020 } else if (instr->opc == OPC_META_TEX_PREFETCH) {
5021 for (int n = 0; n < 2; n++) {
5022 unsigned inloc = instr->prefetch.input_offset + n;
5023 unsigned i = inloc / 4;
5024 unsigned j = inloc % 4;
5025
5026 compile_assert(ctx, i < so->inputs_count);
5027
5028 used_components[i] |= 1 << j;
5029 }
5030 }
5031 }
5032 }
5033
5034 /*
5035 * Second Step: reassign varying inloc/slots:
5036 */
5037
5038 unsigned inloc = 0;
5039
5040 /* for clip+cull distances, unused components can't be eliminated because
5041 * they're read by fixed-function, even if there's a hole. Note that
5042 * clip/cull distance arrays must be declared in the FS, so we can just
5043 * use the NIR clip/cull distances to avoid reading ucp_enables in the
5044 * shader key.
5045 */
5046 unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
5047
5048 so->varying_in = 0;
5049
5050 for (unsigned i = 0; i < so->inputs_count; i++) {
5051 unsigned compmask = 0, maxcomp = 0;
5052
5053 so->inputs[i].inloc = inloc;
5054 so->inputs[i].bary = false;
5055
5056 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
5057 so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
5058 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
5059 compmask = clip_cull_mask & 0xf;
5060 else
5061 compmask = clip_cull_mask >> 4;
5062 used_components[i] = compmask;
5063 }
5064
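      /* Build a contiguous compmask from the used components. Since the
       * packed mask is (1 << maxcomp) - 1, sparse usage like .x + .z
       * (used_components[i] == 0x5) still occupies three inloc slots.
       */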
5065 for (unsigned j = 0; j < 4; j++) {
5066 if (!(used_components[i] & (1 << j)))
5067 continue;
5068
5069 compmask |= (1 << j);
5070 maxcomp = j + 1;
5071
5072 /* at this point, since used_components[i] mask is only
5073 * considering varyings (ie. not sysvals) we know this
5074 * is a varying:
5075 */
5076 so->inputs[i].bary = true;
5077 }
5078
5079 if (so->inputs[i].bary) {
5080 so->varying_in++;
5081 so->inputs[i].compmask = (1 << maxcomp) - 1;
5082 inloc += maxcomp;
5083 }
5084 }
5085
5086 /*
5087 * Third Step: reassign packed inloc's:
5088 */
5089
5090 foreach_block (block, &ctx->ir->block_list) {
5091 foreach_instr (instr, &block->instr_list) {
5092 if (is_input(instr)) {
5093 unsigned inloc = instr->srcs[0]->iim_val;
5094 unsigned i = inloc / 4;
5095 unsigned j = inloc % 4;
5096
5097 instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
5098 if (instr->opc == OPC_FLAT_B)
5099 instr->srcs[1]->iim_val = instr->srcs[0]->iim_val;
5100 } else if (instr->opc == OPC_META_TEX_PREFETCH) {
5101 unsigned i = instr->prefetch.input_offset / 4;
5102 unsigned j = instr->prefetch.input_offset % 4;
5103 instr->prefetch.input_offset = so->inputs[i].inloc + j;
5104 }
5105 }
5106 }
5107 }
5108
5109 static void
5110 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
5111 {
5112 struct ir3_shader_variant *so = ctx->so;
5113 nir_io_semantics io = nir_intrinsic_io_semantics(intr);
5114
5115 nir_src offset_src = *nir_get_io_offset_src(intr);
5116 compile_assert(ctx, nir_src_is_const(offset_src));
5117
5118 unsigned offset = nir_src_as_uint(offset_src);
5119 unsigned frac = nir_intrinsic_component(intr);
5120 unsigned ncomp = nir_intrinsic_src_components(intr, 0);
5121 unsigned slot = io.location + offset;
5122
5123 /* For per-view variables, each user-facing slot corresponds to multiple
5124 * views, each with a corresponding driver_location, and the view index
5125 * offsets the driver_location. */
5126 unsigned view_index = intr->intrinsic == nir_intrinsic_store_per_view_output
5127 ? nir_src_as_uint(intr->src[1])
5128 : 0;
5129 unsigned n = nir_intrinsic_base(intr) + offset + view_index;
5130
5131 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5132 switch (slot) {
5133 case FRAG_RESULT_DEPTH:
5134 so->writes_pos = true;
5135 break;
5136 case FRAG_RESULT_COLOR:
5137 if (!ctx->s->info.fs.color_is_dual_source) {
5138 so->color0_mrt = 1;
5139 } else {
5140 slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
5141 if (io.dual_source_blend_index > 0)
5142 so->dual_src_blend = true;
5143 }
5144 break;
5145 case FRAG_RESULT_SAMPLE_MASK:
5146 so->writes_smask = true;
5147 break;
5148 case FRAG_RESULT_STENCIL:
5149 so->writes_stencilref = true;
5150 break;
5151 default:
5152 slot += io.dual_source_blend_index; /* For dual-src blend */
5153 if (io.dual_source_blend_index > 0)
5154 so->dual_src_blend = true;
5155 if (slot >= FRAG_RESULT_DATA0)
5156 break;
5157 ir3_context_error(ctx, "unknown FS output name: %s\n",
5158 gl_frag_result_name(slot));
5159 }
5160 } else if (ctx->so->type == MESA_SHADER_VERTEX ||
5161 ctx->so->type == MESA_SHADER_TESS_EVAL ||
5162 ctx->so->type == MESA_SHADER_GEOMETRY) {
5163 switch (slot) {
5164 case VARYING_SLOT_POS:
5165 so->writes_pos = true;
5166 break;
5167 case VARYING_SLOT_PSIZ:
5168 so->writes_psize = true;
5169 break;
5170 case VARYING_SLOT_VIEWPORT:
5171 so->writes_viewport = true;
5172 break;
5173 case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
5174 so->writes_shading_rate = true;
5175 break;
5176 case VARYING_SLOT_PRIMITIVE_ID:
5177 case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
5178 assert(ctx->so->type == MESA_SHADER_GEOMETRY);
5179 FALLTHROUGH;
5180 case VARYING_SLOT_COL0:
5181 case VARYING_SLOT_COL1:
5182 case VARYING_SLOT_BFC0:
5183 case VARYING_SLOT_BFC1:
5184 case VARYING_SLOT_FOGC:
5185 case VARYING_SLOT_CLIP_DIST0:
5186 case VARYING_SLOT_CLIP_DIST1:
5187 case VARYING_SLOT_CLIP_VERTEX:
5188 case VARYING_SLOT_LAYER:
5189 break;
5190 default:
5191 if (slot >= VARYING_SLOT_VAR0)
5192 break;
5193 if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
5194 break;
5195 ir3_context_error(ctx, "unknown %s shader output name: %s\n",
5196 _mesa_shader_stage_to_string(ctx->so->type),
5197 gl_varying_slot_name_for_stage(slot, ctx->so->type));
5198 }
5199 } else {
5200 ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
5201 }
5202
5203 so->outputs_count = MAX2(so->outputs_count, n + 1);
5204 compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs));
5205
5206 so->outputs[n].slot = slot;
5207 if (view_index > 0)
5208 so->multi_pos_output = true;
5209 so->outputs[n].view = view_index;
5210
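   /* Seed the written components with 0.0 placeholders first; the real
    * sources from the store intrinsic are copied over them below, after
    * padding any leading hole components.
    */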
5211 for (int i = 0; i < ncomp; i++) {
5212 unsigned idx = (n * 4) + i + frac;
5213 compile_assert(ctx, idx < ctx->noutputs);
5214 ctx->outputs[idx] = create_immed(&ctx->build, fui(0.0));
5215 }
5216
5217 /* if varying packing doesn't happen, we could end up in a situation
5218 * with "holes" in the output, and since the per-generation code that
5219 * sets up varying linkage registers doesn't expect to have more than
5220 * one varying per vec4 slot, pad the holes.
5221 *
5222 * Note that this should probably generate a performance warning of
5223 * some sort.
5224 */
5225 for (int i = 0; i < frac; i++) {
5226 unsigned idx = (n * 4) + i;
5227 if (!ctx->outputs[idx]) {
5228 ctx->outputs[idx] = create_immed(&ctx->build, fui(0.0));
5229 }
5230 }
5231
5232 struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
5233 for (int i = 0; i < ncomp; i++) {
5234 unsigned idx = (n * 4) + i + frac;
5235 ctx->outputs[idx] = src[i];
5236 }
5237 }
5238
5239 static bool
5240 uses_load_input(struct ir3_shader_variant *so)
5241 {
5242 return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
5243 }
5244
5245 static bool
5246 uses_store_output(struct ir3_shader_variant *so)
5247 {
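   /* Stages that pass their outputs through memory (TCS, and VS/DS when a
    * later tessellation/geometry stage consumes them) never use
    * store_output, and compute has no outputs at all.
    */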
5248 switch (so->type) {
5249 case MESA_SHADER_VERTEX:
5250 return !so->key.has_gs && !so->key.tessellation;
5251 case MESA_SHADER_TESS_EVAL:
5252 return !so->key.has_gs;
5253 case MESA_SHADER_GEOMETRY:
5254 case MESA_SHADER_FRAGMENT:
5255 return true;
5256 case MESA_SHADER_TESS_CTRL:
5257 case MESA_SHADER_COMPUTE:
5258 case MESA_SHADER_KERNEL:
5259 return false;
5260 default:
5261 unreachable("unknown stage");
5262 }
5263 }
5264
5265 static void
5266 emit_instructions(struct ir3_context *ctx)
5267 {
5268 MESA_TRACE_FUNC();
5269
5270 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
5271
5272 /* some varying setup which can't be done in setup_input(): */
5273 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5274 nir_foreach_shader_in_variable (var, ctx->s) {
5275 /* set rasterflat flag for front/back color */
5276 if (var->data.interpolation == INTERP_MODE_NONE) {
5277 switch (var->data.location) {
5278 case VARYING_SLOT_COL0:
5279 case VARYING_SLOT_COL1:
5280 case VARYING_SLOT_BFC0:
5281 case VARYING_SLOT_BFC1:
5282 ctx->so->inputs[var->data.driver_location].rasterflat = true;
5283 break;
5284 default:
5285 break;
5286 }
5287 }
5288 }
5289 }
5290
5291 if (uses_load_input(ctx->so)) {
5292 ctx->so->inputs_count = ctx->s->num_inputs;
5293 compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
5294 ctx->ninputs = ctx->s->num_inputs * 4;
5295 ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
5296 } else {
5297 ctx->ninputs = 0;
5298 ctx->so->inputs_count = 0;
5299 }
5300
5301 if (uses_store_output(ctx->so)) {
5302 ctx->noutputs = ctx->s->num_outputs * 4;
5303 ctx->outputs =
5304 rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
5305 } else {
5306 ctx->noutputs = 0;
5307 }
5308
5309 ctx->ir = ir3_create(ctx->compiler, ctx->so);
5310
5311 /* Create inputs in first block: */
5312 ir3_context_set_block(ctx, get_block(ctx, nir_start_block(fxn)));
5313 ctx->in_block = ctx->block;
5314
5315 /* for fragment shader, the vcoord input register is used as the
5316 * base for bary.f varying fetch instrs:
5317 *
5318 * TODO defer creating ctx->ij_pixel and corresponding sysvals
5319 * until emit_intrinsic when we know they are actually needed.
5320 * For now, we defer creating ctx->ij_centroid, etc, since we
5321 * only need ij_pixel for "old style" varying inputs (ie.
5322 * tgsi_to_nir)
5323 */
5324 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5325 ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
5326 }
5327
5328 /* Defer add_sysval_input() stuff until after setup_inputs(),
5329 * because sysvals need to be appended after varyings:
5330 */
5331 if (ctx->ij[IJ_PERSP_PIXEL]) {
5332 add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
5333 ctx->ij[IJ_PERSP_PIXEL]);
5334 }
5335
5336    /* Tessellation shaders always need primitive ID for indexing the
5337     * BO. Geometry shaders don't always need it, but when they do it has to be
5338 * delivered and unclobbered in the VS. To make things easy, we always
5339 * make room for it in VS/DS.
5340 */
5341 bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
5342 bool has_gs = ctx->so->key.has_gs;
5343 switch (ctx->so->type) {
5344 case MESA_SHADER_VERTEX:
5345 if (has_tess) {
5346 ctx->tcs_header =
5347 create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5348 ctx->rel_patch_id =
5349 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5350 ctx->primitive_id =
5351 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5352 } else if (has_gs) {
5353 ctx->gs_header =
5354 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5355 ctx->primitive_id =
5356 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5357 }
5358 break;
5359 case MESA_SHADER_TESS_CTRL:
5360 ctx->tcs_header =
5361 create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5362 ctx->rel_patch_id =
5363 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5364 break;
5365 case MESA_SHADER_TESS_EVAL:
5366 if (has_gs) {
5367 ctx->gs_header =
5368 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5369 ctx->primitive_id =
5370 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5371 }
5372 ctx->rel_patch_id =
5373 create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5374 break;
5375 case MESA_SHADER_GEOMETRY:
5376 ctx->gs_header =
5377 create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5378 break;
5379 default:
5380 break;
5381 }
5382
5383    /* Find # of samplers. Just assume that we'll be reading from images.. if
5384     * an image is write-only we wouldn't have to count it, but after lowering
5385     * derefs it is too late to compact the indices for that.
5386 */
5387 ctx->so->num_samp =
5388 BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
5389
5390 /* Save off clip+cull information. Note that in OpenGL clip planes may
5391     * be individually enabled/disabled, and some gens handle the lowering in
5392     * the backend, so we also need to consider the shader key:
5393 */
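   /* For example, 6 clip distances and 2 cull distances yield
    * clip_mask = 0x3f (plus any key.ucp_enables bits) and cull_mask = 0xc0.
    */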
5394 ctx->so->clip_mask = ctx->so->key.ucp_enables |
5395 MASK(ctx->s->info.clip_distance_array_size);
5396 ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
5397 << ctx->s->info.clip_distance_array_size;
5398
5399 ctx->so->pvtmem_size = ctx->s->scratch_size;
5400 ctx->so->shared_size = ctx->s->info.shared_size;
5401
5402 /* NOTE: need to do something more clever when we support >1 fxn */
5403 nir_foreach_reg_decl (decl, fxn) {
5404 ir3_declare_array(ctx, decl);
5405 }
5406
5407 /* And emit the body: */
5408 ctx->impl = fxn;
5409 emit_function(ctx, fxn);
5410
5411 if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
5412 ctx->compiler->tess_use_shared) {
5413 /* Anything before shpe seems to be ignored in the main shader when early
5414 * preamble is enabled on a7xx, so we have to put the barrier after.
5415 */
5416 struct ir3_block *block = ir3_after_preamble(ctx->ir);
5417 struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
5418
5419 struct ir3_instruction *barrier = ir3_BAR(&build);
5420 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
5421 barrier->barrier_class = IR3_BARRIER_EVERYTHING;
5422 array_insert(block, block->keeps, barrier);
5423 ctx->so->has_barrier = true;
5424
5425 /* Move the barrier to the beginning of the block but after any phi/input
5426 * meta instructions that must be at the beginning. It must be before we
5427 * load VS outputs.
5428 */
5429 foreach_instr (instr, &block->instr_list) {
5430 if (instr->opc != OPC_META_INPUT &&
5431 instr->opc != OPC_META_TEX_PREFETCH &&
5432 instr->opc != OPC_META_PHI) {
5433 ir3_instr_move_before(barrier, instr);
5434 break;
5435 }
5436 }
5437 }
5438 }
5439
5440 /* Fixup tex sampler state for astc/srgb workaround instructions. We
5441 * need to assign the tex state indexes for these after we know the
5442 * max tex index.
5443 */
5444 static void
5445 fixup_astc_srgb(struct ir3_context *ctx)
5446 {
5447 struct ir3_shader_variant *so = ctx->so;
5448 /* indexed by original tex idx, value is newly assigned alpha sampler
5449 * state tex idx. Zero is invalid since there is at least one sampler
5450 * if we get here.
5451 */
5452 unsigned alt_tex_state[16] = {0};
5453 unsigned tex_idx = ctx->max_texture_index + 1;
5454 unsigned idx = 0;
5455
5456 so->astc_srgb.base = tex_idx;
5457
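   /* The alternate alpha tex states are appended after the regular ones:
    * e.g. with max_texture_index == 3, the first workaround sampler gets
    * tex state slot 4, the next distinct one slot 5, and so on.
    */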
5458 for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
5459 struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
5460
5461 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5462
5463 if (alt_tex_state[sam->cat5.tex] == 0) {
5464 /* assign new alternate/alpha tex state slot: */
5465 alt_tex_state[sam->cat5.tex] = tex_idx++;
5466 so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
5467 so->astc_srgb.count++;
5468 }
5469
5470 sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5471 }
5472 }
5473
5474 /* Fixup tex sampler state for tg4 workaround instructions. We
5475 * need to assign the tex state indexes for these after we know the
5476 * max tex index.
5477 */
5478 static void
5479 fixup_tg4(struct ir3_context *ctx)
5480 {
5481 struct ir3_shader_variant *so = ctx->so;
5482 /* indexed by original tex idx, value is newly assigned alpha sampler
5483 * state tex idx. Zero is invalid since there is at least one sampler
5484 * if we get here.
5485 */
5486 unsigned alt_tex_state[16] = {0};
5487 unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1;
5488 unsigned idx = 0;
5489
5490 so->tg4.base = tex_idx;
5491
5492 for (unsigned i = 0; i < ctx->ir->tg4_count; i++) {
5493 struct ir3_instruction *sam = ctx->ir->tg4[i];
5494
5495 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5496
5497 if (alt_tex_state[sam->cat5.tex] == 0) {
5498 /* assign new alternate/alpha tex state slot: */
5499 alt_tex_state[sam->cat5.tex] = tex_idx++;
5500 so->tg4.orig_idx[idx++] = sam->cat5.tex;
5501 so->tg4.count++;
5502 }
5503
5504 sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5505 }
5506 }
5507
5508 static void
5509 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
5510 {
5511 unsigned idx = 0;
5512
5513 /* Collect sampling instructions eligible for pre-dispatch. */
5514 foreach_block (block, &ir->block_list) {
5515 foreach_instr_safe (instr, &block->instr_list) {
5516 if (instr->opc == OPC_META_TEX_PREFETCH) {
5517 assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
5518 struct ir3_sampler_prefetch *fetch =
5519 &ctx->so->sampler_prefetch[idx];
5520 idx++;
5521
5522 fetch->bindless = instr->flags & IR3_INSTR_B;
5523 if (fetch->bindless) {
5524 /* In bindless mode, the index is actually the base */
5525 fetch->tex_id = instr->prefetch.tex_base;
5526 fetch->samp_id = instr->prefetch.samp_base;
5527 fetch->tex_bindless_id = instr->prefetch.tex;
5528 fetch->samp_bindless_id = instr->prefetch.samp;
5529 } else {
5530 fetch->tex_id = instr->prefetch.tex;
5531 fetch->samp_id = instr->prefetch.samp;
5532 }
5533 fetch->tex_opc = OPC_SAM;
5534 fetch->wrmask = instr->dsts[0]->wrmask;
5535 fetch->dst = instr->dsts[0]->num;
5536 fetch->src = instr->prefetch.input_offset;
5537
5538 /* These are the limits on a5xx/a6xx, we might need to
5539 * revisit if SP_FS_PREFETCH[n] changes on later gens:
5540 */
5541 assert(fetch->dst <= 0x3f);
5542 assert(fetch->tex_id <= 0x1f);
5543 assert(fetch->samp_id <= 0xf);
5544
5545 ctx->so->total_in =
5546 MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
5547
5548 fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
5549
5550 /* Remove the prefetch placeholder instruction: */
5551 list_delinit(&instr->node);
5552 }
5553 }
5554 }
5555 }
5556
5557 int
5558 ir3_compile_shader_nir(struct ir3_compiler *compiler,
5559 struct ir3_shader *shader,
5560 struct ir3_shader_variant *so)
5561 {
5562 struct ir3_context *ctx;
5563 struct ir3 *ir;
5564 int ret = 0, max_bary;
5565 bool progress;
5566
5567 MESA_TRACE_FUNC();
5568
5569 assert(!so->ir);
5570
5571 ctx = ir3_context_init(compiler, shader, so);
5572 if (!ctx) {
5573 DBG("INIT failed!");
5574 ret = -1;
5575 goto out;
5576 }
5577
5578 emit_instructions(ctx);
5579
5580 if (ctx->error) {
5581 DBG("EMIT failed!");
5582 ret = -1;
5583 goto out;
5584 }
5585
5586 ir = so->ir = ctx->ir;
5587
5588 if (gl_shader_stage_is_compute(so->type)) {
5589 so->local_size[0] = ctx->s->info.workgroup_size[0];
5590 so->local_size[1] = ctx->s->info.workgroup_size[1];
5591 so->local_size[2] = ctx->s->info.workgroup_size[2];
5592 so->local_size_variable = ctx->s->info.workgroup_size_variable;
5593 }
5594
5595 if (so->type == MESA_SHADER_FRAGMENT && so->reads_shading_rate &&
5596 !so->reads_smask &&
5597 compiler->reading_shading_rate_requires_smask_quirk) {
5598 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
5599 }
5600
5601    /* Vertex shaders in a tessellation or geometry pipeline treat END as a
5602     * NOP and have an epilogue that writes the VS outputs to local storage, to
5603     * be read by the HS. The epilogue then resets the execution mask (chmask)
5604     * and chains to the next shader (chsh). There are also a few output values which we
5605 * must send to the next stage via registers, and in order for both stages
5606 * to agree on the register used we must force these to be in specific
5607 * registers.
5608 */
5609 if ((so->type == MESA_SHADER_VERTEX &&
5610 (so->key.has_gs || so->key.tessellation)) ||
5611 (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
5612 struct ir3_instruction *outputs[3];
5613 unsigned outidxs[3];
5614 unsigned regids[3];
5615 unsigned outputs_count = 0;
5616
5617 if (ctx->primitive_id) {
5618 unsigned n = so->outputs_count++;
5619 so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
5620
5621 struct ir3_instruction *out =
5622 ir3_collect(&ctx->build, ctx->primitive_id);
5623 outputs[outputs_count] = out;
5624 outidxs[outputs_count] = n;
5625 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
5626 regids[outputs_count] = regid(0, 2);
5627 else
5628 regids[outputs_count] = regid(0, 1);
5629 outputs_count++;
5630 }
5631
5632 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
5633 unsigned n = so->outputs_count++;
5634 so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
5635 struct ir3_instruction *out =
5636 ir3_collect(&ctx->build, ctx->rel_patch_id);
5637 outputs[outputs_count] = out;
5638 outidxs[outputs_count] = n;
5639 regids[outputs_count] = regid(0, 1);
5640 outputs_count++;
5641 }
5642
5643 if (ctx->gs_header) {
5644 unsigned n = so->outputs_count++;
5645 so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
5646 struct ir3_instruction *out = ir3_collect(&ctx->build, ctx->gs_header);
5647 outputs[outputs_count] = out;
5648 outidxs[outputs_count] = n;
5649 regids[outputs_count] = regid(0, 0);
5650 outputs_count++;
5651 }
5652
5653 if (ctx->tcs_header) {
5654 unsigned n = so->outputs_count++;
5655 so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
5656 struct ir3_instruction *out =
5657 ir3_collect(&ctx->build, ctx->tcs_header);
5658 outputs[outputs_count] = out;
5659 outidxs[outputs_count] = n;
5660 regids[outputs_count] = regid(0, 0);
5661 outputs_count++;
5662 }
5663
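      /* chmask resets the execution mask; its sources are the outputs
       * gathered above, pinned to the fixed registers the next stage
       * expects. chsh then chains to the next stage's shader.
       */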
5664 struct ir3_instruction *chmask =
5665 ir3_build_instr(&ctx->build, OPC_CHMASK, 0, outputs_count);
5666 chmask->barrier_class = IR3_BARRIER_EVERYTHING;
5667 chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
5668
5669 for (unsigned i = 0; i < outputs_count; i++)
5670 __ssa_src(chmask, outputs[i], 0)->num = regids[i];
5671
5672 chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
5673 memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5674
5675 array_insert(ctx->block, ctx->block->keeps, chmask);
5676
5677 struct ir3_instruction *chsh = ir3_CHSH(&ctx->build);
5678 chsh->barrier_class = IR3_BARRIER_EVERYTHING;
5679 chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
5680 } else {
5681 assert((ctx->noutputs % 4) == 0);
5682 unsigned outidxs[ctx->noutputs / 4];
5683 struct ir3_instruction *outputs[ctx->noutputs / 4];
5684 unsigned outputs_count = 0;
5685
5686 struct ir3_block *b = ctx->block;
5687 /* Insert these collect's in the block before the end-block if
5688 * possible, so that any moves they generate can be shuffled around to
5689 * reduce nop's:
5690 */
5691 if (ctx->block->predecessors_count == 1)
5692 b = ctx->block->predecessors[0];
5693
5694 /* Setup IR level outputs, which are "collects" that gather
5695 * the scalar components of outputs.
5696 */
5697 for (unsigned i = 0; i < ctx->noutputs; i += 4) {
5698 unsigned ncomp = 0;
5699 /* figure out the # of components written:
5700 *
5701 * TODO do we need to handle holes, ie. if .x and .z
5702 * components written, but .y component not written?
5703 */
5704 for (unsigned j = 0; j < 4; j++) {
5705 if (!ctx->outputs[i + j])
5706 break;
5707 ncomp++;
5708 }
5709
5710 /* Note that in some stages, like TCS, store_output is
5711           * lowered to memory writes, so no components of the output
5712 * are "written" from the PoV of traditional store-
5713 * output instructions:
5714 */
5715 if (!ncomp)
5716 continue;
5717
5718 struct ir3_builder build = ir3_builder_at(ir3_before_terminator(b));
5719 struct ir3_instruction *out =
5720 ir3_create_collect(&build, &ctx->outputs[i], ncomp);
5721
5722 int outidx = i / 4;
5723 assert(outidx < so->outputs_count);
5724
5725 outidxs[outputs_count] = outidx;
5726 outputs[outputs_count] = out;
5727 outputs_count++;
5728 }
5729
5730 /* for a6xx+, binning and draw pass VS use same VBO state, so we
5731 * need to make sure not to remove any inputs that are used by
5732 * the nonbinning VS.
5733 */
5734 if (ctx->compiler->gen >= 6 && so->binning_pass &&
5735 so->type == MESA_SHADER_VERTEX) {
5736 for (int i = 0; i < ctx->ninputs; i++) {
5737 struct ir3_instruction *in = ctx->inputs[i];
5738
5739 if (!in)
5740 continue;
5741
5742 unsigned n = i / 4;
5743 unsigned c = i % 4;
5744
5745 assert(n < so->nonbinning->inputs_count);
5746
5747 if (so->nonbinning->inputs[n].sysval)
5748 continue;
5749
5750 /* be sure to keep inputs, even if only used in VS */
5751 if (so->nonbinning->inputs[n].compmask & (1 << c))
5752 array_insert(in->block, in->block->keeps, in);
5753 }
5754 }
5755
5756 struct ir3_instruction *end =
5757 ir3_build_instr(&ctx->build, OPC_END, 0, outputs_count);
5758
5759 for (unsigned i = 0; i < outputs_count; i++) {
5760 __ssa_src(end, outputs[i], 0);
5761 }
5762
5763 end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
5764 memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5765
5766 array_insert(ctx->block, ctx->block->keeps, end);
5767 }
5768
5769 if (so->type == MESA_SHADER_FRAGMENT &&
5770 ctx->s->info.fs.needs_quad_helper_invocations) {
5771 so->need_pixlod = true;
5772 so->need_full_quad = true;
5773 }
5774
5775 ir3_debug_print(ir, "AFTER: nir->ir3");
5776 ir3_validate(ir);
5777
5778 IR3_PASS(ir, ir3_remove_unreachable);
5779
5780 IR3_PASS(ir, ir3_array_to_ssa);
5781
5782 ir3_calc_reconvergence(so);
5783
5784 IR3_PASS(ir, ir3_lower_shared_phis);
5785
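   /* Iterate the cleanup/optimization passes to a fixed point, since each
    * one can expose new opportunities for the others:
    */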
5786 do {
5787 progress = false;
5788
5789 /* the folding doesn't seem to work reliably on a4xx */
5790 if (ctx->compiler->gen != 4)
5791 progress |= IR3_PASS(ir, ir3_cf);
5792 progress |= IR3_PASS(ir, ir3_cp, so);
5793 progress |= IR3_PASS(ir, ir3_cse);
5794 progress |= IR3_PASS(ir, ir3_dce, so);
5795 progress |= IR3_PASS(ir, ir3_opt_predicates, so);
5796 progress |= IR3_PASS(ir, ir3_shared_fold);
5797 } while (progress);
5798
5799 progress = IR3_PASS(ir, ir3_create_alias_tex_regs);
5800 progress |= IR3_PASS(ir, ir3_create_alias_rt, so);
5801
5802 if (progress) {
5803 IR3_PASS(ir, ir3_dce, so);
5804 }
5805
5806 IR3_PASS(ir, ir3_sched_add_deps);
5807
5808 /* At this point, all the dead code should be long gone: */
5809 assert(!IR3_PASS(ir, ir3_dce, so));
5810
5811 ret = ir3_sched(ir);
5812 if (ret) {
5813 DBG("SCHED failed!");
5814 goto out;
5815 }
5816
5817 ir3_debug_print(ir, "AFTER: ir3_sched");
5818
5819 /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
5820 * with draw pass VS, so binning and draw pass can both use the
5821 * same VBO state.
5822 *
5823 * Note that VS inputs are expected to be full precision.
5824 */
5825 bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
5826 (ir->type == MESA_SHADER_VERTEX) &&
5827 so->binning_pass;
5828
5829 if (pre_assign_inputs) {
5830 foreach_input (in, ir) {
5831 assert(in->opc == OPC_META_INPUT);
5832 unsigned inidx = in->input.inidx;
5833
5834 in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
5835 }
5836 } else if (ctx->tcs_header) {
5837 /* We need to have these values in the same registers between VS and TCS
5838 * since the VS chains to TCS and doesn't get the sysvals redelivered.
5839 */
5840
5841 ctx->tcs_header->dsts[0]->num = regid(0, 0);
5842 ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
5843 if (ctx->primitive_id)
5844 ctx->primitive_id->dsts[0]->num = regid(0, 2);
5845 } else if (ctx->gs_header) {
5846 /* We need to have these values in the same registers between producer
5847 * (VS or DS) and GS since the producer chains to GS and doesn't get
5848 * the sysvals redelivered.
5849 */
5850
5851 ctx->gs_header->dsts[0]->num = regid(0, 0);
5852 if (ctx->primitive_id)
5853 ctx->primitive_id->dsts[0]->num = regid(0, 1);
5854 } else if (so->num_sampler_prefetch) {
5855 assert(so->type == MESA_SHADER_FRAGMENT);
5856 int idx = 0;
5857
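      /* With FS sampler prefetch in use, pin the persp-pixel barycentric
       * input to r0.x/r0.y so it lands at a fixed, known location.
       */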
5858 foreach_input (instr, ir) {
5859 if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
5860 continue;
5861
5862 assert(idx < 2);
5863 instr->dsts[0]->num = idx;
5864 idx++;
5865 }
5866 }
5867
5868 IR3_PASS(ir, ir3_cleanup_rpt, so);
5869 ret = ir3_ra(so);
5870
5871 if (ret) {
5872 mesa_loge("ir3_ra() failed!");
5873 goto out;
5874 }
5875
5876 IR3_PASS(ir, ir3_merge_rpt, so);
5877 IR3_PASS(ir, ir3_postsched, so);
5878
5879 IR3_PASS(ir, ir3_legalize_relative);
5880 IR3_PASS(ir, ir3_lower_subgroups);
5881
5882 /* This isn't valid to do when transform feedback is done in HW, which is
5883 * a4xx onward, because the VS may use components not read by the FS for
5884 * transform feedback. Ideally we'd delete this, but a5xx and earlier seem to
5885 * be broken without it.
5886 */
5887 if (so->type == MESA_SHADER_FRAGMENT && ctx->compiler->gen < 6)
5888 pack_inlocs(ctx);
5889
5890 /*
5891 * Fixup inputs/outputs to point to the actual registers assigned:
5892 *
5893 * 1) initialize to r63.x (invalid/unused)
5894     * 2) iterate IR level inputs/outputs and update the variant's
5895 * inputs/outputs table based on the assigned registers for
5896 * the remaining inputs/outputs.
5897 */
5898
5899 for (unsigned i = 0; i < so->inputs_count; i++)
5900 so->inputs[i].regid = INVALID_REG;
5901 for (unsigned i = 0; i < so->outputs_count; i++)
5902 so->outputs[i].regid = INVALID_REG;
5903
5904 struct ir3_instruction *end = ir3_find_end(so->ir);
5905
5906 for (unsigned i = 0; i < end->srcs_count; i++) {
5907 unsigned outidx = end->end.outidxs[i];
5908 struct ir3_register *reg = end->srcs[i];
5909
5910 so->outputs[outidx].regid = reg->num;
5911 so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
5912 }
5913
5914 foreach_input (in, ir) {
5915 assert(in->opc == OPC_META_INPUT);
5916 unsigned inidx = in->input.inidx;
5917
5918 if (pre_assign_inputs && !so->inputs[inidx].sysval) {
5919 if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
5920 compile_assert(
5921 ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
5922 compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
5923 so->nonbinning->inputs[inidx].half);
5924 }
5925 so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
5926 so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
5927 } else {
5928 so->inputs[inidx].regid = in->dsts[0]->num;
5929 so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
5930 }
5931 }
5932
5933 uint8_t clip_cull_mask = ctx->so->clip_mask | ctx->so->cull_mask;
5934    /* Having a non-zero clip/cull mask and not writing the corresponding regs
5935 * leads to a GPU fault on A7XX.
5936 */
5937 if (clip_cull_mask &&
5938 ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST0) == regid(63, 0)) {
5939 ctx->so->clip_mask &= 0xf0;
5940 ctx->so->cull_mask &= 0xf0;
5941 }
5942 if ((clip_cull_mask >> 4) &&
5943 ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST1) == regid(63, 0)) {
5944 ctx->so->clip_mask &= 0xf;
5945 ctx->so->cull_mask &= 0xf;
5946 }
5947
5948 if (ctx->astc_srgb)
5949 fixup_astc_srgb(ctx);
5950
5951 if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather)
5952 fixup_tg4(ctx);
5953
5954    /* We need to do legalize after the "bary.f" offsets (inloc) have been
5955     * assigned (for frag shaders).
5956 */
5957 IR3_PASS(ir, ir3_legalize, so, &max_bary);
5958
5959 /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
5960 * know what we might have to wait on when coming in from VS chsh.
5961 */
5962 if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
5963 foreach_block (block, &ir->block_list) {
5964 foreach_instr (instr, &block->instr_list) {
5965 instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
5966 break;
5967 }
5968 }
5969 }
5970
5971 if (ctx->compiler->gen >= 7 && so->type == MESA_SHADER_COMPUTE) {
5972 struct ir3_instruction *end = ir3_find_end(so->ir);
5973 struct ir3_instruction *lock =
5974 ir3_build_instr(&ctx->build, OPC_LOCK, 0, 0);
5975       /* TODO: These flags should be set by the scheduler only when needed */
5976 lock->flags = IR3_INSTR_SS | IR3_INSTR_SY | IR3_INSTR_JP;
5977 ir3_instr_move_before(lock, end);
5978 struct ir3_instruction *unlock =
5979 ir3_build_instr(&ctx->build, OPC_UNLOCK, 0, 0);
5980 ir3_instr_move_before(unlock, end);
5981 }
5982
5983 so->pvtmem_size = ALIGN(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
5984
5985 /* Note that max_bary counts inputs that are not bary.f'd for FS: */
5986 if (so->type == MESA_SHADER_FRAGMENT)
5987 so->total_in = max_bary + 1;
5988
5989 /* Collect sampling instructions eligible for pre-dispatch. */
5990 collect_tex_prefetches(ctx, ir);
5991
5992 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5993 !ctx->s->info.fs.early_fragment_tests)
5994 ctx->so->no_earlyz |= ctx->s->info.writes_memory;
5995
5996 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5997 ctx->s->info.fs.post_depth_coverage)
5998 so->post_depth_coverage = true;
5999
6000 ctx->so->per_samp = ctx->s->info.fs.uses_sample_shading;
6001
6002 if (ctx->has_relative_load_const_ir3) {
6003 /* NOTE: if relative addressing is used, we set
6004 * constlen in the compiler (to worst-case value)
6005 * since we don't know in the assembler what the max
6006 * addr reg value can be:
6007 */
6008 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
6009 const enum ir3_const_alloc_type rel_const_srcs[] = {
6010 IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, IR3_CONST_ALLOC_UBO_RANGES,
6011 IR3_CONST_ALLOC_PREAMBLE, IR3_CONST_ALLOC_GLOBAL};
6012 for (int i = 0; i < ARRAY_SIZE(rel_const_srcs); i++) {
6013 const struct ir3_const_allocation *const_alloc =
6014 &const_state->allocs.consts[rel_const_srcs[i]];
6015 if (const_alloc->size_vec4 > 0) {
6016 ctx->so->constlen =
6017 MAX2(ctx->so->constlen,
6018 const_alloc->offset_vec4 + const_alloc->size_vec4);
6019 }
6020 }
6021 }
6022
6023 if (ctx->so->type == MESA_SHADER_FRAGMENT &&
6024 compiler->fs_must_have_non_zero_constlen_quirk) {
6025 so->constlen = MAX2(so->constlen, 4);
6026 }
6027
6028 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->compiler->gen >= 6) {
6029 so->constlen = MAX2(so->constlen, 8);
6030 }
6031
6032 if (gl_shader_stage_is_compute(so->type)) {
6033 so->cs.local_invocation_id =
6034 ir3_find_sysval_regid(so, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
6035 so->cs.work_group_id =
6036 ir3_find_sysval_regid(so, SYSTEM_VALUE_WORKGROUP_ID);
6037 } else {
6038 so->vtxid_base = ir3_find_sysval_regid(so, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
6039 }
6040
6041 out:
6042 if (ret) {
6043 if (so->ir)
6044 ir3_destroy(so->ir);
6045 so->ir = NULL;
6046 }
6047 ir3_context_free(ctx);
6048
6049 return ret;
6050 }
6051