/*
 * Copyright © 2022 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"
#include "nir_control_flow.h"
#include "nir_xfb_info.h"

#include "util/u_math.h"

#define OPT(nir, pass, ...) ({                             \
   bool this_progress = false;                             \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
   if (this_progress)                                      \
      progress = true;                                     \
   this_progress;                                          \
})

#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

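/* Returns true if every workgroup of this shader is guaranteed to fit in a
 * single subgroup (warp).  Later passes use this to relax workgroup-scope
 * operations: load_subgroup_id can be lowered to a constant 0 and
 * workgroup-scope control barriers can be demoted to subgroup scope.
 */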
bool
nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
{
   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_FRAGMENT:
      unreachable("Shader stage does not have workgroups");
      break;

   case MESA_SHADER_TESS_CTRL:
      /* Tessellation only ever has one subgroup per workgroup. The Vulkan
       * limit on the number of tessellation invocations is 32 to allow for
       * this.
       */
      return true;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL: {
      if (nir->info.workgroup_size_variable)
         return false;

      uint16_t wg_sz = nir->info.workgroup_size[0] *
                       nir->info.workgroup_size[1] *
                       nir->info.workgroup_size[2];

      return wg_sz <= NAK_SUBGROUP_SIZE;
   }

   default:
      unreachable("Unknown shader stage");
   }
}

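/* Filter callback for nir_lower_alu_width() and nir_opt_vectorize().  Only
 * 16-bit float ALU ops are kept or formed as 2-wide vectors, so they can be
 * turned into the hardware's paired-fp16 instructions; everything else is
 * scalarized.
 */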
static uint8_t
vectorize_filter_cb(const nir_instr *instr, const void *_data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   const unsigned bit_size = nir_alu_instr_is_comparison(alu)
                             ? alu->src[0].src.ssa->bit_size
                             : alu->def.bit_size;

   switch (alu->op) {
   case nir_op_fadd:
   case nir_op_fsub:
   case nir_op_fabs:
   case nir_op_fneg:
   case nir_op_feq:
   case nir_op_fge:
   case nir_op_flt:
   case nir_op_fneu:
   case nir_op_fmul:
   case nir_op_ffma:
   case nir_op_fsign:
   case nir_op_fsat:
   case nir_op_fmax:
   case nir_op_fmin:
      return bit_size == 16 ? 2 : 1;
   default:
      return 1;
   }
}

static void
optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
{
   bool progress;

   unsigned lower_flrp =
      (nir->options->lower_flrp16 ? 16 : 0) |
      (nir->options->lower_flrp32 ? 32 : 0) |
      (nir->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      /* This pass causes problems with the types used by OpenCL:
       * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
       *
       * Running with it disabled made no difference in the resulting
       * assembly code.
       */
      if (nir->info.stage != MESA_SHADER_KERNEL)
         OPT(nir, nir_split_array_vars, nir_var_function_temp);

      OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
      OPT(nir, nir_opt_deref);
      if (OPT(nir, nir_opt_memcpy))
         OPT(nir, nir_split_var_copies);

      OPT(nir, nir_lower_vars_to_ssa);

      if (allow_copies) {
         /* Only run this pass in the first call to optimize_nir.  Later
          * calls assume that we've lowered away any copy_deref instructions
          * and we don't want to introduce any more.
          */
         OPT(nir, nir_opt_find_array_copies);
      }
      OPT(nir, nir_opt_copy_prop_vars);
      OPT(nir, nir_opt_dead_write_vars);
      OPT(nir, nir_opt_combine_stores, nir_var_all);

      OPT(nir, nir_lower_alu_width, vectorize_filter_cb, NULL);
      OPT(nir, nir_opt_vectorize, vectorize_filter_cb, NULL);
      OPT(nir, nir_lower_phis_to_scalar, false);
      OPT(nir, nir_lower_frexp);
      OPT(nir, nir_copy_prop);
      OPT(nir, nir_opt_dce);
      OPT(nir, nir_opt_cse);

      OPT(nir, nir_opt_peephole_select, 0, false, false);
      OPT(nir, nir_opt_intrinsics);
      OPT(nir, nir_opt_idiv_const, 32);
      OPT(nir, nir_opt_algebraic);
      OPT(nir, nir_lower_constant_convert_alu_types);
      OPT(nir, nir_opt_constant_folding);

      if (lower_flrp != 0) {
         if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
            OPT(nir, nir_opt_constant_folding);
         /* Nothing should rematerialize any flrps */
         lower_flrp = 0;
      }

      OPT(nir, nir_opt_dead_cf);
      if (OPT(nir, nir_opt_loop)) {
         /* If nir_opt_loop makes progress, then we need to clean things up
          * if we want any hope of nir_opt_if or nir_opt_loop_unroll making
          * progress.
          */
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
      }
      OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      OPT(nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations != 0) {
         OPT(nir, nir_opt_loop_unroll);
      }
      OPT(nir, nir_opt_remove_phis);
      OPT(nir, nir_opt_gcm, false);
      OPT(nir, nir_opt_undef);
      OPT(nir, nir_lower_pack);
   } while (progress);

   OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

void
nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
{
   optimize_nir(nir, nak, false);
}

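/* Callback for nir_lower_bit_size().  Returns the bit size an instruction
 * should be widened to, or 0 to leave it alone.  8/16-bit ALU ops, subgroup
 * intrinsic sources, and non-boolean phis are widened to 32 bits, except for
 * 16-bit float ALU ops the hardware handles natively (most of them on SM70+,
 * fmin/fmax only on SM80+).
 */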
static unsigned
lower_bit_size_cb(const nir_instr *instr, void *data)
{
   const struct nak_compiler *nak = data;

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (nir_op_infos[alu->op].is_conversion)
         return 0;

      const unsigned bit_size = nir_alu_instr_is_comparison(alu)
                                ? alu->src[0].src.ssa->bit_size
                                : alu->def.bit_size;

      switch (alu->op) {
      case nir_op_bit_count:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
         /* These are handled specially because the destination is always
          * 32-bit and so the bit size of the instruction is given by the
          * source.
          */
         return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;

      case nir_op_fabs:
      case nir_op_fadd:
      case nir_op_fneg:
      case nir_op_feq:
      case nir_op_fge:
      case nir_op_flt:
      case nir_op_fneu:
      case nir_op_fmul:
      case nir_op_ffma:
      case nir_op_ffmaz:
      case nir_op_fsign:
      case nir_op_fsat:
      case nir_op_fceil:
      case nir_op_ffloor:
      case nir_op_fround_even:
      case nir_op_ftrunc:
         if (bit_size == 16 && nak->sm >= 70)
            return 0;
         break;

      case nir_op_fmax:
      case nir_op_fmin:
         if (bit_size == 16 && nak->sm >= 80)
            return 0;
         break;

      default:
         break;
      }

      if (bit_size >= 32)
         return 0;

      if (bit_size & (8 | 16))
         return 32;

      return 0;
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_vote_ieq:
         if (intrin->src[0].ssa->bit_size != 1 &&
             intrin->src[0].ssa->bit_size < 32)
            return 32;
         return 0;

      case nir_intrinsic_vote_feq:
      case nir_intrinsic_read_invocation:
      case nir_intrinsic_read_first_invocation:
      case nir_intrinsic_shuffle:
      case nir_intrinsic_shuffle_xor:
      case nir_intrinsic_shuffle_up:
      case nir_intrinsic_shuffle_down:
      case nir_intrinsic_quad_broadcast:
      case nir_intrinsic_quad_swap_horizontal:
      case nir_intrinsic_quad_swap_vertical:
      case nir_intrinsic_quad_swap_diagonal:
      case nir_intrinsic_reduce:
      case nir_intrinsic_inclusive_scan:
      case nir_intrinsic_exclusive_scan:
         if (intrin->src[0].ssa->bit_size < 32)
            return 32;
         return 0;

      default:
         return 0;
      }
   }

   case nir_instr_type_phi: {
      nir_phi_instr *phi = nir_instr_as_phi(instr);
      if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
         return 32;
      return 0;
   }

   default:
      return 0;
   }
}

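/* First NIR pass over a freshly translated shader.  This lowers textures,
 * images, and system values into forms the rest of NAK can consume and runs
 * an initial optimization loop while variable copies are still around.
 */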
void
nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
{
   UNUSED bool progress = false;

   nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      nir_lower_io_to_temporaries(nir, nir_shader_get_entrypoint(nir),
                                  true /* outputs */, false /* inputs */);
   }

   const nir_lower_tex_options tex_options = {
      .lower_txd_3d = true,
      .lower_txd_cube_map = true,
      .lower_txd_clamp = true,
      .lower_txd_shadow = true,
      .lower_txp = ~0,
      /* TODO: More lowering */
   };
   OPT(nir, nir_lower_tex, &tex_options);
   OPT(nir, nir_normalize_cubemap_coords);

   nir_lower_image_options image_options = {
      .lower_cube_size = true,
   };
   OPT(nir, nir_lower_image, &image_options);

   OPT(nir, nir_lower_global_vars_to_local);

   OPT(nir, nir_split_var_copies);
   OPT(nir, nir_split_struct_vars, nir_var_function_temp);

   /* Optimize but allow copies because we haven't lowered them yet */
   optimize_nir(nir, nak, true /* allow_copies */);

   OPT(nir, nir_lower_load_const_to_scalar);
   OPT(nir, nir_lower_var_copies);
   OPT(nir, nir_lower_system_values);
   OPT(nir, nir_lower_compute_system_values, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      OPT(nir, nir_lower_terminate_to_demote);
}

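/* Maps a gl_varying_slot to its address in the hardware attribute space.
 * Each slot is a vec4, i.e. 16 bytes, hence the 0x10 stride: for example,
 * VARYING_SLOT_VAR3 lands at NAK_ATTR_GENERIC_START + 0x30 and its y
 * component at NAK_ATTR_GENERIC_START + 0x34.
 */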
uint16_t
nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
{
   if (slot >= VARYING_SLOT_PATCH0) {
      return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
   } else if (slot >= VARYING_SLOT_VAR0) {
      return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
   } else {
      switch (slot) {
      case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
      case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERRIOR;
      case VARYING_SLOT_PRIMITIVE_ID: return NAK_ATTR_PRIMITIVE_ID;
      case VARYING_SLOT_LAYER: return NAK_ATTR_RT_ARRAY_INDEX;
      case VARYING_SLOT_VIEWPORT: return NAK_ATTR_VIEWPORT_INDEX;
      case VARYING_SLOT_PSIZ: return NAK_ATTR_POINT_SIZE;
      case VARYING_SLOT_POS: return NAK_ATTR_POSITION;
      case VARYING_SLOT_CLIP_DIST0: return NAK_ATTR_CLIP_CULL_DIST_0;
      case VARYING_SLOT_CLIP_DIST1: return NAK_ATTR_CLIP_CULL_DIST_4;
      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
         return nak->sm >= 86 ? NAK_ATTR_VPRS_TABLE_INDEX
                              : NAK_ATTR_VIEWPORT_INDEX;
      default: unreachable("Invalid varying slot");
      }
   }
}

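/* Maps a gl_frag_result (plus the dual-source blend index) to the base
 * address used for the fs_out_nv intrinsics emitted by
 * nak_nir_lower_fs_outputs() below.
 */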
static uint16_t
nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
{
   switch (slot) {
   case FRAG_RESULT_DEPTH:
      assert(blend_idx == 0);
      return NAK_FS_OUT_DEPTH;

   case FRAG_RESULT_STENCIL:
      unreachable("EXT_shader_stencil_export not supported");

   case FRAG_RESULT_COLOR:
      unreachable("Vulkan always uses explicit locations");

   case FRAG_RESULT_SAMPLE_MASK:
      assert(blend_idx == 0);
      return NAK_FS_OUT_SAMPLE_MASK;

   default:
      assert(blend_idx < 2);
      return NAK_FS_OUT_COLOR((slot - FRAG_RESULT_DATA0) + blend_idx);
   }
}

uint16_t
nak_sysval_attr_addr(const struct nak_compiler *nak, gl_system_value sysval)
{
   switch (sysval) {
   case SYSTEM_VALUE_PRIMITIVE_ID: return NAK_ATTR_PRIMITIVE_ID;
   case SYSTEM_VALUE_FRAG_COORD: return NAK_ATTR_POSITION;
   case SYSTEM_VALUE_POINT_COORD: return NAK_ATTR_POINT_SPRITE;
   case SYSTEM_VALUE_TESS_COORD: return NAK_ATTR_TESS_COORD;
   case SYSTEM_VALUE_INSTANCE_ID: return NAK_ATTR_INSTANCE_ID;
   case SYSTEM_VALUE_VERTEX_ID: return NAK_ATTR_VERTEX_ID;
   case SYSTEM_VALUE_FRONT_FACE: return NAK_ATTR_FRONT_FACE;
   case SYSTEM_VALUE_LAYER_ID: return NAK_ATTR_RT_ARRAY_INDEX;
   default: unreachable("Invalid system value");
   }
}

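/* Maps a gl_system_value to the NAK_SV_* system-register index consumed by
 * the load_sysval_nv intrinsic.
 */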
static uint8_t
nak_sysval_sysval_idx(gl_system_value sysval)
{
   switch (sysval) {
   case SYSTEM_VALUE_SUBGROUP_INVOCATION: return NAK_SV_LANE_ID;
   case SYSTEM_VALUE_VERTICES_IN: return NAK_SV_PRIM_TYPE;
   case SYSTEM_VALUE_INVOCATION_ID: return NAK_SV_INVOCATION_ID;
   case SYSTEM_VALUE_HELPER_INVOCATION: return NAK_SV_THREAD_KILL;
   case SYSTEM_VALUE_LOCAL_INVOCATION_ID: return NAK_SV_TID;
   case SYSTEM_VALUE_WORKGROUP_ID: return NAK_SV_CTAID;
   case SYSTEM_VALUE_SUBGROUP_EQ_MASK: return NAK_SV_LANEMASK_EQ;
   case SYSTEM_VALUE_SUBGROUP_LT_MASK: return NAK_SV_LANEMASK_LT;
   case SYSTEM_VALUE_SUBGROUP_LE_MASK: return NAK_SV_LANEMASK_LE;
   case SYSTEM_VALUE_SUBGROUP_GT_MASK: return NAK_SV_LANEMASK_GT;
   case SYSTEM_VALUE_SUBGROUP_GE_MASK: return NAK_SV_LANEMASK_GE;
   default: unreachable("Invalid system value");
   }
}

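/* Lowers system-value intrinsics to NAK-specific forms: attribute loads
 * (ald_nv) for values that live in the attribute space and raw
 * system-register reads (load_sysval_nv) for everything else, plus whatever
 * bit twiddling is needed to match the NIR definition of each value.
 */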
static bool
nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
                                  void *data)
{
   const struct nak_compiler *nak = data;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *val;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_vertex_id: {
      assert(b->shader->info.stage == MESA_SHADER_VERTEX ||
             b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
             b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
             b->shader->info.stage == MESA_SHADER_GEOMETRY);
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t addr = nak_sysval_attr_addr(nak, sysval);
      val = nir_ald_nv(b, 1, nir_imm_int(b, 0), nir_imm_int(b, 0),
                       .base = addr, .flags = 0,
                       .range_base = addr, .range = 4,
                       .access = ACCESS_CAN_REORDER);
      break;
   }

   case nir_intrinsic_load_patch_vertices_in: {
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_PRIM_TYPE,
                               .access = ACCESS_CAN_REORDER);
      val = nir_extract_u8(b, val, nir_imm_int(b, 1));
      break;
   }

   case nir_intrinsic_load_frag_shading_rate: {
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VARIABLE_RATE,
                               .access = ACCESS_CAN_REORDER);

      /* X is in bits 8..16 and Y is in bits 16..24.  However, we actually
       * want the log2 of X and Y and, since we only support 1, 2, and 4, a
       * right shift by 1 is log2.  So this gives us
       *
       *    x_log2 = (sv >> 9) & 3
       *    y_log2 = (sv >> 17) & 3
       *
       * However, we actually want y_log2 in bits 0..2 and x_log2 in bits
       * 2..4, so the shifts and masks below fold that repositioning in.
       */
      nir_def *x = nir_iand_imm(b, nir_ushr_imm(b, val, 7), 0xc);
      nir_def *y = nir_iand_imm(b, nir_ushr_imm(b, val, 17), 0x3);
      val = nir_ior(b, x, y);
      break;
   }

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_ge_mask: {
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t idx = nak_sysval_sysval_idx(sysval);
      val = nir_load_sysval_nv(b, 32, .base = idx,
                               .access = ACCESS_CAN_REORDER);

      /* Pad with 0 because all invocations above 31 are off */
      if (intrin->def.bit_size == 64) {
         val = nir_u2u64(b, val);
      } else {
         assert(intrin->def.bit_size == 32);
         val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
      }
      break;
   }

   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_workgroup_id: {
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t idx = nak_sysval_sysval_idx(sysval);
      nir_def *comps[3];
      assert(intrin->def.num_components <= 3);
      for (unsigned c = 0; c < intrin->def.num_components; c++) {
         comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
                                       .access = ACCESS_CAN_REORDER);
      }
      val = nir_vec(b, comps, intrin->def.num_components);
      break;
   }

   case nir_intrinsic_load_local_invocation_id: {
      nir_def *x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
                                      .access = ACCESS_CAN_REORDER);
      nir_def *y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
                                      .access = ACCESS_CAN_REORDER);
      nir_def *z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
                                      .access = ACCESS_CAN_REORDER);

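      /* When derivative_group is QUADS, nak_postprocess_nir() doubles
       * workgroup_size[0] and halves workgroup_size[1] so that each
       * API-level 2x2 quad occupies four consecutive HW invocations.  Undo
       * that remapping here: bit 1 of the HW X coordinate is really bit 0
       * of the API Y coordinate.
       */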
      if (b->shader->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_def *x_lo = nir_iand_imm(b, x, 0x1);
         nir_def *y_lo = nir_ushr_imm(b, nir_iand_imm(b, x, 0x2), 1);
         nir_def *x_hi = nir_ushr_imm(b, nir_iand_imm(b, x, ~0x3), 1);
         nir_def *y_hi = nir_ishl_imm(b, y, 1);

         x = nir_ior(b, x_lo, x_hi);
         y = nir_ior(b, y_lo, y_hi);
      }

      val = nir_vec3(b, x, y, z);
      break;
   }

   case nir_intrinsic_load_num_subgroups: {
      assert(!b->shader->info.workgroup_size_variable);
      uint16_t wg_size = b->shader->info.workgroup_size[0] *
                         b->shader->info.workgroup_size[1] *
                         b->shader->info.workgroup_size[2];
      val = nir_imm_int(b, DIV_ROUND_UP(wg_size, 32));
      break;
   }

   case nir_intrinsic_load_subgroup_id:
      if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
         val = nir_imm_int(b, 0);
      } else {
         assert(!b->shader->info.workgroup_size_variable);
         nir_def *tid_x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
                                             .access = ACCESS_CAN_REORDER);
         nir_def *tid_y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
                                             .access = ACCESS_CAN_REORDER);
         nir_def *tid_z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
                                             .access = ACCESS_CAN_REORDER);

         const uint16_t *wg_size = b->shader->info.workgroup_size;
         nir_def *tid =
            nir_iadd(b, tid_x,
                     nir_iadd(b, nir_imul_imm(b, tid_y, wg_size[0]),
                              nir_imul_imm(b, tid_z, wg_size[0] * wg_size[1])));

         val = nir_udiv_imm(b, tid, 32);
      }
      break;

   case nir_intrinsic_is_helper_invocation: {
      /* Unlike load_helper_invocation, this one isn't re-orderable */
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
      break;
   }

   case nir_intrinsic_shader_clock: {
      /* The CS2R opcode can load 64 bits worth of sysval data at a time but
       * it's not actually atomic.  In order to get correct shader clocks, we
       * need to do a loop where we do
       *
       *    CS2R SV_CLOCK_HI
       *    CS2R SV_CLOCK_LO
       *    CS2R SV_CLOCK_HI
       *    CS2R SV_CLOCK_LO
       *    CS2R SV_CLOCK_HI
       *    ...
       *
       * The moment two high values are the same, we take the low value
       * between them and that gives us our clock.
       *
       * In order to make sure we don't run into any weird races, we also
       * need to insert a barrier after every load to ensure that one load
       * completes before we kick off the next one.  That way, even if one
       * load happens to be faster than the other (they are variable
       * latency, after all), we're still guaranteed that the loads happen
       * in the order we want.
       */
      nir_variable *clock =
         nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);

      nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
      nir_ssa_bar_nv(b, clock_hi);

      nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);

      nir_push_loop(b);
      {
         nir_def *last_clock = nir_load_var(b, clock);

         nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
         nir_ssa_bar_nv(b, clock_lo);

         clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK + 1);
         nir_ssa_bar_nv(b, clock_hi);

         nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);

         nir_break_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
      }
      nir_pop_loop(b, NULL);

      val = nir_load_var(b, clock);
      if (intrin->def.bit_size == 64)
         val = nir_pack_64_2x32(b, val);
      break;
   }

   case nir_intrinsic_load_warps_per_sm_nv:
      val = nir_imm_int(b, nak->warps_per_sm);
      break;

   case nir_intrinsic_load_sm_count_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
      val = nir_ubitfield_extract_imm(b, val, 20, 9);
      break;

   case nir_intrinsic_load_warp_id_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
      val = nir_ubitfield_extract_imm(b, val, 8, 7);
      break;

   case nir_intrinsic_load_sm_id_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
      val = nir_ubitfield_extract_imm(b, val, 20, 9);
      break;

   default:
      return false;
   }

   if (intrin->def.bit_size == 1)
      val = nir_i2b(b, val);

   nir_def_rewrite_uses(&intrin->def, val);

   return true;
}

static bool
nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
                                     nir_metadata_none,
                                     (void *)nak);
}

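/* Converts NIR's transform feedback info into NAK's table form where
 * attr_index[buffer][dword] names the attribute dword written to each 32-bit
 * slot of the XFB buffer and 0xff means "skip".
 */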
struct nak_xfb_info
nak_xfb_from_nir(const struct nak_compiler *nak,
                 const struct nir_xfb_info *nir_xfb)
{
   if (nir_xfb == NULL)
      return (struct nak_xfb_info) { };

   struct nak_xfb_info nak_xfb = { };

   u_foreach_bit(b, nir_xfb->buffers_written) {
      nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
      nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
   }
   memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */

   for (unsigned o = 0; o < nir_xfb->output_count; o++) {
      const nir_xfb_output_info *out = &nir_xfb->outputs[o];
      const uint8_t b = out->buffer;
      assert(nir_xfb->buffers_written & BITFIELD_BIT(b));

      const uint16_t attr_addr = nak_varying_attr_addr(nak, out->location);
      assert(attr_addr % 4 == 0);
      const uint16_t attr_idx = attr_addr / 4;

      assert(out->offset % 4 == 0);
      uint8_t out_idx = out->offset / 4;

      u_foreach_bit(c, out->component_mask)
         nak_xfb.attr_index[b][out_idx++] = attr_idx + c;

      nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
   }

   return nak_xfb;
}

static bool
lower_fs_output_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *_data)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   uint16_t addr = nak_fs_out_addr(sem.location, sem.dual_source_blend_index) +
                   nir_src_as_uint(intrin->src[1]) * 16 +
                   nir_intrinsic_component(intrin) * 4;

   nir_def *data = intrin->src[0].ssa;

   /* The fs_out_nv intrinsic is always scalar */
   u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
      if (nir_scalar_is_undef(nir_scalar_resolved(data, c)))
         continue;

      nir_fs_out_nv(b, nir_channel(b, data, c), .base = addr + c * 4);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
nak_nir_lower_fs_outputs(nir_shader *nir)
{
   if (nir->info.outputs_written == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, lower_fs_output_intrin,
                                              nir_metadata_control_flow,
                                              NULL);

   if (progress) {
      /* We need a copy_fs_outputs_nv intrinsic so NAK knows where to place
       * the final copy.  This needs to be in the last block, after all
       * store_output intrinsics.
       */
      nir_function_impl *impl = nir_shader_get_entrypoint(nir);
      nir_builder b = nir_builder_at(nir_after_impl(impl));
      nir_copy_fs_outputs_nv(&b);
   }

   return progress;
}

static bool
nak_nir_remove_barrier_intrin(nir_builder *b, nir_intrinsic_instr *barrier,
                              UNUSED void *_data)
{
   if (barrier->intrinsic != nir_intrinsic_barrier)
      return false;

   mesa_scope exec_scope = nir_intrinsic_execution_scope(barrier);
   assert(exec_scope <= SCOPE_WORKGROUP &&
          "Control barrier with scope > WORKGROUP");

   if (exec_scope == SCOPE_WORKGROUP &&
       nak_nir_workgroup_has_one_subgroup(b->shader))
      exec_scope = SCOPE_SUBGROUP;

   /* Because we're guaranteeing maximal convergence via warp barriers,
    * subgroup barriers do nothing.
    */
   if (exec_scope <= SCOPE_SUBGROUP)
      exec_scope = SCOPE_NONE;

   const nir_variable_mode mem_modes = nir_intrinsic_memory_modes(barrier);
   if (exec_scope == SCOPE_NONE && mem_modes == 0) {
      nir_instr_remove(&barrier->instr);
      return true;
   }

   /* In this case, we're leaving the barrier there */
   b->shader->info.uses_control_barrier = true;

   bool progress = false;
   if (exec_scope != nir_intrinsic_execution_scope(barrier)) {
      nir_intrinsic_set_execution_scope(barrier, exec_scope);
      progress = true;
   }

   return progress;
}

static bool
nak_nir_remove_barriers(nir_shader *nir)
{
   /* We'll set this back to true if we leave any barriers in place */
   nir->info.uses_control_barrier = false;

   return nir_shader_intrinsics_pass(nir, nak_nir_remove_barrier_intrin,
                                     nir_metadata_control_flow,
                                     NULL);
}

static bool
nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                     unsigned bit_size, unsigned num_components,
                     int64_t hole_size, nir_intrinsic_instr *low,
                     nir_intrinsic_instr *high, void *cb_data)
{
   /*
    * Since we legalize these later with nir_lower_mem_access_bit_sizes,
    * we can optimistically combine anything that might be profitable
    */
   assert(util_is_power_of_two_nonzero(align_mul));

   if (hole_size > 0)
      return false;

   unsigned max_bytes = 128u / 8u;
   if (low->intrinsic == nir_intrinsic_ldc_nv ||
       low->intrinsic == nir_intrinsic_ldcx_nv)
      max_bytes = 64u / 8u;

   align_mul = MIN2(align_mul, max_bytes);
   align_offset = align_offset % align_mul;
   return align_offset + num_components * (bit_size / 8) <= align_mul;
}

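/* Callback for nir_lower_mem_access_bit_sizes().  Given the size and
 * alignment of a memory access, picks a legal bit_size/num_components pair:
 * accesses get chopped into power-of-two chunks of at most 16B (8B for cbuf
 * loads), loads may over-fetch, and under-aligned cbuf loads are either
 * bumped up to an aligned 32-bit load or split into bytes.
 */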
static nir_mem_access_size_align
nak_mem_access_size_align(nir_intrinsic_op intrin,
                          uint8_t bytes, uint8_t bit_size,
                          uint32_t align_mul, uint32_t align_offset,
                          bool offset_is_const, enum gl_access_qualifier access,
                          const void *cb_data)
{
   const uint32_t align = nir_combined_align(align_mul, align_offset);
   assert(util_is_power_of_two_nonzero(align));

   unsigned bytes_pow2;
   if (nir_intrinsic_infos[intrin].has_dest) {
      /* Reads can over-fetch a bit if the alignment is okay. */
      bytes_pow2 = util_next_power_of_two(bytes);
   } else {
      bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
   }

   unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
   assert(util_is_power_of_two_nonzero(chunk_bytes));
   if (intrin == nir_intrinsic_ldc_nv ||
       intrin == nir_intrinsic_ldcx_nv)
      chunk_bytes = MIN2(chunk_bytes, 8);

   if ((intrin == nir_intrinsic_ldc_nv ||
        intrin == nir_intrinsic_ldcx_nv) && align < 4) {
      /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
       * In particular, this applies to ldc.u16 which means we either have to
       * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
       * the 16bit value out.  Fortunately, nir_lower_mem_access_bit_sizes()
       * can handle over-alignment for reads.
       */
      if (align == 2 || offset_is_const) {
         return (nir_mem_access_size_align) {
            .bit_size = 32,
            .num_components = 1,
            .align = 4,
            .shift = nir_mem_access_shift_method_scalar,
         };
      } else {
         assert(align == 1);
         return (nir_mem_access_size_align) {
            .bit_size = 8,
            .num_components = 1,
            .align = 1,
            .shift = nir_mem_access_shift_method_scalar,
         };
      }
   } else if (chunk_bytes < 4) {
      return (nir_mem_access_size_align) {
         .bit_size = chunk_bytes * 8,
         .num_components = 1,
         .align = chunk_bytes,
         .shift = nir_mem_access_shift_method_scalar,
      };
   } else {
      return (nir_mem_access_size_align) {
         .bit_size = 32,
         .num_components = chunk_bytes / 4,
         .align = chunk_bytes,
         .shift = nir_mem_access_shift_method_scalar,
      };
   }
}

static bool
nir_shader_has_local_variables(const nir_shader *nir)
{
   nir_foreach_function(func, nir) {
      if (func->impl && !exec_list_is_empty(&func->impl->locals))
         return true;
   }

   return false;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_vec4_slots(type, false, bindless);
}

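/* Main NAK lowering pipeline, run once the API driver is done with its own
 * lowering.  Subgroup operations, function-temp variables, memory access
 * sizes, small bit sizes, stage I/O, and system values are all lowered here
 * to the forms the backend expects, and on SM70+ the control flow is lowered
 * by nak_nir_lower_cf() at the very end.
 */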
void
nak_postprocess_nir(nir_shader *nir,
                    const struct nak_compiler *nak,
                    nir_variable_mode robust2_modes,
                    const struct nak_fs_key *fs_key)
{
   UNUSED bool progress = false;

   nak_optimize_nir(nir, nak);

   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = NAK_SUBGROUP_SIZE,
      .ballot_bit_size = 32,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_vote_eq = true,
      .lower_first_invocation_to_ballot = true,
      .lower_read_first_invocation = true,
      .lower_elect = true,
      .lower_quad_vote = true,
      .lower_inverse_ballot = true,
      .lower_rotate_to_shuffle = true
   };
   OPT(nir, nir_lower_subgroups, &subgroups_options);
   OPT(nir, nak_nir_lower_scan_reduce);

   if (nir_shader_has_local_variables(nir)) {
      OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
          glsl_get_natural_size_align_bytes);
      OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
          nir_address_format_32bit_offset);
      nak_optimize_nir(nir, nak);
   }

   OPT(nir, nir_opt_shrink_vectors, true);

   nir_load_store_vectorize_options vectorize_opts = {};
   vectorize_opts.modes = nir_var_mem_global |
                          nir_var_mem_ssbo |
                          nir_var_mem_shared |
                          nir_var_shader_temp;
   vectorize_opts.callback = nak_mem_vectorize_cb;
   vectorize_opts.robust_modes = robust2_modes;
   OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);

   nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
      .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
      .callback = nak_mem_access_size_align,
   };
   OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
   OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);

   OPT(nir, nir_opt_combine_barriers, NULL, NULL);

   nak_optimize_nir(nir, nak);

   OPT(nir, nak_nir_lower_tex, nak);
   OPT(nir, nir_lower_idiv, NULL);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);

   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      OPT(nir, nir_lower_tess_coord_z,
          nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
   }

   /* We need to do this before nak_nir_lower_system_values() because the
    * lowering of nir_intrinsic_load_subgroup_id there relies on
    * nir->info.workgroup_size being the actual HW workgroup size.
    */
   if (gl_shader_stage_uses_workgroup(nir->info.stage) &&
       nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
      assert(nir->info.workgroup_size[0] % 2 == 0);
      assert(nir->info.workgroup_size[1] % 2 == 0);
      nir->info.workgroup_size[0] *= 2;
      nir->info.workgroup_size[1] /= 2;
   }

   OPT(nir, nak_nir_lower_system_values, nak);

   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_GEOMETRY:
      OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
          type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
      OPT(nir, nir_opt_constant_folding);
      OPT(nir, nak_nir_lower_vtg_io, nak);
      if (nir->info.stage == MESA_SHADER_GEOMETRY)
         OPT(nir, nak_nir_lower_gs_intrinsics);
      break;

   case MESA_SHADER_FRAGMENT:
      OPT(nir, nir_lower_indirect_derefs,
          nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
      OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
          type_size_vec4, nir_lower_io_lower_64bit_to_32_new |
                          nir_lower_io_use_interpolated_input_intrinsics);
      OPT(nir, nir_opt_constant_folding);
      OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
      OPT(nir, nak_nir_lower_fs_outputs);
      break;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      break;

   default:
      unreachable("Unsupported shader stage");
   }

   OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
   OPT(nir, nir_lower_int64);

   nak_optimize_nir(nir, nak);

   do {
      progress = false;
      OPT(nir, nir_opt_algebraic_late);
      OPT(nir, nak_nir_lower_algebraic_late, nak);

      /* If we're lowering fp64 sat but not min/max, the sat lowering may have
       * been undone by nir_opt_algebraic.  Lower sat again just to be sure.
       */
      if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
          !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
         OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);

      if (progress) {
         OPT(nir, nir_opt_constant_folding);
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
         OPT(nir, nir_opt_cse);
      }
   } while (progress);

   if (nak->sm < 70)
      OPT(nir, nak_nir_split_64bit_conversions);

   bool lcssa_progress = nir_convert_to_lcssa(nir, false, false);
   nir_divergence_analysis(nir);

   if (nak->sm >= 75) {
      if (lcssa_progress) {
         OPT(nir, nak_nir_mark_lcssa_invariants);
      }
      if (OPT(nir, nak_nir_lower_non_uniform_ldcx)) {
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
         nir_divergence_analysis(nir);
      }
   }

   OPT(nir, nak_nir_remove_barriers);

   if (nak->sm >= 70) {
      if (nak_should_print_nir()) {
         fprintf(stderr, "Structured NIR for %s shader:\n",
                 _mesa_shader_stage_to_string(nir->info.stage));
         nir_print_shader(nir, stderr);
      }
      OPT(nir, nak_nir_lower_cf);
   }

   /* Re-index blocks and compact SSA defs because we'll use them to index
    * arrays
    */
   nir_foreach_function(func, nir) {
      if (func->impl) {
         nir_index_blocks(func->impl);
         nir_index_ssa_defs(func->impl);
      }
   }

   if (nak_should_print_nir()) {
      fprintf(stderr, "NIR for %s shader:\n",
              _mesa_shader_stage_to_string(nir->info.stage));
      nir_print_shader(nir, stderr);
   }
}

static bool
scalar_is_imm_int(nir_scalar x, unsigned bits)
{
   if (!nir_scalar_is_const(x))
      return false;

   int64_t imm = nir_scalar_as_int(x);
   return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
}

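/* Splits an I/O address into a non-constant base scalar plus a constant
 * offset that fits in a signed imm_bits-bit immediate.  For example, an
 * address computed as iadd(x, 12) with imm_bits >= 5 yields base = x and
 * offset = 12.  If no such split exists, the whole address is returned as
 * the base with a zero offset.
 */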
struct nak_io_addr_offset
nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
{
   nir_scalar addr_s = {
      .def = addr,
      .comp = 0,
   };
   if (scalar_is_imm_int(addr_s, imm_bits)) {
      /* The whole address is one small immediate, so return it as the
       * offset and leave the base unset.
       */
      return (struct nak_io_addr_offset) {
         .offset = nir_scalar_as_int(addr_s),
      };
   }

   addr_s = nir_scalar_chase_movs(addr_s);
   if (!nir_scalar_is_alu(addr_s) ||
       nir_scalar_alu_op(addr_s) != nir_op_iadd) {
      return (struct nak_io_addr_offset) {
         .base = addr_s,
      };
   }

   for (unsigned i = 0; i < 2; i++) {
      nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
      off_s = nir_scalar_chase_movs(off_s);
      if (scalar_is_imm_int(off_s, imm_bits)) {
         return (struct nak_io_addr_offset) {
            .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
            .offset = nir_scalar_as_int(off_s),
         };
      }
   }

   return (struct nak_io_addr_offset) {
      .base = addr_s,
   };
}
