1 /*
2 * Copyright © 2022 Collabora, Ltd.
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "nak_private.h"
7 #include "nir_builder.h"
8 #include "nir_xfb_info.h"
9
10 #include "util/u_math.h"
11
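/* OPT() runs a NIR pass, folds its result into the local `progress` flag,
 * and evaluates to whether that particular pass made progress, so callers
 * can chain dependent cleanup passes.  OPT_V() wraps NIR_PASS_V for passes
 * whose progress we don't care about.
 */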
12 #define OPT(nir, pass, ...) ({ \
13 bool this_progress = false; \
14 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
15 if (this_progress) \
16 progress = true; \
17 this_progress; \
18 })
19
20 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
21
22 bool
23 nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
24 {
25 switch (nir->info.stage) {
26 case MESA_SHADER_VERTEX:
27 case MESA_SHADER_TESS_EVAL:
28 case MESA_SHADER_GEOMETRY:
29 case MESA_SHADER_FRAGMENT:
30 unreachable("Shader stage does not have workgroups");
31 break;
32
33 case MESA_SHADER_TESS_CTRL:
34 /* Tessellation only ever has one subgroup per workgroup. The Vulkan
35 * limit on the number of tessellation invocations is 32 to allow for
36 * this.
37 */
38 return true;
39
40 case MESA_SHADER_COMPUTE:
41 case MESA_SHADER_KERNEL: {
42 if (nir->info.workgroup_size_variable)
43 return false;
44
45 uint16_t wg_sz = nir->info.workgroup_size[0] *
46 nir->info.workgroup_size[1] *
47 nir->info.workgroup_size[2];
48
49 return wg_sz <= 32;
50 }
51
52 default:
53 unreachable("Unknown shader stage");
54 }
55 }
56
57 static void
58 optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
59 {
60 bool progress;
61
62 unsigned lower_flrp =
63 (nir->options->lower_flrp16 ? 16 : 0) |
64 (nir->options->lower_flrp32 ? 32 : 0) |
65 (nir->options->lower_flrp64 ? 64 : 0);
66
67 do {
68 progress = false;
69
70 /* This pass is causing problems with types used by OpenCL:
71 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
72 *
73 * Running with it disabled made no difference in the resulting assembly
74 * code.
75 */
76 if (nir->info.stage != MESA_SHADER_KERNEL)
77 OPT(nir, nir_split_array_vars, nir_var_function_temp);
78
79 OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
80 OPT(nir, nir_opt_deref);
81 if (OPT(nir, nir_opt_memcpy))
82 OPT(nir, nir_split_var_copies);
83
84 OPT(nir, nir_lower_vars_to_ssa);
85
86 if (allow_copies) {
87 /* Only run this pass in the first call to optimize_nir. Later
88 * calls assume that we've lowered away any copy_deref instructions
89 * and we don't want to introduce any more.
90 */
91 OPT(nir, nir_opt_find_array_copies);
92 }
93 OPT(nir, nir_opt_copy_prop_vars);
94 OPT(nir, nir_opt_dead_write_vars);
95 OPT(nir, nir_opt_combine_stores, nir_var_all);
96
97 OPT(nir, nir_lower_alu_to_scalar, NULL, NULL);
98 OPT(nir, nir_lower_phis_to_scalar, false);
99 OPT(nir, nir_lower_frexp);
100 OPT(nir, nir_copy_prop);
101 OPT(nir, nir_opt_dce);
102 OPT(nir, nir_opt_cse);
103
104 OPT(nir, nir_opt_peephole_select, 0, false, false);
105 OPT(nir, nir_opt_intrinsics);
106 OPT(nir, nir_opt_idiv_const, 32);
107 OPT(nir, nir_opt_algebraic);
108 OPT(nir, nir_lower_constant_convert_alu_types);
109 OPT(nir, nir_opt_constant_folding);
110
111 if (lower_flrp != 0) {
112 if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
113 OPT(nir, nir_opt_constant_folding);
114 /* Nothing should rematerialize any flrps */
115 lower_flrp = 0;
116 }
117
118 OPT(nir, nir_opt_dead_cf);
119 if (OPT(nir, nir_opt_loop)) {
120 /* If nir_opt_loop makes progress, then we need to clean things up
121 * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make
122 * progress.
123 */
124 OPT(nir, nir_copy_prop);
125 OPT(nir, nir_opt_dce);
126 }
127 OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
128 OPT(nir, nir_opt_conditional_discard);
129 if (nir->options->max_unroll_iterations != 0) {
130 OPT(nir, nir_opt_loop_unroll);
131 }
132 OPT(nir, nir_opt_remove_phis);
133 OPT(nir, nir_opt_gcm, false);
134 OPT(nir, nir_opt_undef);
135 OPT(nir, nir_lower_pack);
136 } while (progress);
137
138 OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
139 }
140
141 void
142 nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
143 {
144 optimize_nir(nir, nak, false);
145 }
146
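/* Callback for nir_lower_bit_size(): returns the bit size an instruction
 * should be lowered to, or 0 to leave it alone.  For this backend, 8-bit
 * and 16-bit ALU ops (other than conversions), phis, and subgroup
 * intrinsics are widened to 32 bits.
 */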
147 static unsigned
148 lower_bit_size_cb(const nir_instr *instr, void *_data)
149 {
150 switch (instr->type) {
151 case nir_instr_type_alu: {
152 nir_alu_instr *alu = nir_instr_as_alu(instr);
153 if (nir_op_infos[alu->op].is_conversion)
154 return 0;
155
156 switch (alu->op) {
157 case nir_op_bit_count:
158 case nir_op_ufind_msb:
159 case nir_op_ifind_msb:
160 case nir_op_find_lsb:
161 /* These are handled specially because the destination is always
162 * 32-bit and so the bit size of the instruction is given by the
163 * source.
164 */
165 return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;
166 default:
167 break;
168 }
169
170 const unsigned bit_size = nir_alu_instr_is_comparison(alu)
171 ? alu->src[0].src.ssa->bit_size
172 : alu->def.bit_size;
173 if (bit_size >= 32)
174 return 0;
175
176 /* TODO: Some hardware has native 16-bit support */
177 if (bit_size & (8 | 16))
178 return 32;
179
180 return 0;
181 }
182
183 case nir_instr_type_intrinsic: {
184 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
185 switch (intrin->intrinsic) {
186 case nir_intrinsic_vote_ieq:
187 if (intrin->src[0].ssa->bit_size != 1 &&
188 intrin->src[0].ssa->bit_size < 32)
189 return 32;
190 return 0;
191
192 case nir_intrinsic_vote_feq:
193 case nir_intrinsic_read_invocation:
194 case nir_intrinsic_read_first_invocation:
195 case nir_intrinsic_shuffle:
196 case nir_intrinsic_shuffle_xor:
197 case nir_intrinsic_shuffle_up:
198 case nir_intrinsic_shuffle_down:
199 case nir_intrinsic_quad_broadcast:
200 case nir_intrinsic_quad_swap_horizontal:
201 case nir_intrinsic_quad_swap_vertical:
202 case nir_intrinsic_quad_swap_diagonal:
203 case nir_intrinsic_reduce:
204 case nir_intrinsic_inclusive_scan:
205 case nir_intrinsic_exclusive_scan:
206 if (intrin->src[0].ssa->bit_size < 32)
207 return 32;
208 return 0;
209
210 default:
211 return 0;
212 }
213 }
214
215 case nir_instr_type_phi: {
216 nir_phi_instr *phi = nir_instr_as_phi(instr);
217 if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
218 return 32;
219 return 0;
220 }
221
222 default:
223 return 0;
224 }
225 }
226
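/* Builds ceil(n / d) as (n + (d - 1)) / d, e.g. n = 33, d = 32 gives 2.
 * Assumes d != 0 and that the addition does not wrap for the values used
 * here (workgroup and subgroup sizes are small).
 */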
227 static nir_def *
228 nir_udiv_round_up(nir_builder *b, nir_def *n, nir_def *d)
229 {
230 return nir_udiv(b, nir_iadd(b, n, nir_iadd_imm(b, d, -1)), d);
231 }
232
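/* Lowers load_num_subgroups and load_subgroup_id to arithmetic on the
 * workgroup size, local invocation index, and subgroup size.  For example,
 * a 128-invocation workgroup with 32-wide subgroups yields
 * num_subgroups = 4 and subgroup_id = local_invocation_index / 32.
 */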
233 static bool
234 nak_nir_lower_subgroup_id_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
235 void *data)
236 {
237 switch (intrin->intrinsic) {
238 case nir_intrinsic_load_num_subgroups: {
239 b->cursor = nir_instr_remove(&intrin->instr);
240
241 nir_def *num_subgroups;
242 if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
243 num_subgroups = nir_imm_int(b, 1);
244 } else {
245 assert(b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_NONE);
246
247 nir_def *workgroup_size = nir_load_workgroup_size(b);
248 workgroup_size =
249 nir_imul(b, nir_imul(b, nir_channel(b, workgroup_size, 0),
250 nir_channel(b, workgroup_size, 1)),
251 nir_channel(b, workgroup_size, 2));
252 nir_def *subgroup_size = nir_load_subgroup_size(b);
253 num_subgroups = nir_udiv_round_up(b, workgroup_size, subgroup_size);
254 }
255 nir_def_rewrite_uses(&intrin->def, num_subgroups);
256
257 return true;
258 }
259 case nir_intrinsic_load_subgroup_id: {
260 b->cursor = nir_instr_remove(&intrin->instr);
261
262 nir_def *subgroup_id;
263 if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
264 subgroup_id = nir_imm_int(b, 0);
265 } else {
266 assert(b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_NONE);
267
268 nir_def *invocation_index = nir_load_local_invocation_index(b);
269 nir_def *subgroup_size = nir_load_subgroup_size(b);
270 subgroup_id = nir_udiv(b, invocation_index, subgroup_size);
271 }
272 nir_def_rewrite_uses(&intrin->def, subgroup_id);
273
274 return true;
275 }
276 default:
277 return false;
278 }
279 }
280
281 static bool
282 nak_nir_lower_subgroup_id(nir_shader *nir)
283 {
284 return nir_shader_intrinsics_pass(nir, nak_nir_lower_subgroup_id_intrin,
285 nir_metadata_block_index |
286 nir_metadata_dominance,
287 NULL);
288 }
289
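/* Stage-independent lowering that runs once on fresh NIR, before the
 * per-stage lowering in nak_postprocess_nir(): texture, image, and
 * system-value lowering plus a first optimization loop that still allows
 * copy_deref instructions.
 */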
290 void
291 nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
292 {
293 UNUSED bool progress = false;
294
295 nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");
296
297 const nir_lower_tex_options tex_options = {
298 .lower_txd_3d = true,
299 .lower_txd_cube_map = true,
300 .lower_txd_clamp = true,
301 .lower_txd_shadow = true,
302 .lower_txp = ~0,
303 /* TODO: More lowering */
304 };
305 OPT(nir, nir_lower_tex, &tex_options);
306 OPT(nir, nir_normalize_cubemap_coords);
307
308 nir_lower_image_options image_options = {
309 .lower_cube_size = true,
310 };
311 OPT(nir, nir_lower_image, &image_options);
312
313 OPT(nir, nir_lower_global_vars_to_local);
314
315 OPT(nir, nir_split_var_copies);
316 OPT(nir, nir_split_struct_vars, nir_var_function_temp);
317
318 /* Optimize but allow copies because we haven't lowered them yet */
319 optimize_nir(nir, nak, true /* allow_copies */);
320
321 OPT(nir, nir_lower_load_const_to_scalar);
322 OPT(nir, nir_lower_var_copies);
323 OPT(nir, nir_lower_system_values);
324 OPT(nir, nak_nir_lower_subgroup_id);
325 OPT(nir, nir_lower_compute_system_values, NULL);
326 }
327
328 static uint16_t
329 nak_attribute_attr_addr(gl_vert_attrib attrib)
330 {
331 assert(attrib >= VERT_ATTRIB_GENERIC0);
332 return NAK_ATTR_GENERIC_START + (attrib - VERT_ATTRIB_GENERIC0) * 0x10;
333 }
334
335 static int
336 type_size_vec4_bytes(const struct glsl_type *type, bool bindless)
337 {
338 return glsl_count_vec4_slots(type, false, bindless) * 16;
339 }
340
341 static bool
342 nak_nir_lower_vs_inputs(nir_shader *nir)
343 {
344 bool progress = false;
345
346 nir_foreach_shader_in_variable(var, nir) {
347 var->data.driver_location =
348 nak_attribute_attr_addr(var->data.location);
349 }
350
351 progress |= OPT(nir, nir_lower_io, nir_var_shader_in, type_size_vec4_bytes,
352 nir_lower_io_lower_64bit_to_32);
353
354 return progress;
355 }
356
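/* Maps a gl_varying_slot to its byte address in the hardware attribute
 * space.  Each vec4 slot occupies 0x10 bytes, so for example
 * VARYING_SLOT_VAR3 maps to NAK_ATTR_GENERIC_START + 0x30.
 */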
357 static uint16_t
358 nak_varying_attr_addr(gl_varying_slot slot)
359 {
360 if (slot >= VARYING_SLOT_PATCH0) {
361 return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
362 } else if (slot >= VARYING_SLOT_VAR0) {
363 return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
364 } else {
365 switch (slot) {
366 case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
367 case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERRIOR;
368 case VARYING_SLOT_PRIMITIVE_ID: return NAK_ATTR_PRIMITIVE_ID;
369 case VARYING_SLOT_LAYER: return NAK_ATTR_RT_ARRAY_INDEX;
370 case VARYING_SLOT_VIEWPORT: return NAK_ATTR_VIEWPORT_INDEX;
371 case VARYING_SLOT_PSIZ: return NAK_ATTR_POINT_SIZE;
372 case VARYING_SLOT_POS: return NAK_ATTR_POSITION;
373 case VARYING_SLOT_CLIP_DIST0: return NAK_ATTR_CLIP_CULL_DIST_0;
374 case VARYING_SLOT_CLIP_DIST1: return NAK_ATTR_CLIP_CULL_DIST_4;
375 default: unreachable("Invalid varying slot");
376 }
377 }
378 }
379
380 static uint16_t
381 nak_sysval_attr_addr(gl_system_value sysval)
382 {
383 switch (sysval) {
384 case SYSTEM_VALUE_PRIMITIVE_ID: return NAK_ATTR_PRIMITIVE_ID;
385 case SYSTEM_VALUE_FRAG_COORD: return NAK_ATTR_POSITION;
386 case SYSTEM_VALUE_POINT_COORD: return NAK_ATTR_POINT_SPRITE;
387 case SYSTEM_VALUE_TESS_COORD: return NAK_ATTR_TESS_COORD;
388 case SYSTEM_VALUE_INSTANCE_ID: return NAK_ATTR_INSTANCE_ID;
389 case SYSTEM_VALUE_VERTEX_ID: return NAK_ATTR_VERTEX_ID;
390 case SYSTEM_VALUE_FRONT_FACE: return NAK_ATTR_FRONT_FACE;
391 default: unreachable("Invalid system value");
392 }
393 }
394
395 static uint8_t
396 nak_sysval_sysval_idx(gl_system_value sysval)
397 {
398 switch (sysval) {
399 case SYSTEM_VALUE_SUBGROUP_INVOCATION: return NAK_SV_LANE_ID;
400 case SYSTEM_VALUE_VERTICES_IN: return NAK_SV_VERTEX_COUNT;
401 case SYSTEM_VALUE_INVOCATION_ID: return NAK_SV_INVOCATION_ID;
402 case SYSTEM_VALUE_HELPER_INVOCATION: return NAK_SV_THREAD_KILL;
403 case SYSTEM_VALUE_LOCAL_INVOCATION_ID: return NAK_SV_TID;
404 case SYSTEM_VALUE_WORKGROUP_ID: return NAK_SV_CTAID;
405 case SYSTEM_VALUE_SUBGROUP_EQ_MASK: return NAK_SV_LANEMASK_EQ;
406 case SYSTEM_VALUE_SUBGROUP_LT_MASK: return NAK_SV_LANEMASK_LT;
407 case SYSTEM_VALUE_SUBGROUP_LE_MASK: return NAK_SV_LANEMASK_LE;
408 case SYSTEM_VALUE_SUBGROUP_GT_MASK: return NAK_SV_LANEMASK_GT;
409 case SYSTEM_VALUE_SUBGROUP_GE_MASK: return NAK_SV_LANEMASK_GE;
410 default: unreachable("Invalid system value");
411 }
412 }
413
414 static bool
415 nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
416 void *data)
417 {
418 const struct nak_compiler *nak = data;
419
420 b->cursor = nir_before_instr(&intrin->instr);
421
422 nir_def *val;
423 switch (intrin->intrinsic) {
424 case nir_intrinsic_load_layer_id: {
425 const uint32_t addr = nak_varying_attr_addr(VARYING_SLOT_LAYER);
426 val = nir_load_input(b, intrin->def.num_components, 32,
427 nir_imm_int(b, 0), .base = addr,
428 .dest_type = nir_type_int32);
429 break;
430 }
431
432 case nir_intrinsic_load_primitive_id: {
433 assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
434 b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
435 b->shader->info.stage == MESA_SHADER_GEOMETRY);
436 val = nir_load_per_vertex_input(b, 1, 32, nir_imm_int(b, 0),
437 nir_imm_int(b, 0),
438 .base = NAK_ATTR_PRIMITIVE_ID,
439 .dest_type = nir_type_int32);
440 break;
441 }
442
443 case nir_intrinsic_load_front_face:
444 case nir_intrinsic_load_instance_id:
445 case nir_intrinsic_load_vertex_id: {
446 const gl_system_value sysval =
447 nir_system_value_from_intrinsic(intrin->intrinsic);
448 const uint32_t addr = nak_sysval_attr_addr(sysval);
449 val = nir_load_input(b, intrin->def.num_components, 32,
450 nir_imm_int(b, 0), .base = addr,
451 .dest_type = nir_type_int32);
452 break;
453 }
454
455 case nir_intrinsic_load_patch_vertices_in: {
456 val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VERTEX_COUNT,
457 .access = ACCESS_CAN_REORDER);
458 val = nir_extract_u8(b, val, nir_imm_int(b, 1));
459 break;
460 }
461
462 case nir_intrinsic_load_subgroup_eq_mask:
463 case nir_intrinsic_load_subgroup_lt_mask:
464 case nir_intrinsic_load_subgroup_le_mask:
465 case nir_intrinsic_load_subgroup_gt_mask:
466 case nir_intrinsic_load_subgroup_ge_mask: {
467 const gl_system_value sysval =
468 nir_system_value_from_intrinsic(intrin->intrinsic);
469 const uint32_t idx = nak_sysval_sysval_idx(sysval);
470 val = nir_load_sysval_nv(b, 32, .base = idx,
471 .access = ACCESS_CAN_REORDER);
472
473 /* Pad with 0 because all invocations above 31 are off */
474 if (intrin->def.bit_size == 64) {
475 val = nir_u2u64(b, val);
476 } else {
477 assert(intrin->def.bit_size == 32);
478 val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
479 }
480 break;
481 }
482
483 case nir_intrinsic_load_subgroup_invocation:
484 case nir_intrinsic_load_helper_invocation:
485 case nir_intrinsic_load_invocation_id:
486 case nir_intrinsic_load_local_invocation_id:
487 case nir_intrinsic_load_workgroup_id:
488 case nir_intrinsic_load_workgroup_id_zero_base: {
489 const gl_system_value sysval =
490 intrin->intrinsic == nir_intrinsic_load_workgroup_id_zero_base ?
491 SYSTEM_VALUE_WORKGROUP_ID :
492 nir_system_value_from_intrinsic(intrin->intrinsic);
493 const uint32_t idx = nak_sysval_sysval_idx(sysval);
494 nir_def *comps[3];
495 assert(intrin->def.num_components <= 3);
496 for (unsigned c = 0; c < intrin->def.num_components; c++) {
497 comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
498 .access = ACCESS_CAN_REORDER);
499 }
500 val = nir_vec(b, comps, intrin->def.num_components);
501 break;
502 }
503
504 case nir_intrinsic_is_helper_invocation: {
505 /* Unlike load_helper_invocation, this one isn't re-orderable */
506 val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
507 break;
508 }
509
510 case nir_intrinsic_shader_clock: {
511 /* The CS2R opcode can load 64 bits worth of sysval data at a time but
512 * it's not actually atomic. In order to get correct shader clocks, we
513 * need to do a loop where we do
514 *
515 * CS2R SV_CLOCK_HI
516 * CS2R SV_CLOCK_LO
517 * CS2R SV_CLOCK_HI
518 * CS2R SV_CLOCK_LO
519 * CS2R SV_CLOCK_HI
520 * ...
521 *
522 * The moment two high values are the same, we take the low value
523 * between them and that gives us our clock.
524 *
525 * In order to make sure we don't run into any weird races, we also need
526 * to insert a barrier after every load to ensure one load completes
527 * before we kick off the next one. That way, even if one load happens to
528 * be faster than the other (they are variable latency, after all), we're
529 * still guaranteed that the loads happen in the order we want.
530 */
531 nir_variable *clock =
532 nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
533
534 nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
535 nir_ssa_bar_nv(b, clock_hi);
536
537 nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
538
539 nir_push_loop(b);
540 {
541 nir_def *last_clock = nir_load_var(b, clock);
542
543 nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
544 nir_ssa_bar_nv(b, clock_lo);
545
546 clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
547 nir_ssa_bar_nv(b, clock_hi);
548
549 nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
550
551 nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
552 {
553 nir_jump(b, nir_jump_break);
554 }
555 nir_pop_if(b, NULL);
556 }
557 nir_pop_loop(b, NULL);
558
559 val = nir_load_var(b, clock);
560 if (intrin->def.bit_size == 64)
561 val = nir_pack_64_2x32(b, val);
562 break;
563 }
564
565 case nir_intrinsic_load_warps_per_sm_nv:
566 val = nir_imm_int(b, nak->warps_per_sm);
567 break;
568
569 case nir_intrinsic_load_sm_count_nv:
570 val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
571 val = nir_ubitfield_extract_imm(b, val, 20, 9);
572 break;
573
574 case nir_intrinsic_load_warp_id_nv:
575 val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
576 val = nir_ubitfield_extract_imm(b, val, 8, 7);
577 break;
578
579 case nir_intrinsic_load_sm_id_nv:
580 val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
581 val = nir_ubitfield_extract_imm(b, val, 20, 9);
582 break;
583
584 default:
585 return false;
586 }
587
588 if (intrin->def.bit_size == 1)
589 val = nir_i2b(b, val);
590
591 nir_def_rewrite_uses(&intrin->def, val);
592
593 return true;
594 }
595
596 static bool
nak_nir_lower_system_values(nir_shader * nir,const struct nak_compiler * nak)597 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
598 {
599 return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
600 nir_metadata_none,
601 (void *)nak);
602 }
603
604 static bool
605 nak_nir_lower_varyings(nir_shader *nir, nir_variable_mode modes)
606 {
607 bool progress = false;
608
609 assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));
610
611 nir_foreach_variable_with_modes(var, nir, modes)
612 var->data.driver_location = nak_varying_attr_addr(var->data.location);
613
614 OPT(nir, nir_lower_io, modes, type_size_vec4_bytes,
615 nir_lower_io_lower_64bit_to_32);
616
617 return progress;
618 }
619
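/* Translates NIR's transform feedback layout into the NAK representation:
 * attr_index[buffer][dword] holds the 32-bit attribute index (attribute
 * byte address / 4) to stream out for each dword of the buffer's output
 * record, with 0xff meaning "skip this dword".
 */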
620 struct nak_xfb_info
621 nak_xfb_from_nir(const struct nir_xfb_info *nir_xfb)
622 {
623 if (nir_xfb == NULL)
624 return (struct nak_xfb_info) { };
625
626 struct nak_xfb_info nak_xfb = { };
627
628 u_foreach_bit(b, nir_xfb->buffers_written) {
629 nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
630 nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
631 }
632 memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */
633
634 for (unsigned o = 0; o < nir_xfb->output_count; o++) {
635 const nir_xfb_output_info *out = &nir_xfb->outputs[o];
636 const uint8_t b = out->buffer;
637 assert(nir_xfb->buffers_written & BITFIELD_BIT(b));
638
639 const uint16_t attr_addr = nak_varying_attr_addr(out->location);
640 assert(attr_addr % 4 == 0);
641 const uint16_t attr_idx = attr_addr / 4;
642
643 assert(out->offset % 4 == 0);
644 uint8_t out_idx = out->offset / 4;
645
646 u_foreach_bit(c, out->component_mask)
647 nak_xfb.attr_index[b][out_idx++] = attr_idx + c;
648
649 nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
650 }
651
652 return nak_xfb;
653 }
654
655 static nir_def *
656 load_frag_w(nir_builder *b, enum nak_interp_loc interp_loc, nir_def *offset)
657 {
658 if (offset == NULL)
659 offset = nir_imm_int(b, 0);
660
661 const uint16_t w_addr =
662 nak_sysval_attr_addr(SYSTEM_VALUE_FRAG_COORD) + 12;
663
664 const struct nak_nir_ipa_flags flags = {
665 .interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR,
666 .interp_freq = NAK_INTERP_FREQ_PASS,
667 .interp_loc = interp_loc,
668 };
669 uint32_t flags_u32;
670 memcpy(&flags_u32, &flags, sizeof(flags_u32));
671
672 return nir_ipa_nv(b, nir_imm_float(b, 0), offset,
673 .base = w_addr, .flags = flags_u32);
674 }
675
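/* Interpolates num_components dwords of an input starting at attribute
 * address addr.  On SM70+ each component is interpolated in PASS mode and,
 * for perspective interpolation, multiplied by the caller-provided 1/w.
 * On older SM50+ hardware the 1/w multiply is instead folded into the IPA
 * instruction via NAK_INTERP_FREQ_PASS_MUL_W.
 */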
676 static nir_def *
677 load_interpolated_input(nir_builder *b, unsigned num_components, uint32_t addr,
678 enum nak_interp_mode interp_mode,
679 enum nak_interp_loc interp_loc,
680 nir_def *inv_w, nir_def *offset,
681 const struct nak_compiler *nak)
682 {
683 if (offset == NULL)
684 offset = nir_imm_int(b, 0);
685
686 if (nak->sm >= 70) {
687 const struct nak_nir_ipa_flags flags = {
688 .interp_mode = interp_mode,
689 .interp_freq = NAK_INTERP_FREQ_PASS,
690 .interp_loc = interp_loc,
691 };
692 uint32_t flags_u32;
693 memcpy(&flags_u32, &flags, sizeof(flags_u32));
694
695 nir_def *comps[NIR_MAX_VEC_COMPONENTS];
696 for (unsigned c = 0; c < num_components; c++) {
697 comps[c] = nir_ipa_nv(b, nir_imm_float(b, 0), offset,
698 .base = addr + c * 4,
699 .flags = flags_u32);
700 if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
701 comps[c] = nir_fmul(b, comps[c], inv_w);
702 }
703 return nir_vec(b, comps, num_components);
704 } else if (nak->sm >= 50) {
705 struct nak_nir_ipa_flags flags = {
706 .interp_mode = interp_mode,
707 .interp_freq = NAK_INTERP_FREQ_PASS,
708 .interp_loc = interp_loc,
709 };
710
711 if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
712 flags.interp_freq = NAK_INTERP_FREQ_PASS_MUL_W;
713 else
714 inv_w = nir_imm_float(b, 0);
715
716 uint32_t flags_u32;
717 memcpy(&flags_u32, &flags, sizeof(flags_u32));
718
719 nir_def *comps[NIR_MAX_VEC_COMPONENTS];
720 for (unsigned c = 0; c < num_components; c++) {
721 comps[c] = nir_ipa_nv(b, inv_w, offset,
722 .base = addr + c * 4,
723 .flags = flags_u32);
724 }
725 return nir_vec(b, comps, num_components);
726 } else {
727 unreachable("Figure out input interpolation on Kepler");
728 }
729 }
730
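/* Fetches the sample position for sample_id from the driver-provided
 * sample-locations buffer.  Each sample is packed as one byte with x in the
 * low nibble and y in the high nibble, in units of 1/16 pixel, so a byte of
 * 0x88 decodes to (0.5, 0.5).
 */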
731 static nir_def *
732 load_sample_pos_at(nir_builder *b, nir_def *sample_id,
733 const struct nak_fs_key *fs_key)
734 {
735 nir_def *loc = nir_load_ubo(b, 1, 64,
736 nir_imm_int(b, fs_key->sample_locations_cb),
737 nir_imm_int(b, fs_key->sample_locations_offset),
738 .align_mul = 8,
739 .align_offset = 0,
740 .range = fs_key->sample_locations_offset + 8);
741
742 /* Yay little endian */
743 loc = nir_ushr(b, loc, nir_imul_imm(b, sample_id, 8));
744 nir_def *loc_x_u4 = nir_iand_imm(b, loc, 0xf);
745 nir_def *loc_y_u4 = nir_iand_imm(b, nir_ushr_imm(b, loc, 4), 0xf);
746 nir_def *loc_u4 = nir_vec2(b, loc_x_u4, loc_y_u4);
747 nir_def *result = nir_fmul_imm(b, nir_i2f32(b, loc_u4), 1.0 / 16.0);
748
749 return result;
750 }
751
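/* Computes the packed interpolation offset expected by IPA with
 * NAK_INTERP_LOC_OFFSET: the offset (either given directly or derived from
 * a sample position) is clamped to [-0.5, 0.4375] and converted to signed
 * fixed point with 12 fractional bits, with y in the top 16 bits and x in
 * the bottom 16.
 */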
752 static nir_def *
753 load_barycentric_offset(nir_builder *b, nir_intrinsic_instr *bary,
754 const struct nak_fs_key *fs_key)
755 {
756 nir_def *offset_f;
757
758 if (bary->intrinsic == nir_intrinsic_load_barycentric_coord_at_sample ||
759 bary->intrinsic == nir_intrinsic_load_barycentric_at_sample) {
760 nir_def *sample_id = bary->src[0].ssa;
761 nir_def *sample_pos = load_sample_pos_at(b, sample_id, fs_key);
762 offset_f = nir_fadd_imm(b, sample_pos, -0.5);
763 } else {
764 offset_f = bary->src[0].ssa;
765 }
766
767 offset_f = nir_fclamp(b, offset_f, nir_imm_float(b, -0.5),
768 nir_imm_float(b, 0.437500));
769 nir_def *offset_fixed =
770 nir_f2i32(b, nir_fmul_imm(b, offset_f, 4096.0));
771 nir_def *offset = nir_ior(b, nir_ishl_imm(b, nir_channel(b, offset_fixed, 1), 16),
772 nir_iand_imm(b, nir_channel(b, offset_fixed, 0),
773 0xffff));
774
775 return offset;
776 }
777
778 struct lower_fs_input_ctx {
779 const struct nak_compiler *nak;
780 const struct nak_fs_key *fs_key;
781 };
782
783 static bool
784 lower_fs_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
785 {
786 const struct lower_fs_input_ctx *ctx = data;
787
788 switch (intrin->intrinsic) {
789 case nir_intrinsic_load_barycentric_pixel: {
790 if (!(ctx->fs_key && ctx->fs_key->force_sample_shading))
791 return false;
792
793 intrin->intrinsic = nir_intrinsic_load_barycentric_sample;
794 return true;
795 }
796
797 case nir_intrinsic_load_frag_coord:
798 case nir_intrinsic_load_point_coord: {
799 b->cursor = nir_before_instr(&intrin->instr);
800
801 const enum nak_interp_loc interp_loc =
802 b->shader->info.fs.uses_sample_shading ? NAK_INTERP_LOC_CENTROID
803 : NAK_INTERP_LOC_DEFAULT;
804 const uint32_t addr =
805 intrin->intrinsic == nir_intrinsic_load_point_coord ?
806 nak_sysval_attr_addr(SYSTEM_VALUE_POINT_COORD) :
807 nak_sysval_attr_addr(SYSTEM_VALUE_FRAG_COORD);
808
809 nir_def *coord = load_interpolated_input(b, intrin->def.num_components,
810 addr,
811 NAK_INTERP_MODE_SCREEN_LINEAR,
812 interp_loc, NULL, NULL,
813 ctx->nak);
814
815 nir_def_rewrite_uses(&intrin->def, coord);
816 nir_instr_remove(&intrin->instr);
817
818 return true;
819 }
820
821 case nir_intrinsic_load_input: {
822 b->cursor = nir_before_instr(&intrin->instr);
823
824 uint16_t addr = nir_intrinsic_base(intrin) +
825 nir_src_as_uint(intrin->src[0]) +
826 nir_intrinsic_component(intrin) * 4;
827
828 const struct nak_nir_ipa_flags flags = {
829 .interp_mode = NAK_INTERP_MODE_CONSTANT,
830 .interp_freq = NAK_INTERP_FREQ_CONSTANT,
831 .interp_loc = NAK_INTERP_LOC_DEFAULT,
832 };
833 uint32_t flags_u32;
834 memcpy(&flags_u32, &flags, sizeof(flags_u32));
835
836 nir_def *comps[NIR_MAX_VEC_COMPONENTS];
837 for (unsigned c = 0; c < intrin->def.num_components; c++) {
838 comps[c] = nir_ipa_nv(b, nir_imm_float(b, 0), nir_imm_int(b, 0),
839 .base = addr + c * 4, .flags = flags_u32);
840 }
841 nir_def *res = nir_vec(b, comps, intrin->def.num_components);
842
843 nir_def_rewrite_uses(&intrin->def, res);
844 nir_instr_remove(&intrin->instr);
845
846 return true;
847 }
848
849 case nir_intrinsic_load_barycentric_coord_pixel:
850 case nir_intrinsic_load_barycentric_coord_centroid:
851 case nir_intrinsic_load_barycentric_coord_sample:
852 case nir_intrinsic_load_barycentric_coord_at_sample:
853 case nir_intrinsic_load_barycentric_coord_at_offset: {
854 b->cursor = nir_before_instr(&intrin->instr);
855
856 uint32_t addr;
857 enum nak_interp_mode interp_mode;
858 if (nir_intrinsic_interp_mode(intrin) == INTERP_MODE_NOPERSPECTIVE) {
859 addr = NAK_ATTR_BARY_COORD_NO_PERSP;
860 interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR;
861 } else {
862 addr = NAK_ATTR_BARY_COORD;
863 interp_mode = NAK_INTERP_MODE_PERSPECTIVE;
864 }
865
866 nir_def *offset = NULL;
867 enum nak_interp_loc interp_loc;
868 switch (intrin->intrinsic) {
869 case nir_intrinsic_load_barycentric_coord_at_sample:
870 case nir_intrinsic_load_barycentric_coord_at_offset:
871 interp_loc = NAK_INTERP_LOC_OFFSET;
872 offset = load_barycentric_offset(b, intrin, ctx->fs_key);
873 break;
874 case nir_intrinsic_load_barycentric_coord_centroid:
875 case nir_intrinsic_load_barycentric_coord_sample:
876 interp_loc = NAK_INTERP_LOC_CENTROID;
877 break;
878 case nir_intrinsic_load_barycentric_coord_pixel:
879 interp_loc = NAK_INTERP_LOC_DEFAULT;
880 break;
881 default:
882 unreachable("Unknown intrinsic");
883 }
884
885 nir_def *inv_w = NULL;
886 if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
887 inv_w = nir_frcp(b, load_frag_w(b, interp_loc, offset));
888
889 nir_def *res = load_interpolated_input(b, intrin->def.num_components,
890 addr, interp_mode, interp_loc,
891 inv_w, offset, ctx->nak);
892
893 nir_def_rewrite_uses(&intrin->def, res);
894 nir_instr_remove(&intrin->instr);
895
896 return true;
897 }
898
899 case nir_intrinsic_load_interpolated_input: {
900 b->cursor = nir_before_instr(&intrin->instr);
901
902 const uint16_t addr = nir_intrinsic_base(intrin) +
903 nir_src_as_uint(intrin->src[1]) +
904 nir_intrinsic_component(intrin) * 4;
905
906 nir_intrinsic_instr *bary = nir_src_as_intrinsic(intrin->src[0]);
907
908 enum nak_interp_mode interp_mode;
909 if (nir_intrinsic_interp_mode(bary) == INTERP_MODE_SMOOTH ||
910 nir_intrinsic_interp_mode(bary) == INTERP_MODE_NONE)
911 interp_mode = NAK_INTERP_MODE_PERSPECTIVE;
912 else
913 interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR;
914
915 nir_def *offset = NULL;
916 enum nak_interp_loc interp_loc;
917 switch (bary->intrinsic) {
918 case nir_intrinsic_load_barycentric_at_offset:
919 case nir_intrinsic_load_barycentric_at_sample: {
920 interp_loc = NAK_INTERP_LOC_OFFSET;
921 offset = load_barycentric_offset(b, bary, ctx->fs_key);
922 break;
923 }
924
925 case nir_intrinsic_load_barycentric_centroid:
926 case nir_intrinsic_load_barycentric_sample:
927 interp_loc = NAK_INTERP_LOC_CENTROID;
928 break;
929
930 case nir_intrinsic_load_barycentric_pixel:
931 interp_loc = NAK_INTERP_LOC_DEFAULT;
932 break;
933
934 default:
935 unreachable("Unsupported barycentric");
936 }
937
938 nir_def *inv_w = NULL;
939 if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
940 inv_w = nir_frcp(b, load_frag_w(b, interp_loc, offset));
941
942 nir_def *res = load_interpolated_input(b, intrin->def.num_components,
943 addr, interp_mode, interp_loc,
944 inv_w, offset, ctx->nak);
945
946 nir_def_rewrite_uses(&intrin->def, res);
947 nir_instr_remove(&intrin->instr);
948
949 return true;
950 }
951
952 case nir_intrinsic_load_sample_mask_in: {
953 if (!b->shader->info.fs.uses_sample_shading &&
954 !(ctx->fs_key && ctx->fs_key->force_sample_shading))
955 return false;
956
957 b->cursor = nir_after_instr(&intrin->instr);
958
959 /* Mask off just the current sample */
960 nir_def *sample = nir_load_sample_id(b);
961 nir_def *mask = nir_ishl(b, nir_imm_int(b, 1), sample);
962 mask = nir_iand(b, &intrin->def, mask);
963 nir_def_rewrite_uses_after(&intrin->def, mask, mask->parent_instr);
964
965 return true;
966 }
967
968 case nir_intrinsic_load_sample_pos: {
969 b->cursor = nir_before_instr(&intrin->instr);
970
971 nir_def *sample_id = nir_load_sample_id(b);
972 nir_def *sample_pos = load_sample_pos_at(b, sample_id, ctx->fs_key);
973
974 nir_def_rewrite_uses(&intrin->def, sample_pos);
975 nir_instr_remove(&intrin->instr);
976
977 return true;
978 }
979
980 case nir_intrinsic_load_input_vertex: {
981 b->cursor = nir_before_instr(&intrin->instr);
982
983 unsigned vertex_id = nir_src_as_uint(intrin->src[0]);
984 assert(vertex_id < 3);
985
986 const uint16_t addr = nir_intrinsic_base(intrin) +
987 nir_src_as_uint(intrin->src[1]) +
988 nir_intrinsic_component(intrin) * 4;
989
990 nir_def *comps[NIR_MAX_VEC_COMPONENTS];
991 for (unsigned c = 0; c < intrin->def.num_components; c++) {
992 nir_def *data = nir_ldtram_nv(b, .base = addr + c * 4,
993 .flags = vertex_id == 2);
994 comps[c] = nir_channel(b, data, vertex_id & 1);
995 }
996 nir_def *res = nir_vec(b, comps, intrin->num_components);
997
998 nir_def_rewrite_uses(&intrin->def, res);
999 nir_instr_remove(&intrin->instr);
1000
1001 return true;
1002 }
1003
1004 default:
1005 return false;
1006 }
1007 }
1008
1009 static bool
1010 nak_nir_lower_fs_inputs(nir_shader *nir,
1011 const struct nak_compiler *nak,
1012 const struct nak_fs_key *fs_key)
1013 {
1014 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
1015 NIR_PASS_V(nir, nak_nir_lower_varyings, nir_var_shader_in);
1016 NIR_PASS_V(nir, nir_opt_constant_folding);
1017
1018 const struct lower_fs_input_ctx fs_in_ctx = {
1019 .nak = nak,
1020 .fs_key = fs_key,
1021 };
1022 NIR_PASS_V(nir, nir_shader_intrinsics_pass, lower_fs_input_intrin,
1023 nir_metadata_block_index | nir_metadata_dominance,
1024 (void *)&fs_in_ctx);
1025
1026 return true;
1027 }
1028
1029 static int
1030 fs_out_size(const struct glsl_type *type, bool bindless)
1031 {
1032 assert(glsl_type_is_vector_or_scalar(type));
1033 return 16;
1034 }
1035
1036 static bool
1037 nak_nir_lower_fs_outputs(nir_shader *nir)
1038 {
1039 if (nir->info.outputs_written == 0)
1040 return false;
1041
1042 NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true);
1043
1044 nir->num_outputs = 0;
1045 nir_foreach_shader_out_variable(var, nir) {
1046 switch (var->data.location) {
1047 case FRAG_RESULT_DEPTH:
1048 assert(var->data.index == 0);
1049 assert(var->data.location_frac == 0);
1050 var->data.driver_location = NAK_FS_OUT_DEPTH;
1051 break;
1052 case FRAG_RESULT_STENCIL:
1053 unreachable("EXT_shader_stencil_export not supported");
1054 break;
1055 case FRAG_RESULT_COLOR:
1056 unreachable("Vulkan alway uses explicit locations");
1057 break;
1058 case FRAG_RESULT_SAMPLE_MASK:
1059 assert(var->data.index == 0);
1060 assert(var->data.location_frac == 0);
1061 var->data.driver_location = NAK_FS_OUT_SAMPLE_MASK;
1062 break;
1063 default: {
1064 assert(var->data.location >= FRAG_RESULT_DATA0);
1065 assert(var->data.index < 2);
1066 const unsigned out =
1067 (var->data.location - FRAG_RESULT_DATA0) + var->data.index;
1068 var->data.driver_location = NAK_FS_OUT_COLOR(out);
1069 break;
1070 }
1071 }
1072 }
1073
1074 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_out, fs_out_size, 0);
1075
1076 return true;
1077 }
1078
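/* Callback for nir_opt_load_store_vectorize(): allows combining accesses as
 * long as the combined access fits within its (capped) alignment, up to 128
 * bits, or 64 bits for UBO loads.
 */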
1079 static bool
1080 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
1081 unsigned bit_size, unsigned num_components,
1082 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
1083 void *cb_data)
1084 {
1085 /*
1086 * Since we legalize these later with nir_lower_mem_access_bit_sizes,
1087 * we can optimistically combine anything that might be profitable
1088 */
1089 assert(util_is_power_of_two_nonzero(align_mul));
1090
1091 unsigned max_bytes = 128u / 8u;
1092 if (low->intrinsic == nir_intrinsic_load_ubo)
1093 max_bytes = 64u / 8u;
1094
1095 align_mul = MIN2(align_mul, max_bytes);
1096 align_offset = align_offset % align_mul;
1097 return align_offset + num_components * (bit_size / 8) <= align_mul;
1098 }
1099
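/* Callback for nir_lower_mem_access_bit_sizes(): picks the widest legal
 * chunk for a given access.  For example, a 12-byte (non-UBO) read with
 * 4-byte alignment is rounded up to 16 bytes, limited by its 4-byte
 * alignment, and emitted as 32-bit single-component accesses, while a
 * 16-byte read with 16-byte alignment becomes a single 32-bit x4 access.
 */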
1100 static nir_mem_access_size_align
1101 nak_mem_access_size_align(nir_intrinsic_op intrin,
1102 uint8_t bytes, uint8_t bit_size,
1103 uint32_t align_mul, uint32_t align_offset,
1104 bool offset_is_const, const void *cb_data)
1105 {
1106 const uint32_t align = nir_combined_align(align_mul, align_offset);
1107 assert(util_is_power_of_two_nonzero(align));
1108
1109 unsigned bytes_pow2;
1110 if (nir_intrinsic_infos[intrin].has_dest) {
1111 /* Reads can over-fetch a bit if the alignment is okay. */
1112 bytes_pow2 = util_next_power_of_two(bytes);
1113 } else {
1114 bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
1115 }
1116
1117 unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
1118 assert(util_is_power_of_two_nonzero(chunk_bytes));
1119 if (intrin == nir_intrinsic_load_ubo)
1120 chunk_bytes = MIN2(chunk_bytes, 8);
1121
1122 if (intrin == nir_intrinsic_load_ubo && align < 4) {
1123 /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
1124 * In particular, this applies to ldc.u16 which means we either have to
1125 * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
1126 * the 16bit value out. Fortunately, nir_lower_mem_access_bit_sizes()
1127 * can handle over-alignment for reads.
1128 */
1129 if (align == 2 || offset_is_const) {
1130 return (nir_mem_access_size_align) {
1131 .bit_size = 32,
1132 .num_components = 1,
1133 .align = 4,
1134 };
1135 } else {
1136 assert(align == 1);
1137 return (nir_mem_access_size_align) {
1138 .bit_size = 8,
1139 .num_components = 1,
1140 .align = 1,
1141 };
1142 }
1143 } else if (chunk_bytes < 4) {
1144 return (nir_mem_access_size_align) {
1145 .bit_size = chunk_bytes * 8,
1146 .num_components = 1,
1147 .align = chunk_bytes,
1148 };
1149 } else {
1150 return (nir_mem_access_size_align) {
1151 .bit_size = 32,
1152 .num_components = chunk_bytes / 4,
1153 .align = chunk_bytes,
1154 };
1155 }
1156 }
1157
1158 static bool
1159 nir_shader_has_local_variables(const nir_shader *nir)
1160 {
1161 nir_foreach_function(func, nir) {
1162 if (func->impl && !exec_list_is_empty(&func->impl->locals))
1163 return true;
1164 }
1165
1166 return false;
1167 }
1168
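/* Final NIR lowering before handing the shader to the backend: subgroup
 * ops, function-temp variables, load/store vectorization and bit-size
 * legalization, per-stage I/O lowering, fp64/int64 lowering, a late
 * algebraic optimization loop, divergence analysis, and barrier insertion.
 * Afterwards, blocks and SSA defs are re-indexed so the backend can use
 * them as array indices.
 */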
1169 void
1170 nak_postprocess_nir(nir_shader *nir,
1171 const struct nak_compiler *nak,
1172 nir_variable_mode robust2_modes,
1173 const struct nak_fs_key *fs_key)
1174 {
1175 UNUSED bool progress = false;
1176
1177 nak_optimize_nir(nir, nak);
1178
1179 const nir_lower_subgroups_options subgroups_options = {
1180 .subgroup_size = 32,
1181 .ballot_bit_size = 32,
1182 .ballot_components = 1,
1183 .lower_to_scalar = true,
1184 .lower_vote_eq = true,
1185 .lower_first_invocation_to_ballot = true,
1186 .lower_read_first_invocation = true,
1187 .lower_elect = true,
1188 .lower_inverse_ballot = true,
1189 };
1190 OPT(nir, nir_lower_subgroups, &subgroups_options);
1191 OPT(nir, nak_nir_lower_scan_reduce);
1192
1193 if (nir_shader_has_local_variables(nir)) {
1194 OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
1195 glsl_get_natural_size_align_bytes);
1196 OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
1197 nir_address_format_32bit_offset);
1198 nak_optimize_nir(nir, nak);
1199 }
1200
1201 OPT(nir, nir_opt_shrink_vectors);
1202
1203 nir_load_store_vectorize_options vectorize_opts = {};
1204 vectorize_opts.modes = nir_var_mem_global |
1205 nir_var_mem_ssbo |
1206 nir_var_mem_shared |
1207 nir_var_shader_temp;
1208 vectorize_opts.callback = nak_mem_vectorize_cb;
1209 vectorize_opts.robust_modes = robust2_modes;
1210 OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
1211
1212 nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
1213 .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
1214 .callback = nak_mem_access_size_align,
1215 };
1216 OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
1217 OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
1218
1219 OPT(nir, nir_opt_combine_barriers, NULL, NULL);
1220
1221 nak_optimize_nir(nir, nak);
1222
1223 OPT(nir, nak_nir_lower_tex, nak);
1224 OPT(nir, nir_lower_idiv, NULL);
1225
1226 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
1227
1228 OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);
1229
1230 if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1231 OPT(nir, nir_lower_tess_coord_z,
1232 nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
1233 }
1234
1235 OPT(nir, nak_nir_lower_system_values, nak);
1236
1237 switch (nir->info.stage) {
1238 case MESA_SHADER_VERTEX:
1239 OPT(nir, nak_nir_lower_vs_inputs);
1240 OPT(nir, nak_nir_lower_varyings, nir_var_shader_out);
1241 OPT(nir, nir_opt_constant_folding);
1242 OPT(nir, nak_nir_lower_vtg_io, nak);
1243 break;
1244
1245 case MESA_SHADER_TESS_CTRL:
1246 case MESA_SHADER_TESS_EVAL:
1247 OPT(nir, nak_nir_lower_varyings, nir_var_shader_in | nir_var_shader_out);
1248 OPT(nir, nir_opt_constant_folding);
1249 OPT(nir, nak_nir_lower_vtg_io, nak);
1250 break;
1251
1252 case MESA_SHADER_FRAGMENT:
1253 OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
1254 OPT(nir, nak_nir_lower_fs_outputs);
1255 break;
1256
1257 case MESA_SHADER_GEOMETRY:
1258 OPT(nir, nak_nir_lower_varyings, nir_var_shader_in | nir_var_shader_out);
1259 OPT(nir, nir_opt_constant_folding);
1260 OPT(nir, nak_nir_lower_vtg_io, nak);
1261 OPT(nir, nak_nir_lower_gs_intrinsics);
1262 break;
1263
1264 case MESA_SHADER_COMPUTE:
1265 case MESA_SHADER_KERNEL:
1266 break;
1267
1268 default:
1269 unreachable("Unsupported shader stage");
1270 }
1271
1272 OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
1273 OPT(nir, nir_lower_int64);
1274
1275 nak_optimize_nir(nir, nak);
1276
1277 do {
1278 progress = false;
1279 OPT(nir, nir_opt_algebraic_late);
1280 OPT(nir, nak_nir_lower_algebraic_late, nak);
1281
1282 /* If we're lowering fp64 sat but not min/max, the sat lowering may have
1283 * been undone by nir_opt_algebraic. Lower sat again just to be sure.
1284 */
1285 if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
1286 !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
1287 OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);
1288
1289 if (progress) {
1290 OPT(nir, nir_opt_constant_folding);
1291 OPT(nir, nir_copy_prop);
1292 OPT(nir, nir_opt_dce);
1293 OPT(nir, nir_opt_cse);
1294 }
1295 } while (progress);
1296
1297 nir_divergence_analysis(nir);
1298
1299 OPT(nir, nak_nir_add_barriers, nak);
1300
1301 /* Re-index blocks and compact SSA defs because we'll use them to index
1302 * arrays
1303 */
1304 nir_foreach_function(func, nir) {
1305 if (func->impl) {
1306 nir_index_blocks(func->impl);
1307 nir_index_ssa_defs(func->impl);
1308 }
1309 }
1310
1311 if (nak_should_print_nir())
1312 nir_print_shader(nir, stderr);
1313 }
1314
1315 static bool
1316 scalar_is_imm_int(nir_scalar x, unsigned bits)
1317 {
1318 if (!nir_scalar_is_const(x))
1319 return false;
1320
1321 int64_t imm = nir_scalar_as_int(x);
1322 return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
1323 }
1324
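/* Splits an I/O address into a base nir_scalar plus a small constant offset
 * that fits in imm_bits.  For example, with imm_bits = 8, iadd(x, 16)
 * yields base = x, offset = 16, while iadd(x, 4096) is returned as
 * base = iadd(x, 4096), offset = 0 because 4096 does not fit in 8 bits.
 */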
1325 struct nak_io_addr_offset
1326 nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
1327 {
1328 nir_scalar addr_s = {
1329 .def = addr,
1330 .comp = 0,
1331 };
1332 if (scalar_is_imm_int(addr_s, imm_bits)) {
1333 /* Base is a dumb name for this. It should be offset */
1334 return (struct nak_io_addr_offset) {
1335 .offset = nir_scalar_as_int(addr_s),
1336 };
1337 }
1338
1339 addr_s = nir_scalar_chase_movs(addr_s);
1340 if (!nir_scalar_is_alu(addr_s) ||
1341 nir_scalar_alu_op(addr_s) != nir_op_iadd) {
1342 return (struct nak_io_addr_offset) {
1343 .base = addr_s,
1344 };
1345 }
1346
1347 for (unsigned i = 0; i < 2; i++) {
1348 nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
1349 off_s = nir_scalar_chase_movs(off_s);
1350 if (scalar_is_imm_int(off_s, imm_bits)) {
1351 return (struct nak_io_addr_offset) {
1352 .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
1353 .offset = nir_scalar_as_int(off_s),
1354 };
1355 }
1356 }
1357
1358 return (struct nak_io_addr_offset) {
1359 .base = addr_s,
1360 };
1361 }
1362