1 /*
2  * Copyright © 2022 Collabora, Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "nak_private.h"
7 #include "nir_builder.h"
8 #include "nir_control_flow.h"
9 #include "nir_xfb_info.h"
10 
11 #include "util/u_math.h"
12 
13 #define OPT(nir, pass, ...) ({                           \
14    bool this_progress = false;                           \
15    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);    \
16    if (this_progress)                                    \
17       progress = true;                                   \
18    this_progress;                                        \
19 })
20 
21 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
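/* For reference: OPT() is meant to be used inside a function that declares a
 * local `bool progress`, so each pass both updates that flag and reports its
 * own result, e.g. (as in optimize_nir() below):
 *
 *    if (OPT(nir, nir_opt_memcpy))
 *       OPT(nir, nir_split_var_copies);
 */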
22 
23 bool
24 nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
25 {
26    switch (nir->info.stage) {
27    case MESA_SHADER_VERTEX:
28    case MESA_SHADER_TESS_EVAL:
29    case MESA_SHADER_GEOMETRY:
30    case MESA_SHADER_FRAGMENT:
31       unreachable("Shader stage does not have workgroups");
32       break;
33 
34    case MESA_SHADER_TESS_CTRL:
35       /* Tessellation only ever has one subgroup per workgroup.  The Vulkan
36        * limit on the number of tessellation invocations is 32 to allow for
37        * this.
38        */
39       return true;
40 
41    case MESA_SHADER_COMPUTE:
42    case MESA_SHADER_KERNEL: {
43       if (nir->info.workgroup_size_variable)
44          return false;
45 
46       uint16_t wg_sz = nir->info.workgroup_size[0] *
47                        nir->info.workgroup_size[1] *
48                        nir->info.workgroup_size[2];
49 
50       return wg_sz <= NAK_SUBGROUP_SIZE;
51    }
52 
53    default:
54       unreachable("Unknown shader stage");
55    }
56 }
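/* For example, a fixed 8x4x1 compute workgroup has 32 invocations and fits in
 * a single NAK_SUBGROUP_SIZE-wide (32-wide) subgroup, so this returns true;
 * a 16x4x1 workgroup (64 invocations) does not.
 */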
57 
58 static uint8_t
59 vectorize_filter_cb(const nir_instr *instr, const void *_data)
60 {
61    if (instr->type != nir_instr_type_alu)
62       return 0;
63 
64    const nir_alu_instr *alu = nir_instr_as_alu(instr);
65 
66    const unsigned bit_size = nir_alu_instr_is_comparison(alu)
67                              ? alu->src[0].src.ssa->bit_size
68                              : alu->def.bit_size;
69 
70    switch (alu->op) {
71    case nir_op_fadd:
72    case nir_op_fsub:
73    case nir_op_fabs:
74    case nir_op_fneg:
75    case nir_op_feq:
76    case nir_op_fge:
77    case nir_op_flt:
78    case nir_op_fneu:
79    case nir_op_fmul:
80    case nir_op_ffma:
81    case nir_op_fsign:
82    case nir_op_fsat:
83    case nir_op_fmax:
84    case nir_op_fmin:
85       return bit_size == 16 ? 2 : 1;
86    default:
87       return 1;
88    }
89 }
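/* The filter above returns a target width of 2 only for 16-bit float ALU ops,
 * so nir_lower_alu_width/nir_opt_vectorize keep fp16 math in vec2s, which
 * presumably map to the hardware's packed-fp16 instructions; everything else
 * stays scalar (width 1).
 */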
90 
91 static void
92 optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
93 {
94    bool progress;
95 
96    unsigned lower_flrp =
97       (nir->options->lower_flrp16 ? 16 : 0) |
98       (nir->options->lower_flrp32 ? 32 : 0) |
99       (nir->options->lower_flrp64 ? 64 : 0);
100 
101    do {
102       progress = false;
103 
104       /* This pass is causing problems with types used by OpenCL:
105        *    https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
106        *
107        * Running with it disabled made no difference in the resulting assembly
108        * code.
109        */
110       if (nir->info.stage != MESA_SHADER_KERNEL)
111          OPT(nir, nir_split_array_vars, nir_var_function_temp);
112 
113       OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
114       OPT(nir, nir_opt_deref);
115       if (OPT(nir, nir_opt_memcpy))
116          OPT(nir, nir_split_var_copies);
117 
118       OPT(nir, nir_lower_vars_to_ssa);
119 
120       if (allow_copies) {
121          /* Only run this pass in the first call to optimize_nir().  Later
122           * calls assume that we've lowered away any copy_deref instructions
123           * and we don't want to introduce any more.
124           */
125          OPT(nir, nir_opt_find_array_copies);
126       }
127       OPT(nir, nir_opt_copy_prop_vars);
128       OPT(nir, nir_opt_dead_write_vars);
129       OPT(nir, nir_opt_combine_stores, nir_var_all);
130 
131       OPT(nir, nir_lower_alu_width, vectorize_filter_cb, NULL);
132       OPT(nir, nir_opt_vectorize, vectorize_filter_cb, NULL);
133       OPT(nir, nir_lower_phis_to_scalar, false);
134       OPT(nir, nir_lower_frexp);
135       OPT(nir, nir_copy_prop);
136       OPT(nir, nir_opt_dce);
137       OPT(nir, nir_opt_cse);
138 
139       OPT(nir, nir_opt_peephole_select, 0, false, false);
140       OPT(nir, nir_opt_intrinsics);
141       OPT(nir, nir_opt_idiv_const, 32);
142       OPT(nir, nir_opt_algebraic);
143       OPT(nir, nir_lower_constant_convert_alu_types);
144       OPT(nir, nir_opt_constant_folding);
145 
146       if (lower_flrp != 0) {
147          if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
148             OPT(nir, nir_opt_constant_folding);
149          /* Nothing should rematerialize any flrps */
150          lower_flrp = 0;
151       }
152 
153       OPT(nir, nir_opt_dead_cf);
154       if (OPT(nir, nir_opt_loop)) {
155          /* If nir_opt_loop makes progress, then we need to clean things up
156           * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make
157           * progress.
158           */
159          OPT(nir, nir_copy_prop);
160          OPT(nir, nir_opt_dce);
161       }
162       OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
163       OPT(nir, nir_opt_conditional_discard);
164       if (nir->options->max_unroll_iterations != 0) {
165          OPT(nir, nir_opt_loop_unroll);
166       }
167       OPT(nir, nir_opt_remove_phis);
168       OPT(nir, nir_opt_gcm, false);
169       OPT(nir, nir_opt_undef);
170       OPT(nir, nir_lower_pack);
171    } while (progress);
172 
173    OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
174 }
175 
176 void
177 nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
178 {
179    optimize_nir(nir, nak, false);
180 }
181 
182 static unsigned
183 lower_bit_size_cb(const nir_instr *instr, void *data)
184 {
185    const struct nak_compiler *nak = data;
186 
187    switch (instr->type) {
188    case nir_instr_type_alu: {
189       nir_alu_instr *alu = nir_instr_as_alu(instr);
190       if (nir_op_infos[alu->op].is_conversion)
191          return 0;
192 
193       const unsigned bit_size = nir_alu_instr_is_comparison(alu)
194                                 ? alu->src[0].src.ssa->bit_size
195                                 : alu->def.bit_size;
196 
197       switch (alu->op) {
198       case nir_op_bit_count:
199       case nir_op_ufind_msb:
200       case nir_op_ifind_msb:
201       case nir_op_find_lsb:
202          /* These are handled specially because the destination is always
203           * 32-bit and so the bit size of the instruction is given by the
204           * source.
205           */
206          return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;
207 
208       case nir_op_fabs:
209       case nir_op_fadd:
210       case nir_op_fneg:
211       case nir_op_feq:
212       case nir_op_fge:
213       case nir_op_flt:
214       case nir_op_fneu:
215       case nir_op_fmul:
216       case nir_op_ffma:
217       case nir_op_ffmaz:
218       case nir_op_fsign:
219       case nir_op_fsat:
220       case nir_op_fceil:
221       case nir_op_ffloor:
222       case nir_op_fround_even:
223       case nir_op_ftrunc:
224          if (bit_size == 16  && nak->sm >= 70)
225             return 0;
226          break;
227 
228       case nir_op_fmax:
229       case nir_op_fmin:
230          if (bit_size == 16 && nak->sm >= 80)
231             return 0;
232          break;
233 
234       default:
235          break;
236       }
237 
238       if (bit_size >= 32)
239          return 0;
240 
241       if (bit_size & (8 | 16))
242          return 32;
243 
244       return 0;
245    }
246 
247    case nir_instr_type_intrinsic: {
248       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
249       switch (intrin->intrinsic) {
250       case nir_intrinsic_vote_ieq:
251          if (intrin->src[0].ssa->bit_size != 1 &&
252              intrin->src[0].ssa->bit_size < 32)
253             return 32;
254          return 0;
255 
256       case nir_intrinsic_vote_feq:
257       case nir_intrinsic_read_invocation:
258       case nir_intrinsic_read_first_invocation:
259       case nir_intrinsic_shuffle:
260       case nir_intrinsic_shuffle_xor:
261       case nir_intrinsic_shuffle_up:
262       case nir_intrinsic_shuffle_down:
263       case nir_intrinsic_quad_broadcast:
264       case nir_intrinsic_quad_swap_horizontal:
265       case nir_intrinsic_quad_swap_vertical:
266       case nir_intrinsic_quad_swap_diagonal:
267       case nir_intrinsic_reduce:
268       case nir_intrinsic_inclusive_scan:
269       case nir_intrinsic_exclusive_scan:
270          if (intrin->src[0].ssa->bit_size < 32)
271             return 32;
272          return 0;
273 
274       default:
275          return 0;
276       }
277    }
278 
279    case nir_instr_type_phi: {
280       nir_phi_instr *phi = nir_instr_as_phi(instr);
281       if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
282          return 32;
283       return 0;
284    }
285 
286    default:
287       return 0;
288    }
289 }
290 
291 void
292 nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
293 {
294    UNUSED bool progress = false;
295 
296    nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");
297 
298    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
299       nir_lower_io_to_temporaries(nir, nir_shader_get_entrypoint(nir),
300                                   true /* outputs */, false /* inputs */);
301    }
302 
303    const nir_lower_tex_options tex_options = {
304       .lower_txd_3d = true,
305       .lower_txd_cube_map = true,
306       .lower_txd_clamp = true,
307       .lower_txd_shadow = true,
308       .lower_txp = ~0,
309       /* TODO: More lowering */
310    };
311    OPT(nir, nir_lower_tex, &tex_options);
312    OPT(nir, nir_normalize_cubemap_coords);
313 
314    nir_lower_image_options image_options = {
315       .lower_cube_size = true,
316    };
317    OPT(nir, nir_lower_image, &image_options);
318 
319    OPT(nir, nir_lower_global_vars_to_local);
320 
321    OPT(nir, nir_split_var_copies);
322    OPT(nir, nir_split_struct_vars, nir_var_function_temp);
323 
324    /* Optimize but allow copies because we haven't lowered them yet */
325    optimize_nir(nir, nak, true /* allow_copies */);
326 
327    OPT(nir, nir_lower_load_const_to_scalar);
328    OPT(nir, nir_lower_var_copies);
329    OPT(nir, nir_lower_system_values);
330    OPT(nir, nir_lower_compute_system_values, NULL);
331 
332    if (nir->info.stage == MESA_SHADER_FRAGMENT)
333       OPT(nir, nir_lower_terminate_to_demote);
334 }
335 
336 uint16_t
337 nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
338 {
339    if (slot >= VARYING_SLOT_PATCH0) {
340       return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
341    } else if (slot >= VARYING_SLOT_VAR0) {
342       return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
343    } else {
344       switch (slot) {
345       case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
346       case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERRIOR;
347       case VARYING_SLOT_PRIMITIVE_ID:     return NAK_ATTR_PRIMITIVE_ID;
348       case VARYING_SLOT_LAYER:            return NAK_ATTR_RT_ARRAY_INDEX;
349       case VARYING_SLOT_VIEWPORT:         return NAK_ATTR_VIEWPORT_INDEX;
350       case VARYING_SLOT_PSIZ:             return NAK_ATTR_POINT_SIZE;
351       case VARYING_SLOT_POS:              return NAK_ATTR_POSITION;
352       case VARYING_SLOT_CLIP_DIST0:       return NAK_ATTR_CLIP_CULL_DIST_0;
353       case VARYING_SLOT_CLIP_DIST1:       return NAK_ATTR_CLIP_CULL_DIST_4;
354       case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
355          return nak->sm >= 86 ? NAK_ATTR_VPRS_TABLE_INDEX
356                               : NAK_ATTR_VIEWPORT_INDEX;
357       default: unreachable("Invalid varying slot");
358       }
359    }
360 }
361 
362 static uint16_t
363 nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
364 {
365    switch (slot) {
366    case FRAG_RESULT_DEPTH:
367       assert(blend_idx == 0);
368       return NAK_FS_OUT_DEPTH;
369 
370    case FRAG_RESULT_STENCIL:
371       unreachable("EXT_shader_stencil_export not supported");
372 
373    case FRAG_RESULT_COLOR:
374       unreachable("Vulkan alway uses explicit locations");
375 
376    case FRAG_RESULT_SAMPLE_MASK:
377       assert(blend_idx == 0);
378       return NAK_FS_OUT_SAMPLE_MASK;
379 
380    default:
381       assert(blend_idx < 2);
382       return NAK_FS_OUT_COLOR((slot - FRAG_RESULT_DATA0) + blend_idx);
383    }
384 }
385 
386 uint16_t
387 nak_sysval_attr_addr(const struct nak_compiler *nak, gl_system_value sysval)
388 {
389    switch (sysval) {
390    case SYSTEM_VALUE_PRIMITIVE_ID:  return NAK_ATTR_PRIMITIVE_ID;
391    case SYSTEM_VALUE_FRAG_COORD:    return NAK_ATTR_POSITION;
392    case SYSTEM_VALUE_POINT_COORD:   return NAK_ATTR_POINT_SPRITE;
393    case SYSTEM_VALUE_TESS_COORD:    return NAK_ATTR_TESS_COORD;
394    case SYSTEM_VALUE_INSTANCE_ID:   return NAK_ATTR_INSTANCE_ID;
395    case SYSTEM_VALUE_VERTEX_ID:     return NAK_ATTR_VERTEX_ID;
396    case SYSTEM_VALUE_FRONT_FACE:    return NAK_ATTR_FRONT_FACE;
397    case SYSTEM_VALUE_LAYER_ID:      return NAK_ATTR_RT_ARRAY_INDEX;
398    default: unreachable("Invalid system value");
399    }
400 }
401 
402 static uint8_t
403 nak_sysval_sysval_idx(gl_system_value sysval)
404 {
405    switch (sysval) {
406    case SYSTEM_VALUE_SUBGROUP_INVOCATION:    return NAK_SV_LANE_ID;
407    case SYSTEM_VALUE_VERTICES_IN:            return NAK_SV_PRIM_TYPE;
408    case SYSTEM_VALUE_INVOCATION_ID:          return NAK_SV_INVOCATION_ID;
409    case SYSTEM_VALUE_HELPER_INVOCATION:      return NAK_SV_THREAD_KILL;
410    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:    return NAK_SV_TID;
411    case SYSTEM_VALUE_WORKGROUP_ID:           return NAK_SV_CTAID;
412    case SYSTEM_VALUE_SUBGROUP_EQ_MASK:       return NAK_SV_LANEMASK_EQ;
413    case SYSTEM_VALUE_SUBGROUP_LT_MASK:       return NAK_SV_LANEMASK_LT;
414    case SYSTEM_VALUE_SUBGROUP_LE_MASK:       return NAK_SV_LANEMASK_LE;
415    case SYSTEM_VALUE_SUBGROUP_GT_MASK:       return NAK_SV_LANEMASK_GT;
416    case SYSTEM_VALUE_SUBGROUP_GE_MASK:       return NAK_SV_LANEMASK_GE;
417    default: unreachable("Invalid system value");
418    }
419 }
420 
421 static bool
422 nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
423                                   void *data)
424 {
425    const struct nak_compiler *nak = data;
426 
427    b->cursor = nir_before_instr(&intrin->instr);
428 
429    nir_def *val;
430    switch (intrin->intrinsic) {
431    case nir_intrinsic_load_primitive_id:
432    case nir_intrinsic_load_instance_id:
433    case nir_intrinsic_load_vertex_id: {
434       assert(b->shader->info.stage == MESA_SHADER_VERTEX ||
435              b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
436              b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
437              b->shader->info.stage == MESA_SHADER_GEOMETRY);
438       const gl_system_value sysval =
439          nir_system_value_from_intrinsic(intrin->intrinsic);
440       const uint32_t addr = nak_sysval_attr_addr(nak, sysval);
441       val = nir_ald_nv(b, 1, nir_imm_int(b, 0), nir_imm_int(b, 0),
442                        .base = addr, .flags = 0,
443                        .range_base = addr, .range = 4,
444                        .access = ACCESS_CAN_REORDER);
445       break;
446    }
447 
448    case nir_intrinsic_load_patch_vertices_in: {
449       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_PRIM_TYPE,
450                                .access = ACCESS_CAN_REORDER);
451       val = nir_extract_u8(b, val, nir_imm_int(b, 1));
452       break;
453    }
454 
455    case nir_intrinsic_load_frag_shading_rate: {
456       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VARIABLE_RATE,
457                                .access = ACCESS_CAN_REORDER);
458 
459       /* X is in bits 8..16 and Y is in bits 16..24.  However, we actually
460        * want the log2 of X and Y and, since we only support 1, 2, and 4, a
461        * right shift by 1 is log2.  So this gives us
462        *
463        * x_log2 = (sv >> 9) & 3
464        * y_log2 = (sv >> 17) & 3
465        *
466        * However, we actually want y_log2 at bits 0..2 and x_log2 at bits 2..4,
467        * which gives us x = (sv >> 7) & 0xc and y = (sv >> 17) & 0x3, ORed below.
468        */
469       nir_def *x = nir_iand_imm(b, nir_ushr_imm(b, val, 7), 0xc);
470       nir_def *y = nir_iand_imm(b, nir_ushr_imm(b, val, 17), 0x3);
471       val = nir_ior(b, x, y);
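      /* Worked example of the shifts above: a 4x2 shading rate stores 4 in
       * bits 8..16 and 2 in bits 16..24, so
       *
       *    x = (sv >> 7) & 0xc = log2(4) << 2 = 0x8
       *    y = (sv >> 17) & 0x3 = log2(2)     = 0x1
       *
       * and val = 0x9, i.e. x_log2 in bits 2..4 and y_log2 in bits 0..2.
       */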
472       break;
473    }
474 
475    case nir_intrinsic_load_subgroup_eq_mask:
476    case nir_intrinsic_load_subgroup_lt_mask:
477    case nir_intrinsic_load_subgroup_le_mask:
478    case nir_intrinsic_load_subgroup_gt_mask:
479    case nir_intrinsic_load_subgroup_ge_mask: {
480       const gl_system_value sysval =
481          nir_system_value_from_intrinsic(intrin->intrinsic);
482       const uint32_t idx = nak_sysval_sysval_idx(sysval);
483       val = nir_load_sysval_nv(b, 32, .base = idx,
484                                .access = ACCESS_CAN_REORDER);
485 
486       /* Pad with 0 because all invocations above 31 are off */
487       if (intrin->def.bit_size == 64) {
488          val = nir_u2u32(b, val);
489       } else {
490          assert(intrin->def.bit_size == 32);
491          val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
492       }
493       break;
494    }
495 
496    case nir_intrinsic_load_subgroup_invocation:
497    case nir_intrinsic_load_helper_invocation:
498    case nir_intrinsic_load_invocation_id:
499    case nir_intrinsic_load_workgroup_id: {
500       const gl_system_value sysval =
501          nir_system_value_from_intrinsic(intrin->intrinsic);
502       const uint32_t idx = nak_sysval_sysval_idx(sysval);
503       nir_def *comps[3];
504       assert(intrin->def.num_components <= 3);
505       for (unsigned c = 0; c < intrin->def.num_components; c++) {
506          comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
507                                        .access = ACCESS_CAN_REORDER);
508       }
509       val = nir_vec(b, comps, intrin->def.num_components);
510       break;
511    }
512 
513    case nir_intrinsic_load_local_invocation_id: {
514       nir_def *x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
515                                       .access = ACCESS_CAN_REORDER);
516       nir_def *y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
517                                       .access = ACCESS_CAN_REORDER);
518       nir_def *z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
519                                       .access = ACCESS_CAN_REORDER);
520 
521       if (b->shader->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
522          nir_def *x_lo = nir_iand_imm(b, x, 0x1);
523          nir_def *y_lo = nir_ushr_imm(b, nir_iand_imm(b, x, 0x2), 1);
524          nir_def *x_hi = nir_ushr_imm(b, nir_iand_imm(b, x, ~0x3), 1);
525          nir_def *y_hi = nir_ishl_imm(b, y, 1);
526 
527          x = nir_ior(b, x_lo, x_hi);
528          y = nir_ior(b, y_lo, y_hi);
529       }
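      /* Sketch of the remap above: with DERIVATIVE_GROUP_QUADS, the workgroup
       * was re-shaped in nak_postprocess_nir() (X doubled, Y halved) so each
       * 2x2 quad occupies consecutive invocations.  Here we reconstruct the
       * API invocation ID, e.g. HW (x=6, y=0) in an 8x2 HW workgroup maps
       * back to API (x=2, y=1) in the original 4x4 workgroup.
       */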
530 
531       val = nir_vec3(b, x, y, z);
532       break;
533    }
534 
535    case nir_intrinsic_load_num_subgroups: {
536       assert(!b->shader->info.workgroup_size_variable);
537       uint16_t wg_size = b->shader->info.workgroup_size[0] *
538                          b->shader->info.workgroup_size[1] *
539                          b->shader->info.workgroup_size[2];
540       val = nir_imm_int(b, DIV_ROUND_UP(wg_size, 32));
541       break;
542    }
543 
544    case nir_intrinsic_load_subgroup_id:
545       if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
546          val = nir_imm_int(b, 0);
547       } else {
548          assert(!b->shader->info.workgroup_size_variable);
549          nir_def *tid_x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
550                                              .access = ACCESS_CAN_REORDER);
551          nir_def *tid_y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
552                                              .access = ACCESS_CAN_REORDER);
553          nir_def *tid_z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
554                                              .access = ACCESS_CAN_REORDER);
555 
556          const uint16_t *wg_size = b->shader->info.workgroup_size;
557          nir_def *tid =
558             nir_iadd(b, tid_x,
559             nir_iadd(b, nir_imul_imm(b, tid_y, wg_size[0]),
560                         nir_imul_imm(b, tid_z, wg_size[0] * wg_size[1])));
561 
562          val = nir_udiv_imm(b, tid, 32);
563       }
564       break;
565 
566    case nir_intrinsic_is_helper_invocation: {
567       /* Unlike load_helper_invocation, this one isn't re-orderable */
568       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
569       break;
570    }
571 
572    case nir_intrinsic_shader_clock: {
573       /* The CS2R opcode can load 64 bits worth of sysval data at a time but
574        * it's not actually atomic.  In order to get correct shader clocks, we
575        * need to do a loop where we do
576        *
577        *    CS2R SV_CLOCK_HI
578        *    CS2R SV_CLOCK_LO
579        *    CS2R SV_CLOCK_HI
580        *    CS2R SV_CLOCK_LO
581        *    CS2R SV_CLOCK_HI
582        *    ...
583        *
584        * The moment two high values are the same, we take the low value
585        * between them and that gives us our clock.
586        *
587        * In order to make sure we don't run into any weird races, we also need
588        * to insert a barrier after every load to ensure one load completes
589        * before we kick off the next load.  That way, even if one load happens to
590        * be faster than the other (they are variable latency, after all) we're
591        * still guaranteed that the loads happen in the order we want.
592        */
593       nir_variable *clock =
594          nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
595 
596       nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
597       nir_ssa_bar_nv(b, clock_hi);
598 
599       nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
600 
601       nir_push_loop(b);
602       {
603          nir_def *last_clock = nir_load_var(b, clock);
604 
605          nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
606          nir_ssa_bar_nv(b, clock_lo);
607 
608          clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
609          nir_ssa_bar_nv(b, clock_hi);
610 
611          nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
612 
613          nir_break_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
614       }
615       nir_pop_loop(b, NULL);
616 
617       val = nir_load_var(b, clock);
618       if (intrin->def.bit_size == 64)
619          val = nir_pack_64_2x32(b, val);
620       break;
621    }
622 
623    case nir_intrinsic_load_warps_per_sm_nv:
624       val = nir_imm_int(b, nak->warps_per_sm);
625       break;
626 
627    case nir_intrinsic_load_sm_count_nv:
628       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
629       val = nir_ubitfield_extract_imm(b, val, 20, 9);
630       break;
631 
632    case nir_intrinsic_load_warp_id_nv:
633       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
634       val = nir_ubitfield_extract_imm(b, val, 8, 7);
635       break;
636 
637    case nir_intrinsic_load_sm_id_nv:
638       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
639       val = nir_ubitfield_extract_imm(b, val, 20, 9);
640       break;
641 
642    default:
643       return false;
644    }
645 
646    if (intrin->def.bit_size == 1)
647       val = nir_i2b(b, val);
648 
649    nir_def_rewrite_uses(&intrin->def, val);
650 
651    return true;
652 }
653 
654 static bool
655 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
656 {
657    return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
658                                      nir_metadata_none,
659                                      (void *)nak);
660 }
661 
662 struct nak_xfb_info
663 nak_xfb_from_nir(const struct nak_compiler *nak,
664                  const struct nir_xfb_info *nir_xfb)
665 {
666    if (nir_xfb == NULL)
667       return (struct nak_xfb_info) { };
668 
669    struct nak_xfb_info nak_xfb = { };
670 
671    u_foreach_bit(b, nir_xfb->buffers_written) {
672       nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
673       nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
674    }
675    memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */
676 
677    for (unsigned o = 0; o < nir_xfb->output_count; o++) {
678       const nir_xfb_output_info *out = &nir_xfb->outputs[o];
679       const uint8_t b = out->buffer;
680       assert(nir_xfb->buffers_written & BITFIELD_BIT(b));
681 
682       const uint16_t attr_addr = nak_varying_attr_addr(nak, out->location);
683       assert(attr_addr % 4 == 0);
684       const uint16_t attr_idx = attr_addr / 4;
685 
686       assert(out->offset % 4 == 0);
687       uint8_t out_idx = out->offset / 4;
688 
689       u_foreach_bit(c, out->component_mask)
690          nak_xfb.attr_index[b][out_idx++] = attr_idx + c;
691 
692       nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
693    }
694 
695    return nak_xfb;
696 }
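/* Example of the packing above: an output captured at buffer 0, offset 16,
 * with component_mask 0xf and attribute address A (from nak_varying_attr_addr)
 * fills attr_index[0][4..7] with A/4 .. A/4 + 3 and raises attr_count[0] to 8;
 * entries left at 0xff mean "skip".
 */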
697 
698 static bool
699 lower_fs_output_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *_data)
700 {
701    if (intrin->intrinsic != nir_intrinsic_store_output)
702       return false;
703 
704    b->cursor = nir_before_instr(&intrin->instr);
705 
706    const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
707    uint16_t addr = nak_fs_out_addr(sem.location, sem.dual_source_blend_index) +
708                    nir_src_as_uint(intrin->src[1]) * 16 +
709                    nir_intrinsic_component(intrin) * 4;
710 
711    nir_def *data = intrin->src[0].ssa;
712 
713    /* The fs_out_nv intrinsic is always scalar */
714    u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
715       if (nir_scalar_is_undef(nir_scalar_resolved(data, c)))
716          continue;
717 
718       nir_fs_out_nv(b, nir_channel(b, data, c), .base = addr + c * 4);
719    }
720 
721    nir_instr_remove(&intrin->instr);
722 
723    return true;
724 }
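/* Example of the addressing above: a store_output to FRAG_RESULT_DATA2 with
 * dual_source_blend_index 0, a constant offset src[1] of 0, component 1, and
 * write_mask 0x1 becomes a single fs_out_nv at NAK_FS_OUT_COLOR(2) + 4,
 * i.e. the second dword (the .y channel) of color target 2.
 */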
725 
726 static bool
727 nak_nir_lower_fs_outputs(nir_shader *nir)
728 {
729    if (nir->info.outputs_written == 0)
730       return false;
731 
732    bool progress = nir_shader_intrinsics_pass(nir, lower_fs_output_intrin,
733                                               nir_metadata_control_flow,
734                                               NULL);
735 
736    if (progress) {
737       /* We need a copy_fs_outputs_nv intrinsic so NAK knows where to place
738        * the final copy.  This needs to be in the last block, after all
739        * store_output intrinsics.
740        */
741       nir_function_impl *impl = nir_shader_get_entrypoint(nir);
742       nir_builder b = nir_builder_at(nir_after_impl(impl));
743       nir_copy_fs_outputs_nv(&b);
744    }
745 
746    return progress;
747 }
748 
749 static bool
750 nak_nir_remove_barrier_intrin(nir_builder *b, nir_intrinsic_instr *barrier,
751                               UNUSED void *_data)
752 {
753    if (barrier->intrinsic != nir_intrinsic_barrier)
754       return false;
755 
756    mesa_scope exec_scope = nir_intrinsic_execution_scope(barrier);
757    assert(exec_scope <= SCOPE_WORKGROUP &&
758           "Control barrier with scope > WORKGROUP");
759 
760    if (exec_scope == SCOPE_WORKGROUP &&
761        nak_nir_workgroup_has_one_subgroup(b->shader))
762       exec_scope = SCOPE_SUBGROUP;
763 
764    /* Because we're guaranteeing maximal convergence via warp barriers,
765     * subgroup barriers do nothing.
766     */
767    if (exec_scope <= SCOPE_SUBGROUP)
768       exec_scope = SCOPE_NONE;
769 
770    const nir_variable_mode mem_modes = nir_intrinsic_memory_modes(barrier);
771    if (exec_scope == SCOPE_NONE && mem_modes == 0) {
772       nir_instr_remove(&barrier->instr);
773       return true;
774    }
775 
776    /* In this case, we're leaving the barrier there */
777    b->shader->info.uses_control_barrier = true;
778 
779    bool progress = false;
780    if (exec_scope != nir_intrinsic_execution_scope(barrier)) {
781       nir_intrinsic_set_execution_scope(barrier, exec_scope);
782       progress = true;
783    }
784 
785    return progress;
786 }
787 
788 static bool
789 nak_nir_remove_barriers(nir_shader *nir)
790 {
791    /* We'll set this back to true if we leave any barriers in place */
792    nir->info.uses_control_barrier = false;
793 
794    return nir_shader_intrinsics_pass(nir, nak_nir_remove_barrier_intrin,
795                                      nir_metadata_control_flow,
796                                      NULL);
797 }
798 
799 static bool
800 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
801                      unsigned bit_size, unsigned num_components,
802                      int64_t hole_size, nir_intrinsic_instr *low,
803                      nir_intrinsic_instr *high, void *cb_data)
804 {
805    /*
806     * Since we legalize these later with nir_lower_mem_access_bit_sizes,
807     * we can optimistically combine anything that might be profitable
808     */
809    assert(util_is_power_of_two_nonzero(align_mul));
810 
811    if (hole_size > 0)
812       return false;
813 
814    unsigned max_bytes = 128u / 8u;
815    if (low->intrinsic == nir_intrinsic_ldc_nv ||
816        low->intrinsic == nir_intrinsic_ldcx_nv)
817       max_bytes = 64u / 8u;
818 
819    align_mul = MIN2(align_mul, max_bytes);
820    align_offset = align_offset % align_mul;
821    return align_offset + num_components * (bit_size / 8) <= align_mul;
822 }
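/* Example of the check above: two adjacent 32-bit SSBO loads with
 * align_mul = 16 and align_offset = 4 combine into a 2-component access
 * needing 4 + 2 * 4 = 12 bytes, which still fits the 16-byte block, so they
 * may be vectorized; ldc_nv/ldcx_nv are capped at 8 bytes instead of 16.
 */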
823 
824 static nir_mem_access_size_align
825 nak_mem_access_size_align(nir_intrinsic_op intrin,
826                           uint8_t bytes, uint8_t bit_size,
827                           uint32_t align_mul, uint32_t align_offset,
828                           bool offset_is_const, enum gl_access_qualifier access,
829                           const void *cb_data)
830 {
831    const uint32_t align = nir_combined_align(align_mul, align_offset);
832    assert(util_is_power_of_two_nonzero(align));
833 
834    unsigned bytes_pow2;
835    if (nir_intrinsic_infos[intrin].has_dest) {
836       /* Reads can over-fetch a bit if the alignment is okay. */
837       bytes_pow2 = util_next_power_of_two(bytes);
838    } else {
839       bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
840    }
841 
842    unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
843    assert(util_is_power_of_two_nonzero(chunk_bytes));
844    if (intrin == nir_intrinsic_ldc_nv ||
845        intrin == nir_intrinsic_ldcx_nv)
846       chunk_bytes = MIN2(chunk_bytes, 8);
847 
848    if ((intrin == nir_intrinsic_ldc_nv ||
849         intrin == nir_intrinsic_ldcx_nv) && align < 4) {
850       /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
851        * In particular, this applies to ldc.u16 which means we either have to
852        * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
853        * the 16bit value out.  Fortunately, nir_lower_mem_access_bit_sizes()
854        * can handle over-alignment for reads.
855        */
856       if (align == 2 || offset_is_const) {
857          return (nir_mem_access_size_align) {
858             .bit_size = 32,
859             .num_components = 1,
860             .align = 4,
861             .shift = nir_mem_access_shift_method_scalar,
862          };
863       } else {
864          assert(align == 1);
865          return (nir_mem_access_size_align) {
866             .bit_size = 8,
867             .num_components = 1,
868             .align = 1,
869             .shift = nir_mem_access_shift_method_scalar,
870          };
871       }
872    } else if (chunk_bytes < 4) {
873       return (nir_mem_access_size_align) {
874          .bit_size = chunk_bytes * 8,
875          .num_components = 1,
876          .align = chunk_bytes,
877          .shift = nir_mem_access_shift_method_scalar,
878       };
879    } else {
880       return (nir_mem_access_size_align) {
881          .bit_size = 32,
882          .num_components = chunk_bytes / 4,
883          .align = chunk_bytes,
884          .shift = nir_mem_access_shift_method_scalar,
885       };
886    }
887 }
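/* Example: a 12-byte SSBO load with align_mul = 4 and align_offset = 0 gives
 * bytes_pow2 = 16 (reads may over-fetch), chunk_bytes = MIN3(16, 4, 16) = 4,
 * so it gets split into single-component 32-bit accesses; the same load at
 * 16-byte alignment is reported as one 32-bit x 4 access instead.
 */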
888 
889 static bool
890 nir_shader_has_local_variables(const nir_shader *nir)
891 {
892    nir_foreach_function(func, nir) {
893       if (func->impl && !exec_list_is_empty(&func->impl->locals))
894          return true;
895    }
896 
897    return false;
898 }
899 
900 static int
901 type_size_vec4(const struct glsl_type *type, bool bindless)
902 {
903    return glsl_count_vec4_slots(type, false, bindless);
904 }
905 
906 void
907 nak_postprocess_nir(nir_shader *nir,
908                     const struct nak_compiler *nak,
909                     nir_variable_mode robust2_modes,
910                     const struct nak_fs_key *fs_key)
911 {
912    UNUSED bool progress = false;
913 
914    nak_optimize_nir(nir, nak);
915 
916    const nir_lower_subgroups_options subgroups_options = {
917       .subgroup_size = NAK_SUBGROUP_SIZE,
918       .ballot_bit_size = 32,
919       .ballot_components = 1,
920       .lower_to_scalar = true,
921       .lower_vote_eq = true,
922       .lower_first_invocation_to_ballot = true,
923       .lower_read_first_invocation = true,
924       .lower_elect = true,
925       .lower_quad_vote = true,
926       .lower_inverse_ballot = true,
927       .lower_rotate_to_shuffle = true
928    };
929    OPT(nir, nir_lower_subgroups, &subgroups_options);
930    OPT(nir, nak_nir_lower_scan_reduce);
931 
932    if (nir_shader_has_local_variables(nir)) {
933       OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
934           glsl_get_natural_size_align_bytes);
935       OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
936           nir_address_format_32bit_offset);
937       nak_optimize_nir(nir, nak);
938    }
939 
940    OPT(nir, nir_opt_shrink_vectors, true);
941 
942    nir_load_store_vectorize_options vectorize_opts = {};
943    vectorize_opts.modes = nir_var_mem_global |
944                           nir_var_mem_ssbo |
945                           nir_var_mem_shared |
946                           nir_var_shader_temp;
947    vectorize_opts.callback = nak_mem_vectorize_cb;
948    vectorize_opts.robust_modes = robust2_modes;
949    OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
950 
951    nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
952       .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
953       .callback = nak_mem_access_size_align,
954    };
955    OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
956    OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
957 
958    OPT(nir, nir_opt_combine_barriers, NULL, NULL);
959 
960    nak_optimize_nir(nir, nak);
961 
962    OPT(nir, nak_nir_lower_tex, nak);
963    OPT(nir, nir_lower_idiv, NULL);
964 
965    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
966 
967    OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);
968 
969    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
970       OPT(nir, nir_lower_tess_coord_z,
971           nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
972    }
973 
974    /* We need to do this before nak_nir_lower_system_values() because it
975     * relies on the workgroup size being the actual HW workgroup size in
976     * nir_intrinsic_load_subgroup_id.
977     */
978    if (gl_shader_stage_uses_workgroup(nir->info.stage) &&
979        nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
980       assert(nir->info.workgroup_size[0] % 2 == 0);
981       assert(nir->info.workgroup_size[1] % 2 == 0);
982       nir->info.workgroup_size[0] *= 2;
983       nir->info.workgroup_size[1] /= 2;
984    }
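   /* For example, a 4x4x1 API workgroup with quad derivatives becomes an
    * 8x2x1 HW workgroup here; nak_nir_lower_system_values() later undoes the
    * swizzle when lowering load_local_invocation_id so the shader still sees
    * 4x4x1 invocation IDs.
    */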
985 
986    OPT(nir, nak_nir_lower_system_values, nak);
987 
988    switch (nir->info.stage) {
989    case MESA_SHADER_VERTEX:
990    case MESA_SHADER_TESS_CTRL:
991    case MESA_SHADER_TESS_EVAL:
992    case MESA_SHADER_GEOMETRY:
993       OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
994           type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
995       OPT(nir, nir_opt_constant_folding);
996       OPT(nir, nak_nir_lower_vtg_io, nak);
997       if (nir->info.stage == MESA_SHADER_GEOMETRY)
998          OPT(nir, nak_nir_lower_gs_intrinsics);
999       break;
1000 
1001    case MESA_SHADER_FRAGMENT:
1002       OPT(nir, nir_lower_indirect_derefs,
1003           nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
1004       OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1005           type_size_vec4, nir_lower_io_lower_64bit_to_32_new |
1006           nir_lower_io_use_interpolated_input_intrinsics);
1007       OPT(nir, nir_opt_constant_folding);
1008       OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
1009       OPT(nir, nak_nir_lower_fs_outputs);
1010       break;
1011 
1012    case MESA_SHADER_COMPUTE:
1013    case MESA_SHADER_KERNEL:
1014       break;
1015 
1016    default:
1017       unreachable("Unsupported shader stage");
1018    }
1019 
1020    OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
1021    OPT(nir, nir_lower_int64);
1022 
1023    nak_optimize_nir(nir, nak);
1024 
1025    do {
1026       progress = false;
1027       OPT(nir, nir_opt_algebraic_late);
1028       OPT(nir, nak_nir_lower_algebraic_late, nak);
1029 
1030       /* If we're lowering fp64 sat but not min/max, the sat lowering may have
1031        * been undone by nir_opt_algebraic.  Lower sat again just to be sure.
1032        */
1033       if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
1034           !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
1035          OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);
1036 
1037       if (progress) {
1038          OPT(nir, nir_opt_constant_folding);
1039          OPT(nir, nir_copy_prop);
1040          OPT(nir, nir_opt_dce);
1041          OPT(nir, nir_opt_cse);
1042       }
1043    } while (progress);
1044 
1045    if (nak->sm < 70)
1046       OPT(nir, nak_nir_split_64bit_conversions);
1047 
1048    bool lcssa_progress = nir_convert_to_lcssa(nir, false, false);
1049    nir_divergence_analysis(nir);
1050 
1051    if (nak->sm >= 75) {
1052       if (lcssa_progress) {
1053          OPT(nir, nak_nir_mark_lcssa_invariants);
1054       }
1055       if (OPT(nir, nak_nir_lower_non_uniform_ldcx)) {
1056          OPT(nir, nir_copy_prop);
1057          OPT(nir, nir_opt_dce);
1058          nir_divergence_analysis(nir);
1059       }
1060    }
1061 
1062    OPT(nir, nak_nir_remove_barriers);
1063 
1064    if (nak->sm >= 70) {
1065       if (nak_should_print_nir()) {
1066          fprintf(stderr, "Structured NIR for %s shader:\n",
1067                  _mesa_shader_stage_to_string(nir->info.stage));
1068          nir_print_shader(nir, stderr);
1069       }
1070       OPT(nir, nak_nir_lower_cf);
1071    }
1072 
1073    /* Re-index blocks and compact SSA defs because we'll use them to index
1074     * arrays
1075     */
1076    nir_foreach_function(func, nir) {
1077       if (func->impl) {
1078          nir_index_blocks(func->impl);
1079          nir_index_ssa_defs(func->impl);
1080       }
1081    }
1082 
1083    if (nak_should_print_nir()) {
1084       fprintf(stderr, "NIR for %s shader:\n",
1085               _mesa_shader_stage_to_string(nir->info.stage));
1086       nir_print_shader(nir, stderr);
1087    }
1088 }
1089 
1090 static bool
1091 scalar_is_imm_int(nir_scalar x, unsigned bits)
1092 {
1093    if (!nir_scalar_is_const(x))
1094       return false;
1095 
1096    int64_t imm = nir_scalar_as_int(x);
1097    return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
1098 }
1099 
1100 struct nak_io_addr_offset
1101 nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
1102 {
1103    nir_scalar addr_s = {
1104       .def = addr,
1105       .comp = 0,
1106    };
1107    if (scalar_is_imm_int(addr_s, imm_bits)) {
1108       /* The whole address is a small constant, so it all goes in .offset */
1109       return (struct nak_io_addr_offset) {
1110          .offset = nir_scalar_as_int(addr_s),
1111       };
1112    }
1113 
1114    addr_s = nir_scalar_chase_movs(addr_s);
1115    if (!nir_scalar_is_alu(addr_s) ||
1116        nir_scalar_alu_op(addr_s) != nir_op_iadd) {
1117       return (struct nak_io_addr_offset) {
1118          .base = addr_s,
1119       };
1120    }
1121 
1122    for (unsigned i = 0; i < 2; i++) {
1123       nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
1124       off_s = nir_scalar_chase_movs(off_s);
1125       if (scalar_is_imm_int(off_s, imm_bits)) {
1126          return (struct nak_io_addr_offset) {
1127             .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
1128             .offset = nir_scalar_as_int(off_s),
1129          };
1130       }
1131    }
1132 
1133    return (struct nak_io_addr_offset) {
1134       .base = addr_s,
1135    };
1136 }
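/* Example of the splitting above: if addr is iadd(ssa_7, 12) (ssa_7 being any
 * non-constant def) and 12 fits in imm_bits, the result is
 * { .base = ssa_7, .offset = 12 }; a purely constant addr lands entirely in
 * .offset, and anything else comes back as { .base = addr } with offset 0.
 */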
1137