1 /*
2  * Copyright © 2022 Collabora, Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "nak_private.h"
7 #include "nir_builder.h"
8 #include "nir_xfb_info.h"
9 
10 #include "util/u_math.h"
11 
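/* OPT() runs a NIR pass, folds its result into the enclosing `progress`
 * variable, and evaluates to whether that single pass made progress, so it
 * can be used as a condition, e.g. if (OPT(nir, nir_opt_memcpy)).  OPT_V()
 * is for passes whose progress we don't track.
 */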
12 #define OPT(nir, pass, ...) ({                           \
13    bool this_progress = false;                           \
14    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);    \
15    if (this_progress)                                    \
16       progress = true;                                   \
17    this_progress;                                        \
18 })
19 
20 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
21 
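/* Returns true if the whole workgroup is guaranteed to fit in a single
 * 32-invocation subgroup, in which case subgroup_id is always 0 and
 * num_subgroups is always 1.
 */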
22 bool
23 nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
24 {
25    switch (nir->info.stage) {
26    case MESA_SHADER_VERTEX:
27    case MESA_SHADER_TESS_EVAL:
28    case MESA_SHADER_GEOMETRY:
29    case MESA_SHADER_FRAGMENT:
30       unreachable("Shader stage does not have workgroups");
31       break;
32 
33    case MESA_SHADER_TESS_CTRL:
34       /* Tessellation only ever has one subgroup per workgroup.  The Vulkan
35        * limit on the number of tessellation invocations is 32 to allow for
36        * this.
37        */
38       return true;
39 
40    case MESA_SHADER_COMPUTE:
41    case MESA_SHADER_KERNEL: {
42       if (nir->info.workgroup_size_variable)
43          return false;
44 
45       uint16_t wg_sz = nir->info.workgroup_size[0] *
46                        nir->info.workgroup_size[1] *
47                        nir->info.workgroup_size[2];
48 
49       return wg_sz <= 32;
50    }
51 
52    default:
53       unreachable("Unknown shader stage");
54    }
55 }
56 
57 static void
58 optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
59 {
60    bool progress;
61 
62    unsigned lower_flrp =
63       (nir->options->lower_flrp16 ? 16 : 0) |
64       (nir->options->lower_flrp32 ? 32 : 0) |
65       (nir->options->lower_flrp64 ? 64 : 0);
66 
67    do {
68       progress = false;
69 
70       /* This pass is causing problems with types used by OpenCL :
71        *    https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
72        *
73        * Running with it disabled made no difference in the resulting assembly
74        * code.
75        */
76       if (nir->info.stage != MESA_SHADER_KERNEL)
77          OPT(nir, nir_split_array_vars, nir_var_function_temp);
78 
79       OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
80       OPT(nir, nir_opt_deref);
81       if (OPT(nir, nir_opt_memcpy))
82          OPT(nir, nir_split_var_copies);
83 
84       OPT(nir, nir_lower_vars_to_ssa);
85 
86       if (allow_copies) {
87          /* Only run this pass in the first call to optimize_nir.  Later
88           * calls assume that we've lowered away any copy_deref instructions
89           * and we don't want to introduce any more.
90           */
91          OPT(nir, nir_opt_find_array_copies);
92       }
93       OPT(nir, nir_opt_copy_prop_vars);
94       OPT(nir, nir_opt_dead_write_vars);
95       OPT(nir, nir_opt_combine_stores, nir_var_all);
96 
97       OPT(nir, nir_lower_alu_to_scalar, NULL, NULL);
98       OPT(nir, nir_lower_phis_to_scalar, false);
99       OPT(nir, nir_lower_frexp);
100       OPT(nir, nir_copy_prop);
101       OPT(nir, nir_opt_dce);
102       OPT(nir, nir_opt_cse);
103 
104       OPT(nir, nir_opt_peephole_select, 0, false, false);
105       OPT(nir, nir_opt_intrinsics);
106       OPT(nir, nir_opt_idiv_const, 32);
107       OPT(nir, nir_opt_algebraic);
108       OPT(nir, nir_lower_constant_convert_alu_types);
109       OPT(nir, nir_opt_constant_folding);
110 
111       if (lower_flrp != 0) {
112          if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
113             OPT(nir, nir_opt_constant_folding);
114          /* Nothing should rematerialize any flrps */
115          lower_flrp = 0;
116       }
117 
118       OPT(nir, nir_opt_dead_cf);
119       if (OPT(nir, nir_opt_loop)) {
120          /* If nir_opt_loop makes progress, then we need to clean things up
121           * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make
122           * progress.
123           */
124          OPT(nir, nir_copy_prop);
125          OPT(nir, nir_opt_dce);
126       }
127       OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
128       OPT(nir, nir_opt_conditional_discard);
129       if (nir->options->max_unroll_iterations != 0) {
130          OPT(nir, nir_opt_loop_unroll);
131       }
132       OPT(nir, nir_opt_remove_phis);
133       OPT(nir, nir_opt_gcm, false);
134       OPT(nir, nir_opt_undef);
135       OPT(nir, nir_lower_pack);
136    } while (progress);
137 
138    OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
139 }
140 
141 void
142 nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
143 {
144    optimize_nir(nir, nak, false);
145 }
146 
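/* Callback for nir_lower_bit_size: returns 0 to leave an instruction alone
 * or the bit size to promote it to.  NAK promotes 8- and 16-bit ALU ops,
 * small phis, and sub-32-bit subgroup operations to 32 bits.
 */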
147 static unsigned
148 lower_bit_size_cb(const nir_instr *instr, void *_data)
149 {
150    switch (instr->type) {
151    case nir_instr_type_alu: {
152       nir_alu_instr *alu = nir_instr_as_alu(instr);
153       if (nir_op_infos[alu->op].is_conversion)
154          return 0;
155 
156       switch (alu->op) {
157       case nir_op_bit_count:
158       case nir_op_ufind_msb:
159       case nir_op_ifind_msb:
160       case nir_op_find_lsb:
161          /* These are handled specially because the destination is always
162           * 32-bit and so the bit size of the instruction is given by the
163           * source.
164           */
165          return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;
166       default:
167          break;
168       }
169 
170       const unsigned bit_size = nir_alu_instr_is_comparison(alu)
171                                 ? alu->src[0].src.ssa->bit_size
172                                 : alu->def.bit_size;
173       if (bit_size >= 32)
174          return 0;
175 
176       /* TODO: Some hardware has native 16-bit support */
177       if (bit_size & (8 | 16))
178          return 32;
179 
180       return 0;
181    }
182 
183    case nir_instr_type_intrinsic: {
184       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
185       switch (intrin->intrinsic) {
186       case nir_intrinsic_vote_ieq:
187          if (intrin->src[0].ssa->bit_size != 1 &&
188              intrin->src[0].ssa->bit_size < 32)
189             return 32;
190          return 0;
191 
192       case nir_intrinsic_vote_feq:
193       case nir_intrinsic_read_invocation:
194       case nir_intrinsic_read_first_invocation:
195       case nir_intrinsic_shuffle:
196       case nir_intrinsic_shuffle_xor:
197       case nir_intrinsic_shuffle_up:
198       case nir_intrinsic_shuffle_down:
199       case nir_intrinsic_quad_broadcast:
200       case nir_intrinsic_quad_swap_horizontal:
201       case nir_intrinsic_quad_swap_vertical:
202       case nir_intrinsic_quad_swap_diagonal:
203       case nir_intrinsic_reduce:
204       case nir_intrinsic_inclusive_scan:
205       case nir_intrinsic_exclusive_scan:
206          if (intrin->src[0].ssa->bit_size < 32)
207             return 32;
208          return 0;
209 
210       default:
211          return 0;
212       }
213    }
214 
215    case nir_instr_type_phi: {
216       nir_phi_instr *phi = nir_instr_as_phi(instr);
217       if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
218          return 32;
219       return 0;
220    }
221 
222    default:
223       return 0;
224    }
225 }
226 
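/* Integer division, rounding up: (n + (d - 1)) / d.  For example, a
 * 48-invocation workgroup with 32-wide subgroups yields (48 + 31) / 32 = 2
 * subgroups.
 */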
227 static nir_def *
228 nir_udiv_round_up(nir_builder *b, nir_def *n, nir_def *d)
229 {
230    return nir_udiv(b, nir_iadd(b, n, nir_iadd_imm(b, d, -1)), d);
231 }
232 
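/* Lowers load_num_subgroups and load_subgroup_id to arithmetic on the
 * workgroup size, local invocation index, and subgroup size, taking the
 * trivial path when the workgroup has only one subgroup.
 */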
233 static bool
234 nak_nir_lower_subgroup_id_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
235                                  void *data)
236 {
237    switch (intrin->intrinsic) {
238    case nir_intrinsic_load_num_subgroups: {
239       b->cursor = nir_instr_remove(&intrin->instr);
240 
241       nir_def *num_subgroups;
242       if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
243          num_subgroups = nir_imm_int(b, 1);
244       } else {
245          assert(b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_NONE);
246 
247          nir_def *workgroup_size = nir_load_workgroup_size(b);
248          workgroup_size =
249             nir_imul(b, nir_imul(b, nir_channel(b, workgroup_size, 0),
250                                     nir_channel(b, workgroup_size, 1)),
251                         nir_channel(b, workgroup_size, 2));
252          nir_def *subgroup_size = nir_load_subgroup_size(b);
253          num_subgroups = nir_udiv_round_up(b, workgroup_size, subgroup_size);
254       }
255       nir_def_rewrite_uses(&intrin->def, num_subgroups);
256 
257       return true;
258    }
259    case nir_intrinsic_load_subgroup_id: {
260       b->cursor = nir_instr_remove(&intrin->instr);
261 
262       nir_def *subgroup_id;
263       if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
264          subgroup_id = nir_imm_int(b, 0);
265       } else {
266          assert(b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_NONE);
267 
268          nir_def *invocation_index = nir_load_local_invocation_index(b);
269          nir_def *subgroup_size = nir_load_subgroup_size(b);
270          subgroup_id = nir_udiv(b, invocation_index, subgroup_size);
271       }
272       nir_def_rewrite_uses(&intrin->def, subgroup_id);
273 
274       return true;
275    }
276    default:
277       return false;
278    }
279 }
280 
281 static bool
282 nak_nir_lower_subgroup_id(nir_shader *nir)
283 {
284    return nir_shader_intrinsics_pass(nir, nak_nir_lower_subgroup_id_intrin,
285                                      nir_metadata_block_index |
286                                      nir_metadata_dominance,
287                                      NULL);
288 }
289 
290 void
291 nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
292 {
293    UNUSED bool progress = false;
294 
295    nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");
296 
297    const nir_lower_tex_options tex_options = {
298       .lower_txd_3d = true,
299       .lower_txd_cube_map = true,
300       .lower_txd_clamp = true,
301       .lower_txd_shadow = true,
302       .lower_txp = ~0,
303       /* TODO: More lowering */
304    };
305    OPT(nir, nir_lower_tex, &tex_options);
306    OPT(nir, nir_normalize_cubemap_coords);
307 
308    nir_lower_image_options image_options = {
309       .lower_cube_size = true,
310    };
311    OPT(nir, nir_lower_image, &image_options);
312 
313    OPT(nir, nir_lower_global_vars_to_local);
314 
315    OPT(nir, nir_split_var_copies);
316    OPT(nir, nir_split_struct_vars, nir_var_function_temp);
317 
318    /* Optimize but allow copies because we haven't lowered them yet */
319    optimize_nir(nir, nak, true /* allow_copies */);
320 
321    OPT(nir, nir_lower_load_const_to_scalar);
322    OPT(nir, nir_lower_var_copies);
323    OPT(nir, nir_lower_system_values);
324    OPT(nir, nak_nir_lower_subgroup_id);
325    OPT(nir, nir_lower_compute_system_values, NULL);
326 }
327 
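/* Each vertex attribute occupies one 16-byte (vec4) slot in the attribute
 * space, so generic attribute N lives at NAK_ATTR_GENERIC_START + N * 0x10.
 */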
328 static uint16_t
329 nak_attribute_attr_addr(gl_vert_attrib attrib)
330 {
331    assert(attrib >= VERT_ATTRIB_GENERIC0);
332    return NAK_ATTR_GENERIC_START + (attrib - VERT_ATTRIB_GENERIC0) * 0x10;
333 }
334 
335 static int
336 type_size_vec4_bytes(const struct glsl_type *type, bool bindless)
337 {
338    return glsl_count_vec4_slots(type, false, bindless) * 16;
339 }
340 
341 static bool
342 nak_nir_lower_vs_inputs(nir_shader *nir)
343 {
344    bool progress = false;
345 
346    nir_foreach_shader_in_variable(var, nir) {
347       var->data.driver_location =
348          nak_attribute_attr_addr(var->data.location);
349    }
350 
351    progress |= OPT(nir, nir_lower_io, nir_var_shader_in, type_size_vec4_bytes,
352                         nir_lower_io_lower_64bit_to_32);
353 
354    return progress;
355 }
356 
357 static uint16_t
358 nak_varying_attr_addr(gl_varying_slot slot)
359 {
360    if (slot >= VARYING_SLOT_PATCH0) {
361       return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
362    } else if (slot >= VARYING_SLOT_VAR0) {
363       return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
364    } else {
365       switch (slot) {
366       case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
367       case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERIOR;
368       case VARYING_SLOT_PRIMITIVE_ID:     return NAK_ATTR_PRIMITIVE_ID;
369       case VARYING_SLOT_LAYER:            return NAK_ATTR_RT_ARRAY_INDEX;
370       case VARYING_SLOT_VIEWPORT:         return NAK_ATTR_VIEWPORT_INDEX;
371       case VARYING_SLOT_PSIZ:             return NAK_ATTR_POINT_SIZE;
372       case VARYING_SLOT_POS:              return NAK_ATTR_POSITION;
373       case VARYING_SLOT_CLIP_DIST0:       return NAK_ATTR_CLIP_CULL_DIST_0;
374       case VARYING_SLOT_CLIP_DIST1:       return NAK_ATTR_CLIP_CULL_DIST_4;
375       default: unreachable("Invalid varying slot");
376       }
377    }
378 }
379 
380 static uint16_t
381 nak_sysval_attr_addr(gl_system_value sysval)
382 {
383    switch (sysval) {
384    case SYSTEM_VALUE_PRIMITIVE_ID:  return NAK_ATTR_PRIMITIVE_ID;
385    case SYSTEM_VALUE_FRAG_COORD:    return NAK_ATTR_POSITION;
386    case SYSTEM_VALUE_POINT_COORD:   return NAK_ATTR_POINT_SPRITE;
387    case SYSTEM_VALUE_TESS_COORD:    return NAK_ATTR_TESS_COORD;
388    case SYSTEM_VALUE_INSTANCE_ID:   return NAK_ATTR_INSTANCE_ID;
389    case SYSTEM_VALUE_VERTEX_ID:     return NAK_ATTR_VERTEX_ID;
390    case SYSTEM_VALUE_FRONT_FACE:    return NAK_ATTR_FRONT_FACE;
391    default: unreachable("Invalid system value");
392    }
393 }
394 
395 static uint8_t
396 nak_sysval_sysval_idx(gl_system_value sysval)
397 {
398    switch (sysval) {
399    case SYSTEM_VALUE_SUBGROUP_INVOCATION:    return NAK_SV_LANE_ID;
400    case SYSTEM_VALUE_VERTICES_IN:            return NAK_SV_VERTEX_COUNT;
401    case SYSTEM_VALUE_INVOCATION_ID:          return NAK_SV_INVOCATION_ID;
402    case SYSTEM_VALUE_HELPER_INVOCATION:      return NAK_SV_THREAD_KILL;
403    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:    return NAK_SV_TID;
404    case SYSTEM_VALUE_WORKGROUP_ID:           return NAK_SV_CTAID;
405    case SYSTEM_VALUE_SUBGROUP_EQ_MASK:       return NAK_SV_LANEMASK_EQ;
406    case SYSTEM_VALUE_SUBGROUP_LT_MASK:       return NAK_SV_LANEMASK_LT;
407    case SYSTEM_VALUE_SUBGROUP_LE_MASK:       return NAK_SV_LANEMASK_LE;
408    case SYSTEM_VALUE_SUBGROUP_GT_MASK:       return NAK_SV_LANEMASK_GT;
409    case SYSTEM_VALUE_SUBGROUP_GE_MASK:       return NAK_SV_LANEMASK_GE;
410    default: unreachable("Invalid system value");
411    }
412 }
413 
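/* Lowers system-value intrinsics to NAK-specific forms: either attribute
 * loads (nir_load_input) or special-register reads (nir_load_sysval_nv).
 */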
414 static bool
415 nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
416                                   void *data)
417 {
418    const struct nak_compiler *nak = data;
419 
420    b->cursor = nir_before_instr(&intrin->instr);
421 
422    nir_def *val;
423    switch (intrin->intrinsic) {
424    case nir_intrinsic_load_layer_id: {
425       const uint32_t addr = nak_varying_attr_addr(VARYING_SLOT_LAYER);
426       val = nir_load_input(b, intrin->def.num_components, 32,
427                            nir_imm_int(b, 0), .base = addr,
428                            .dest_type = nir_type_int32);
429       break;
430    }
431 
432    case nir_intrinsic_load_primitive_id: {
433       assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
434              b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
435              b->shader->info.stage == MESA_SHADER_GEOMETRY);
436       val = nir_load_per_vertex_input(b, 1, 32, nir_imm_int(b, 0),
437                                       nir_imm_int(b, 0),
438                                       .base = NAK_ATTR_PRIMITIVE_ID,
439                                       .dest_type = nir_type_int32);
440       break;
441    }
442 
443    case nir_intrinsic_load_front_face:
444    case nir_intrinsic_load_instance_id:
445    case nir_intrinsic_load_vertex_id: {
446       const gl_system_value sysval =
447          nir_system_value_from_intrinsic(intrin->intrinsic);
448       const uint32_t addr = nak_sysval_attr_addr(sysval);
449       val = nir_load_input(b, intrin->def.num_components, 32,
450                            nir_imm_int(b, 0), .base = addr,
451                            .dest_type = nir_type_int32);
452       break;
453    }
454 
455    case nir_intrinsic_load_patch_vertices_in: {
456       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VERTEX_COUNT,
457                                .access = ACCESS_CAN_REORDER);
458       val = nir_extract_u8(b, val, nir_imm_int(b, 1));
459       break;
460    }
461 
462    case nir_intrinsic_load_subgroup_eq_mask:
463    case nir_intrinsic_load_subgroup_lt_mask:
464    case nir_intrinsic_load_subgroup_le_mask:
465    case nir_intrinsic_load_subgroup_gt_mask:
466    case nir_intrinsic_load_subgroup_ge_mask: {
467       const gl_system_value sysval =
468          nir_system_value_from_intrinsic(intrin->intrinsic);
469       const uint32_t idx = nak_sysval_sysval_idx(sysval);
470       val = nir_load_sysval_nv(b, 32, .base = idx,
471                                .access = ACCESS_CAN_REORDER);
472 
473       /* Pad with 0 because all invocations above 31 are off */
474       if (intrin->def.bit_size == 64) {
475          val = nir_u2u64(b, val);
476       } else {
477          assert(intrin->def.bit_size == 32);
478          val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
479       }
480       break;
481    }
482 
483    case nir_intrinsic_load_subgroup_invocation:
484    case nir_intrinsic_load_helper_invocation:
485    case nir_intrinsic_load_invocation_id:
486    case nir_intrinsic_load_local_invocation_id:
487    case nir_intrinsic_load_workgroup_id:
488    case nir_intrinsic_load_workgroup_id_zero_base: {
489       const gl_system_value sysval =
490          intrin->intrinsic == nir_intrinsic_load_workgroup_id_zero_base ?
491          SYSTEM_VALUE_WORKGROUP_ID :
492          nir_system_value_from_intrinsic(intrin->intrinsic);
493       const uint32_t idx = nak_sysval_sysval_idx(sysval);
494       nir_def *comps[3];
495       assert(intrin->def.num_components <= 3);
496       for (unsigned c = 0; c < intrin->def.num_components; c++) {
497          comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
498                                        .access = ACCESS_CAN_REORDER);
499       }
500       val = nir_vec(b, comps, intrin->def.num_components);
501       break;
502    }
503 
504    case nir_intrinsic_is_helper_invocation: {
505       /* Unlike load_helper_invocation, this one isn't re-orderable */
506       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
507       break;
508    }
509 
510    case nir_intrinsic_shader_clock: {
511       /* The CS2R opcode can load 64 bits worth of sysval data at a time but
512        * it's not actually atomic.  In order to get correct shader clocks, we
513        * need to do a loop where we do
514        *
515        *    CS2R SV_CLOCK_HI
516        *    CS2R SV_CLOCK_LO
517        *    CS2R SV_CLOCK_HI
518        *    CS2R SV_CLOCK_LO
519        *    CS2R SV_CLOCK_HI
520        *    ...
521        *
522        * The moment two high values are the same, we take the low value
523        * between them and that gives us our clock.
524        *
525        * In order to make sure we don't run into any weird races, we also need
526        * to insert a barrier after every load to ensure one load completes
527        * before we kick off the next.  That way, even if one load happens to
528        * be faster than the other (they are variable latency, after all),
529        * we're still guaranteed that the loads happen in the order we want.
530        */
531       nir_variable *clock =
532          nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
533 
534       nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
535       nir_ssa_bar_nv(b, clock_hi);
536 
537       nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
538 
539       nir_push_loop(b);
540       {
541          nir_def *last_clock = nir_load_var(b, clock);
542 
543          nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
544          nir_ssa_bar_nv(b, clock_lo);
545 
546          clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
547          nir_ssa_bar_nv(b, clock_hi);
548 
549          nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
550 
551          nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
552          {
553             nir_jump(b, nir_jump_break);
554          }
555          nir_pop_if(b, NULL);
556       }
557       nir_pop_loop(b, NULL);
558 
559       val = nir_load_var(b, clock);
560       if (intrin->def.bit_size == 64)
561          val = nir_pack_64_2x32(b, val);
562       break;
563    }
564 
565    case nir_intrinsic_load_warps_per_sm_nv:
566       val = nir_imm_int(b, nak->warps_per_sm);
567       break;
568 
569    case nir_intrinsic_load_sm_count_nv:
570       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
571       val = nir_ubitfield_extract_imm(b, val, 20, 9);
572       break;
573 
574    case nir_intrinsic_load_warp_id_nv:
575       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
576       val = nir_ubitfield_extract_imm(b, val, 8, 7);
577       break;
578 
579    case nir_intrinsic_load_sm_id_nv:
580       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
581       val = nir_ubitfield_extract_imm(b, val, 20, 9);
582       break;
583 
584    default:
585       return false;
586    }
587 
588    if (intrin->def.bit_size == 1)
589       val = nir_i2b(b, val);
590 
591    nir_def_rewrite_uses(&intrin->def, val);
592 
593    return true;
594 }
595 
596 static bool
597 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
598 {
599    return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
600                                      nir_metadata_none,
601                                      (void *)nak);
602 }
603 
604 static bool
605 nak_nir_lower_varyings(nir_shader *nir, nir_variable_mode modes)
606 {
607    bool progress = false;
608 
609    assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));
610 
611    nir_foreach_variable_with_modes(var, nir, modes)
612       var->data.driver_location = nak_varying_attr_addr(var->data.location);
613 
614    OPT(nir, nir_lower_io, modes, type_size_vec4_bytes,
615        nir_lower_io_lower_64bit_to_32);
616 
617    return progress;
618 }
619 
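/* Translates NIR transform-feedback info into NAK's table form.  For each
 * buffer, attr_index[] maps dword offsets within the XFB record to attribute
 * dword indices, with 0xff meaning "skip this dword".
 */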
620 struct nak_xfb_info
621 nak_xfb_from_nir(const struct nir_xfb_info *nir_xfb)
622 {
623    if (nir_xfb == NULL)
624       return (struct nak_xfb_info) { };
625 
626    struct nak_xfb_info nak_xfb = { };
627 
628    u_foreach_bit(b, nir_xfb->buffers_written) {
629       nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
630       nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
631    }
632    memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */
633 
634    for (unsigned o = 0; o < nir_xfb->output_count; o++) {
635       const nir_xfb_output_info *out = &nir_xfb->outputs[o];
636       const uint8_t b = out->buffer;
637       assert(nir_xfb->buffers_written & BITFIELD_BIT(b));
638 
639       const uint16_t attr_addr = nak_varying_attr_addr(out->location);
640       assert(attr_addr % 4 == 0);
641       const uint16_t attr_idx = attr_addr / 4;
642 
643       assert(out->offset % 4 == 0);
644       uint8_t out_idx = out->offset / 4;
645 
646       u_foreach_bit(c, out->component_mask)
647          nak_xfb.attr_index[b][out_idx++] = attr_idx + c;
648 
649       nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
650    }
651 
652    return nak_xfb;
653 }
654 
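/* Loads the W component of the fragment position (FRAG_COORD attribute
 * address plus 12 bytes) with screen-linear interpolation.  Callers take
 * its reciprocal to get the 1/W used for perspective-correct interpolation.
 */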
655 static nir_def *
656 load_frag_w(nir_builder *b, enum nak_interp_loc interp_loc, nir_def *offset)
657 {
658    if (offset == NULL)
659       offset = nir_imm_int(b, 0);
660 
661    const uint16_t w_addr =
662       nak_sysval_attr_addr(SYSTEM_VALUE_FRAG_COORD) + 12;
663 
664    const struct nak_nir_ipa_flags flags = {
665       .interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR,
666       .interp_freq = NAK_INTERP_FREQ_PASS,
667       .interp_loc = interp_loc,
668    };
669    uint32_t flags_u32;
670    memcpy(&flags_u32, &flags, sizeof(flags_u32));
671 
672    return nir_ipa_nv(b, nir_imm_float(b, 0), offset,
673                      .base = w_addr, .flags = flags_u32);
674 }
675 
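/* On SM70+, perspective attributes are interpolated with ipa and then
 * multiplied by 1/W in a separate fmul.  On SM50-SM6x, the 1/W value is fed
 * to ipa directly via NAK_INTERP_FREQ_PASS_MUL_W.  Kepler is not handled
 * yet.
 */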
676 static nir_def *
677 load_interpolated_input(nir_builder *b, unsigned num_components, uint32_t addr,
678                         enum nak_interp_mode interp_mode,
679                         enum nak_interp_loc interp_loc,
680                         nir_def *inv_w, nir_def *offset,
681                         const struct nak_compiler *nak)
682 {
683    if (offset == NULL)
684       offset = nir_imm_int(b, 0);
685 
686    if (nak->sm >= 70) {
687       const struct nak_nir_ipa_flags flags = {
688          .interp_mode = interp_mode,
689          .interp_freq = NAK_INTERP_FREQ_PASS,
690          .interp_loc = interp_loc,
691       };
692       uint32_t flags_u32;
693       memcpy(&flags_u32, &flags, sizeof(flags_u32));
694 
695       nir_def *comps[NIR_MAX_VEC_COMPONENTS];
696       for (unsigned c = 0; c < num_components; c++) {
697          comps[c] = nir_ipa_nv(b, nir_imm_float(b, 0), offset,
698                                .base = addr + c * 4,
699                                .flags = flags_u32);
700          if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
701             comps[c] = nir_fmul(b, comps[c], inv_w);
702       }
703       return nir_vec(b, comps, num_components);
704    } else if (nak->sm >= 50) {
705       struct nak_nir_ipa_flags flags = {
706          .interp_mode = interp_mode,
707          .interp_freq = NAK_INTERP_FREQ_PASS,
708          .interp_loc = interp_loc,
709       };
710 
711       if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
712          flags.interp_freq = NAK_INTERP_FREQ_PASS_MUL_W;
713       else
714          inv_w = nir_imm_float(b, 0);
715 
716       uint32_t flags_u32;
717       memcpy(&flags_u32, &flags, sizeof(flags_u32));
718 
719       nir_def *comps[NIR_MAX_VEC_COMPONENTS];
720       for (unsigned c = 0; c < num_components; c++) {
721          comps[c] = nir_ipa_nv(b, inv_w, offset,
722                                .base = addr + c * 4,
723                                .flags = flags_u32);
724       }
725       return nir_vec(b, comps, num_components);
726    } else {
727       unreachable("Figure out input interpolation on Kepler");
728    }
729 }
730 
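/* Sample positions live in a UBO identified by fs_key->sample_locations_cb
 * at sample_locations_offset, packed one byte per sample with 4 bits each
 * for x and y in units of 1/16 pixel.
 */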
731 static nir_def *
732 load_sample_pos_at(nir_builder *b, nir_def *sample_id,
733                    const struct nak_fs_key *fs_key)
734 {
735    nir_def *loc = nir_load_ubo(b, 1, 64,
736                                nir_imm_int(b, fs_key->sample_locations_cb),
737                                nir_imm_int(b, fs_key->sample_locations_offset),
738                                .align_mul = 8,
739                                .align_offset = 0,
740                                .range = fs_key->sample_locations_offset + 8);
741 
742    /* Yay little endian */
743    loc = nir_ushr(b, loc, nir_imul_imm(b, sample_id, 8));
744    nir_def *loc_x_u4 = nir_iand_imm(b, loc, 0xf);
745    nir_def *loc_y_u4 = nir_iand_imm(b, nir_ushr_imm(b, loc, 4), 0xf);
746    nir_def *loc_u4 = nir_vec2(b, loc_x_u4, loc_y_u4);
747    nir_def *result = nir_fmul_imm(b, nir_i2f32(b, loc_u4), 1.0 / 16.0);
748 
749    return result;
750 }
751 
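/* Converts an at_sample or at_offset source into the packed offset ipa
 * expects: x and y are clamped to [-0.5, 0.4375], converted to fixed point
 * with 12 fractional bits, and packed into the low (x) and high (y) 16 bits
 * of a 32-bit word.
 */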
752 static nir_def *
753 load_barycentric_offset(nir_builder *b, nir_intrinsic_instr *bary,
754                         const struct nak_fs_key *fs_key)
755 {
756    nir_def *offset_f;
757 
758    if (bary->intrinsic == nir_intrinsic_load_barycentric_coord_at_sample ||
759        bary->intrinsic == nir_intrinsic_load_barycentric_at_sample) {
760       nir_def *sample_id = bary->src[0].ssa;
761       nir_def *sample_pos = load_sample_pos_at(b, sample_id, fs_key);
762       offset_f = nir_fadd_imm(b, sample_pos, -0.5);
763    } else {
764       offset_f = bary->src[0].ssa;
765    }
766 
767    offset_f = nir_fclamp(b, offset_f, nir_imm_float(b, -0.5),
768                          nir_imm_float(b, 0.437500));
769    nir_def *offset_fixed =
770       nir_f2i32(b, nir_fmul_imm(b, offset_f, 4096.0));
771    nir_def *offset = nir_ior(b, nir_ishl_imm(b, nir_channel(b, offset_fixed, 1), 16),
772                              nir_iand_imm(b, nir_channel(b, offset_fixed, 0),
773                                           0xffff));
774 
775    return offset;
776 }
777 
778 struct lower_fs_input_ctx {
779    const struct nak_compiler *nak;
780    const struct nak_fs_key *fs_key;
781 };
782 
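/* Lowers fragment-shader input intrinsics to NAK's ipa/ldtram instructions,
 * applying the nak_fs_key (forced sample shading, sample locations) where
 * it affects interpolation.
 */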
783 static bool
784 lower_fs_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
785 {
786    const struct lower_fs_input_ctx *ctx = data;
787 
788    switch (intrin->intrinsic) {
789    case nir_intrinsic_load_barycentric_pixel: {
790       if (!(ctx->fs_key && ctx->fs_key->force_sample_shading))
791          return false;
792 
793       intrin->intrinsic = nir_intrinsic_load_barycentric_sample;
794       return true;
795    }
796 
797    case nir_intrinsic_load_frag_coord:
798    case nir_intrinsic_load_point_coord: {
799       b->cursor = nir_before_instr(&intrin->instr);
800 
801       const enum nak_interp_loc interp_loc =
802          b->shader->info.fs.uses_sample_shading ? NAK_INTERP_LOC_CENTROID
803                                                 : NAK_INTERP_LOC_DEFAULT;
804       const uint32_t addr =
805          intrin->intrinsic == nir_intrinsic_load_point_coord ?
806          nak_sysval_attr_addr(SYSTEM_VALUE_POINT_COORD) :
807          nak_sysval_attr_addr(SYSTEM_VALUE_FRAG_COORD);
808 
809       nir_def *coord = load_interpolated_input(b, intrin->def.num_components,
810                                                addr,
811                                                NAK_INTERP_MODE_SCREEN_LINEAR,
812                                                interp_loc, NULL, NULL,
813                                                ctx->nak);
814 
815       nir_def_rewrite_uses(&intrin->def, coord);
816       nir_instr_remove(&intrin->instr);
817 
818       return true;
819    }
820 
821    case nir_intrinsic_load_input: {
822       b->cursor = nir_before_instr(&intrin->instr);
823 
824       uint16_t addr = nir_intrinsic_base(intrin) +
825                       nir_src_as_uint(intrin->src[0]) +
826                       nir_intrinsic_component(intrin) * 4;
827 
828       const struct nak_nir_ipa_flags flags = {
829          .interp_mode = NAK_INTERP_MODE_CONSTANT,
830          .interp_freq = NAK_INTERP_FREQ_CONSTANT,
831          .interp_loc = NAK_INTERP_LOC_DEFAULT,
832       };
833       uint32_t flags_u32;
834       memcpy(&flags_u32, &flags, sizeof(flags_u32));
835 
836       nir_def *comps[NIR_MAX_VEC_COMPONENTS];
837       for (unsigned c = 0; c < intrin->def.num_components; c++) {
838          comps[c] = nir_ipa_nv(b, nir_imm_float(b, 0), nir_imm_int(b, 0),
839                                .base = addr + c * 4, .flags = flags_u32);
840       }
841       nir_def *res = nir_vec(b, comps, intrin->def.num_components);
842 
843       nir_def_rewrite_uses(&intrin->def, res);
844       nir_instr_remove(&intrin->instr);
845 
846       return true;
847    }
848 
849    case nir_intrinsic_load_barycentric_coord_pixel:
850    case nir_intrinsic_load_barycentric_coord_centroid:
851    case nir_intrinsic_load_barycentric_coord_sample:
852    case nir_intrinsic_load_barycentric_coord_at_sample:
853    case nir_intrinsic_load_barycentric_coord_at_offset: {
854       b->cursor = nir_before_instr(&intrin->instr);
855 
856       uint32_t addr;
857       enum nak_interp_mode interp_mode;
858       if (nir_intrinsic_interp_mode(intrin) == INTERP_MODE_NOPERSPECTIVE) {
859          addr = NAK_ATTR_BARY_COORD_NO_PERSP;
860          interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR;
861       } else {
862          addr = NAK_ATTR_BARY_COORD;
863          interp_mode = NAK_INTERP_MODE_PERSPECTIVE;
864       }
865 
866       nir_def *offset = NULL;
867       enum nak_interp_loc interp_loc;
868       switch (intrin->intrinsic) {
869       case nir_intrinsic_load_barycentric_coord_at_sample:
870       case nir_intrinsic_load_barycentric_coord_at_offset:
871          interp_loc = NAK_INTERP_LOC_OFFSET;
872          offset = load_barycentric_offset(b, intrin, ctx->fs_key);
873          break;
874       case nir_intrinsic_load_barycentric_coord_centroid:
875       case nir_intrinsic_load_barycentric_coord_sample:
876          interp_loc = NAK_INTERP_LOC_CENTROID;
877          break;
878       case nir_intrinsic_load_barycentric_coord_pixel:
879          interp_loc = NAK_INTERP_LOC_DEFAULT;
880          break;
881       default:
882          unreachable("Unknown intrinsic");
883       }
884 
885       nir_def *inv_w = NULL;
886       if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
887          inv_w = nir_frcp(b, load_frag_w(b, interp_loc, offset));
888 
889       nir_def *res = load_interpolated_input(b, intrin->def.num_components,
890                                              addr, interp_mode, interp_loc,
891                                              inv_w, offset, ctx->nak);
892 
893       nir_def_rewrite_uses(&intrin->def, res);
894       nir_instr_remove(&intrin->instr);
895 
896       return true;
897    }
898 
899    case nir_intrinsic_load_interpolated_input: {
900       b->cursor = nir_before_instr(&intrin->instr);
901 
902       const uint16_t addr = nir_intrinsic_base(intrin) +
903                             nir_src_as_uint(intrin->src[1]) +
904                             nir_intrinsic_component(intrin) * 4;
905 
906       nir_intrinsic_instr *bary = nir_src_as_intrinsic(intrin->src[0]);
907 
908       enum nak_interp_mode interp_mode;
909       if (nir_intrinsic_interp_mode(bary) == INTERP_MODE_SMOOTH ||
910           nir_intrinsic_interp_mode(bary) == INTERP_MODE_NONE)
911          interp_mode = NAK_INTERP_MODE_PERSPECTIVE;
912       else
913          interp_mode = NAK_INTERP_MODE_SCREEN_LINEAR;
914 
915       nir_def *offset = NULL;
916       enum nak_interp_loc interp_loc;
917       switch (bary->intrinsic) {
918       case nir_intrinsic_load_barycentric_at_offset:
919       case nir_intrinsic_load_barycentric_at_sample: {
920          interp_loc = NAK_INTERP_LOC_OFFSET;
921          offset = load_barycentric_offset(b, bary, ctx->fs_key);
922          break;
923       }
924 
925       case nir_intrinsic_load_barycentric_centroid:
926       case nir_intrinsic_load_barycentric_sample:
927          interp_loc = NAK_INTERP_LOC_CENTROID;
928          break;
929 
930       case nir_intrinsic_load_barycentric_pixel:
931          interp_loc = NAK_INTERP_LOC_DEFAULT;
932          break;
933 
934       default:
935          unreachable("Unsupported barycentric");
936       }
937 
938       nir_def *inv_w = NULL;
939       if (interp_mode == NAK_INTERP_MODE_PERSPECTIVE)
940          inv_w = nir_frcp(b, load_frag_w(b, interp_loc, offset));
941 
942       nir_def *res = load_interpolated_input(b, intrin->def.num_components,
943                                              addr, interp_mode, interp_loc,
944                                              inv_w, offset, ctx->nak);
945 
946       nir_def_rewrite_uses(&intrin->def, res);
947       nir_instr_remove(&intrin->instr);
948 
949       return true;
950    }
951 
952    case nir_intrinsic_load_sample_mask_in: {
953       if (!b->shader->info.fs.uses_sample_shading &&
954           !(ctx->fs_key && ctx->fs_key->force_sample_shading))
955          return false;
956 
957       b->cursor = nir_after_instr(&intrin->instr);
958 
959       /* Mask off just the current sample */
960       nir_def *sample = nir_load_sample_id(b);
961       nir_def *mask = nir_ishl(b, nir_imm_int(b, 1), sample);
962       mask = nir_iand(b, &intrin->def, mask);
963       nir_def_rewrite_uses_after(&intrin->def, mask, mask->parent_instr);
964 
965       return true;
966    }
967 
968    case nir_intrinsic_load_sample_pos: {
969       b->cursor = nir_before_instr(&intrin->instr);
970 
971       nir_def *sample_id = nir_load_sample_id(b);
972       nir_def *sample_pos = load_sample_pos_at(b, sample_id, ctx->fs_key);
973 
974       nir_def_rewrite_uses(&intrin->def, sample_pos);
975       nir_instr_remove(&intrin->instr);
976 
977       return true;
978    }
979 
980    case nir_intrinsic_load_input_vertex: {
981       b->cursor = nir_before_instr(&intrin->instr);
982 
983       unsigned vertex_id = nir_src_as_uint(intrin->src[0]);
984       assert(vertex_id < 3);
985 
986       const uint16_t addr = nir_intrinsic_base(intrin) +
987                             nir_src_as_uint(intrin->src[1]) +
988                             nir_intrinsic_component(intrin) * 4;
989 
990       nir_def *comps[NIR_MAX_VEC_COMPONENTS];
991       for (unsigned c = 0; c < intrin->def.num_components; c++) {
992          nir_def *data = nir_ldtram_nv(b, .base = addr + c * 4,
993                                        .flags = vertex_id == 2);
994          comps[c] = nir_channel(b, data, vertex_id & 1);
995       }
996       nir_def *res = nir_vec(b, comps, intrin->num_components);
997 
998       nir_def_rewrite_uses(&intrin->def, res);
999       nir_instr_remove(&intrin->instr);
1000 
1001       return true;
1002    }
1003 
1004    default:
1005       return false;
1006    }
1007 }
1008 
1009 static bool
1010 nak_nir_lower_fs_inputs(nir_shader *nir,
1011                         const struct nak_compiler *nak,
1012                         const struct nak_fs_key *fs_key)
1013 {
1014    NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
1015    NIR_PASS_V(nir, nak_nir_lower_varyings, nir_var_shader_in);
1016    NIR_PASS_V(nir, nir_opt_constant_folding);
1017 
1018    const struct lower_fs_input_ctx fs_in_ctx = {
1019       .nak = nak,
1020       .fs_key = fs_key,
1021    };
1022    NIR_PASS_V(nir, nir_shader_intrinsics_pass, lower_fs_input_intrin,
1023               nir_metadata_block_index | nir_metadata_dominance,
1024               (void *)&fs_in_ctx);
1025 
1026    return true;
1027 }
1028 
1029 static int
1030 fs_out_size(const struct glsl_type *type, bool bindless)
1031 {
1032    assert(glsl_type_is_vector_or_scalar(type));
1033    return 16;
1034 }
1035 
1036 static bool
1037 nak_nir_lower_fs_outputs(nir_shader *nir)
1038 {
1039    if (nir->info.outputs_written == 0)
1040       return false;
1041 
1042    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true);
1043 
1044    nir->num_outputs = 0;
1045    nir_foreach_shader_out_variable(var, nir) {
1046       switch (var->data.location) {
1047       case FRAG_RESULT_DEPTH:
1048          assert(var->data.index == 0);
1049          assert(var->data.location_frac == 0);
1050          var->data.driver_location = NAK_FS_OUT_DEPTH;
1051          break;
1052       case FRAG_RESULT_STENCIL:
1053          unreachable("EXT_shader_stencil_export not supported");
1054          break;
1055       case FRAG_RESULT_COLOR:
1056          unreachable("Vulkan always uses explicit locations");
1057          break;
1058       case FRAG_RESULT_SAMPLE_MASK:
1059          assert(var->data.index == 0);
1060          assert(var->data.location_frac == 0);
1061          var->data.driver_location = NAK_FS_OUT_SAMPLE_MASK;
1062          break;
1063       default: {
1064          assert(var->data.location >= FRAG_RESULT_DATA0);
1065          assert(var->data.index < 2);
1066          const unsigned out =
1067             (var->data.location - FRAG_RESULT_DATA0) + var->data.index;
1068          var->data.driver_location = NAK_FS_OUT_COLOR(out);
1069          break;
1070       }
1071       }
1072    }
1073 
1074    NIR_PASS_V(nir, nir_lower_io, nir_var_shader_out, fs_out_size, 0);
1075 
1076    return true;
1077 }
1078 
1079 static bool
1080 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
1081                      unsigned bit_size, unsigned num_components,
1082                      nir_intrinsic_instr *low, nir_intrinsic_instr *high,
1083                      void *cb_data)
1084 {
1085    /*
1086     * Since we legalize these later with nir_lower_mem_access_bit_sizes,
1087     * we can optimistically combine anything that might be profitable
1088     */
1089    assert(util_is_power_of_two_nonzero(align_mul));
1090 
1091    unsigned max_bytes = 128u / 8u;
1092    if (low->intrinsic == nir_intrinsic_load_ubo)
1093       max_bytes = 64u / 8u;
1094 
1095    align_mul = MIN2(align_mul, max_bytes);
1096    align_offset = align_offset % align_mul;
1097    return align_offset + num_components * (bit_size / 8) <= align_mul;
1098 }
1099 
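/* Picks the widest access the hardware can do for a given size and
 * alignment: up to 16B split into 32-bit components, capped at 8B for UBO
 * loads, with special handling for under-aligned constant-buffer reads.
 */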
1100 static nir_mem_access_size_align
1101 nak_mem_access_size_align(nir_intrinsic_op intrin,
1102                           uint8_t bytes, uint8_t bit_size,
1103                           uint32_t align_mul, uint32_t align_offset,
1104                           bool offset_is_const, const void *cb_data)
1105 {
1106    const uint32_t align = nir_combined_align(align_mul, align_offset);
1107    assert(util_is_power_of_two_nonzero(align));
1108 
1109    unsigned bytes_pow2;
1110    if (nir_intrinsic_infos[intrin].has_dest) {
1111       /* Reads can over-fetch a bit if the alignment is okay. */
1112       bytes_pow2 = util_next_power_of_two(bytes);
1113    } else {
1114       bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
1115    }
1116 
1117    unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
1118    assert(util_is_power_of_two_nonzero(chunk_bytes));
1119    if (intrin == nir_intrinsic_load_ubo)
1120       chunk_bytes = MIN2(chunk_bytes, 8);
1121 
1122    if (intrin == nir_intrinsic_load_ubo && align < 4) {
1123       /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
1124        * In particular, this applies to ldc.u16 which means we either have to
1125        * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
1126        * the 16bit value out.  Fortunately, nir_lower_mem_access_bit_sizes()
1127        * can handle over-alignment for reads.
1128        */
1129       if (align == 2 || offset_is_const) {
1130          return (nir_mem_access_size_align) {
1131             .bit_size = 32,
1132             .num_components = 1,
1133             .align = 4,
1134          };
1135       } else {
1136          assert(align == 1);
1137          return (nir_mem_access_size_align) {
1138             .bit_size = 8,
1139             .num_components = 1,
1140             .align = 1,
1141          };
1142       }
1143    } else if (chunk_bytes < 4) {
1144       return (nir_mem_access_size_align) {
1145          .bit_size = chunk_bytes * 8,
1146          .num_components = 1,
1147          .align = chunk_bytes,
1148       };
1149    } else {
1150       return (nir_mem_access_size_align) {
1151          .bit_size = 32,
1152          .num_components = chunk_bytes / 4,
1153          .align = chunk_bytes,
1154       };
1155    }
1156 }
1157 
1158 static bool
1159 nir_shader_has_local_variables(const nir_shader *nir)
1160 {
1161    nir_foreach_function(func, nir) {
1162       if (func->impl && !exec_list_is_empty(&func->impl->locals))
1163          return true;
1164    }
1165 
1166    return false;
1167 }
1168 
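/* Runs the main lowering pipeline before handing the shader to the NAK
 * backend: subgroup lowering, scratch lowering, load/store vectorization,
 * bit-size legalization, per-stage I/O lowering, and late algebraic
 * optimization.
 */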
1169 void
1170 nak_postprocess_nir(nir_shader *nir,
1171                     const struct nak_compiler *nak,
1172                     nir_variable_mode robust2_modes,
1173                     const struct nak_fs_key *fs_key)
1174 {
1175    UNUSED bool progress = false;
1176 
1177    nak_optimize_nir(nir, nak);
1178 
1179    const nir_lower_subgroups_options subgroups_options = {
1180       .subgroup_size = 32,
1181       .ballot_bit_size = 32,
1182       .ballot_components = 1,
1183       .lower_to_scalar = true,
1184       .lower_vote_eq = true,
1185       .lower_first_invocation_to_ballot = true,
1186       .lower_read_first_invocation = true,
1187       .lower_elect = true,
1188       .lower_inverse_ballot = true,
1189    };
1190    OPT(nir, nir_lower_subgroups, &subgroups_options);
1191    OPT(nir, nak_nir_lower_scan_reduce);
1192 
1193    if (nir_shader_has_local_variables(nir)) {
1194       OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
1195           glsl_get_natural_size_align_bytes);
1196       OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
1197           nir_address_format_32bit_offset);
1198       nak_optimize_nir(nir, nak);
1199    }
1200 
1201    OPT(nir, nir_opt_shrink_vectors);
1202 
1203    nir_load_store_vectorize_options vectorize_opts = {};
1204    vectorize_opts.modes = nir_var_mem_global |
1205                           nir_var_mem_ssbo |
1206                           nir_var_mem_shared |
1207                           nir_var_shader_temp;
1208    vectorize_opts.callback = nak_mem_vectorize_cb;
1209    vectorize_opts.robust_modes = robust2_modes;
1210    OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
1211 
1212    nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
1213       .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
1214       .callback = nak_mem_access_size_align,
1215    };
1216    OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
1217    OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
1218 
1219    OPT(nir, nir_opt_combine_barriers, NULL, NULL);
1220 
1221    nak_optimize_nir(nir, nak);
1222 
1223    OPT(nir, nak_nir_lower_tex, nak);
1224    OPT(nir, nir_lower_idiv, NULL);
1225 
1226    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
1227 
1228    OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);
1229 
1230    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1231       OPT(nir, nir_lower_tess_coord_z,
1232           nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
1233    }
1234 
1235    OPT(nir, nak_nir_lower_system_values, nak);
1236 
1237    switch (nir->info.stage) {
1238    case MESA_SHADER_VERTEX:
1239       OPT(nir, nak_nir_lower_vs_inputs);
1240       OPT(nir, nak_nir_lower_varyings, nir_var_shader_out);
1241       OPT(nir, nir_opt_constant_folding);
1242       OPT(nir, nak_nir_lower_vtg_io, nak);
1243       break;
1244 
1245    case MESA_SHADER_TESS_CTRL:
1246    case MESA_SHADER_TESS_EVAL:
1247       OPT(nir, nak_nir_lower_varyings, nir_var_shader_in | nir_var_shader_out);
1248       OPT(nir, nir_opt_constant_folding);
1249       OPT(nir, nak_nir_lower_vtg_io, nak);
1250       break;
1251 
1252    case MESA_SHADER_FRAGMENT:
1253       OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
1254       OPT(nir, nak_nir_lower_fs_outputs);
1255       break;
1256 
1257    case MESA_SHADER_GEOMETRY:
1258       OPT(nir, nak_nir_lower_varyings, nir_var_shader_in | nir_var_shader_out);
1259       OPT(nir, nir_opt_constant_folding);
1260       OPT(nir, nak_nir_lower_vtg_io, nak);
1261       OPT(nir, nak_nir_lower_gs_intrinsics);
1262       break;
1263 
1264    case MESA_SHADER_COMPUTE:
1265    case MESA_SHADER_KERNEL:
1266       break;
1267 
1268    default:
1269       unreachable("Unsupported shader stage");
1270    }
1271 
1272    OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
1273    OPT(nir, nir_lower_int64);
1274 
1275    nak_optimize_nir(nir, nak);
1276 
1277    do {
1278       progress = false;
1279       OPT(nir, nir_opt_algebraic_late);
1280       OPT(nir, nak_nir_lower_algebraic_late, nak);
1281 
1282       /* If we're lowering fp64 sat but not min/max, the sat lowering may have
1283        * been undone by nir_opt_algebraic.  Lower sat again just to be sure.
1284        */
1285       if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
1286           !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
1287          OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);
1288 
1289       if (progress) {
1290          OPT(nir, nir_opt_constant_folding);
1291          OPT(nir, nir_copy_prop);
1292          OPT(nir, nir_opt_dce);
1293          OPT(nir, nir_opt_cse);
1294       }
1295    } while (progress);
1296 
1297    nir_divergence_analysis(nir);
1298 
1299    OPT(nir, nak_nir_add_barriers, nak);
1300 
1301    /* Re-index blocks and compact SSA defs because we'll use them to index
1302     * arrays
1303     */
1304    nir_foreach_function(func, nir) {
1305       if (func->impl) {
1306          nir_index_blocks(func->impl);
1307          nir_index_ssa_defs(func->impl);
1308       }
1309    }
1310 
1311    if (nak_should_print_nir())
1312       nir_print_shader(nir, stderr);
1313 }
1314 
1315 static bool
1316 scalar_is_imm_int(nir_scalar x, unsigned bits)
1317 {
1318    if (!nir_scalar_is_const(x))
1319       return false;
1320 
1321    int64_t imm = nir_scalar_as_int(x);
1322    return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
1323 }
1324 
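/* Splits an I/O address into a non-constant base scalar plus a constant
 * offset that fits in a signed imm_bits-bit immediate, chasing movs and
 * looking through a single iadd.
 */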
1325 struct nak_io_addr_offset
1326 nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
1327 {
1328    nir_scalar addr_s = {
1329       .def = addr,
1330       .comp = 0,
1331    };
1332    if (scalar_is_imm_int(addr_s, imm_bits)) {
1333       /* Base is a dumb name for this.  It should be offset */
1334       return (struct nak_io_addr_offset) {
1335          .offset = nir_scalar_as_int(addr_s),
1336       };
1337    }
1338 
1339    addr_s = nir_scalar_chase_movs(addr_s);
1340    if (!nir_scalar_is_alu(addr_s) ||
1341        nir_scalar_alu_op(addr_s) != nir_op_iadd) {
1342       return (struct nak_io_addr_offset) {
1343          .base = addr_s,
1344       };
1345    }
1346 
1347    for (unsigned i = 0; i < 2; i++) {
1348       nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
1349       off_s = nir_scalar_chase_movs(off_s);
1350       if (scalar_is_imm_int(off_s, imm_bits)) {
1351          return (struct nak_io_addr_offset) {
1352             .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
1353             .offset = nir_scalar_as_int(off_s),
1354          };
1355       }
1356    }
1357 
1358    return (struct nak_io_addr_offset) {
1359       .base = addr_s,
1360    };
1361 }
1362