/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"

#include "ac_nir.h"
#include "si_pipe.h"
#include "si_query.h"
#include "si_state.h"
#include "si_shader_internal.h"

struct lower_abi_state {
   struct si_shader *shader;
   struct si_shader_args *args;

   nir_def *esgs_ring;
   nir_def *tess_offchip_ring;
   nir_def *gsvs_ring[4];
};

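/* Unpack one of the VS_STATE_* or GS_STATE_* bitfields from the vs_state_bits
 * user SGPR, using the field's __SHIFT and __MASK definitions.
 */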
#define GET_FIELD_NIR(field) \
   ac_nir_unpack_arg(b, &args->ac, args->vs_state_bits, \
                     field##__SHIFT, util_bitcount(field##__MASK))

nir_def *si_nir_load_internal_binding(nir_builder *b, struct si_shader_args *args,
                                      unsigned slot, unsigned num_components)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->internal_bindings);
   return nir_load_smem_amd(b, num_components, addr, nir_imm_int(b, slot * 16));
}

static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                     struct si_shader_args *args)
{
   struct si_shader_selector *sel = shader->selector;

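   /* For VS blit shaders the attribute ring address is passed as the last blit
    * input SGPR; all other cases get it from the gs_attr_address argument.
    */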
   nir_def *attr_address =
      b->shader->info.stage == MESA_SHADER_VERTEX && b->shader->info.vs.blit_sgprs_amd ?
      ac_nir_load_arg_at_offset(b, &args->ac, args->vs_blit_inputs,
                                b->shader->info.vs.blit_sgprs_amd - 1) :
      ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

   unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
   uint32_t desc[4];

   ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,
                                 (uint64_t)sel->screen->info.address32_hi << 32,
                                 0xffffffff, stride, desc);

   nir_def *comp[] = {
      attr_address,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
                                     struct si_shader_args *args)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->tes_offchip_addr);
   uint32_t desc[4];

   ac_build_raw_buffer_descriptor(screen->info.gfx_level,
                                  (uint64_t)screen->info.address32_hi << 32,
                                  0xffffffff, desc);

   nir_def *comp[4] = {
      addr,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

static nir_def *build_esgs_ring_desc(nir_builder *b, enum amd_gfx_level gfx_level,
                                     struct si_shader_args *args)
{
   nir_def *desc = si_nir_load_internal_binding(b, args, SI_RING_ESGS, 4);

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY)
      return desc;

   nir_def *vec[4];
   for (int i = 0; i < 4; i++)
      vec[i] = nir_channel(b, desc, i);

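   /* ES stages store to the ESGS ring through a swizzled descriptor
    * (ADD_TID + index stride), so vertex data is interleaved per lane;
    * the GS stage above uses the descriptor unmodified.
    */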
   vec[1] = nir_ior_imm(b, vec[1], S_008F04_SWIZZLE_ENABLE_GFX6(1));
   vec[3] = nir_ior_imm(b, vec[3],
                        S_008F0C_ELEMENT_SIZE(1) |
                        S_008F0C_INDEX_STRIDE(3) |
                        S_008F0C_ADD_TID_ENABLE(1));

   /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
   if (gfx_level == GFX8)
      vec[3] = nir_iand_imm(b, vec[3], C_008F0C_DATA_FORMAT);

   return nir_vec(b, vec, 4);
}

static void build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   if (s->shader->is_gs_copy_shader) {
      s->gsvs_ring[0] = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 4);
   } else if (b->shader->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      nir_def *base_addr = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 2);
      base_addr = nir_pack_64_2x32(b, base_addr);

      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components = sel->info.num_stream_output_components[stream];
         if (!num_components)
            continue;

         unsigned stride = 4 * num_components * b->shader->info.gs.vertices_out;
         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         unsigned num_records = s->shader->wave_size;

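         /* Build a swizzled descriptor with one record per lane; the stride
          * covers every component of every vertex this lane emits to the stream.
          */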
         const struct ac_buffer_state buffer_state = {
            .size = num_records,
            .format = PIPE_FORMAT_R32_FLOAT,
            .swizzle = {
               PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W,
            },
            .stride = stride,
            .swizzle_enable = true,
            .element_size = 1,
            .index_stride = 1,
            .add_tid = true,
            .gfx10_oob_select = V_008F0C_OOB_SELECT_DISABLED,
         };
         uint32_t tmp_desc[4];

         ac_build_buffer_descriptor(sel->screen->info.gfx_level, &buffer_state, tmp_desc);

         nir_def *desc[4];
         desc[0] = nir_unpack_64_2x32_split_x(b, base_addr);
         desc[1] = nir_ior_imm(b, nir_unpack_64_2x32_split_y(b, base_addr), tmp_desc[1]);
         desc[2] = nir_imm_int(b, tmp_desc[2]);
         desc[3] = nir_imm_int(b, tmp_desc[3]);

         s->gsvs_ring[stream] = nir_vec(b, desc, 4);

         /* next stream's desc addr */
         base_addr = nir_iadd_imm(b, base_addr, stride * num_records);
      }
   }
}

static void preload_reusable_variables(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   b->cursor = nir_before_impl(b->impl);

   if (sel->screen->info.gfx_level <= GFX8 && b->shader->info.stage <= MESA_SHADER_GEOMETRY &&
       (key->ge.as_es || b->shader->info.stage == MESA_SHADER_GEOMETRY)) {
      s->esgs_ring = build_esgs_ring_desc(b, sel->screen->info.gfx_level, s->args);
   }

   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
       b->shader->info.stage == MESA_SHADER_TESS_EVAL)
      s->tess_offchip_ring = build_tess_ring_desc(b, sel->screen, s->args);

   build_gsvs_ring_desc(b, s);
}

static nir_def *get_num_vertices_per_prim(nir_builder *b, struct lower_abi_state *s)
{
   struct si_shader_args *args = s->args;
   unsigned num_vertices = si_get_num_vertices_per_output_prim(s->shader);

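   /* If the vertex count isn't known at compile time, derive it from
    * GS_STATE_OUTPRIM (the output primitive type: 0 = points, 1 = lines,
    * 2 = triangles), which is one less than the vertex count.
    */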
   if (num_vertices)
      return nir_imm_int(b, num_vertices);
   else
      return nir_iadd_imm(b, GET_FIELD_NIR(GS_STATE_OUTPRIM), 1);
}

static nir_def *get_small_prim_precision(nir_builder *b, struct lower_abi_state *s, bool lines)
{
   /* Compute FP32 value "num_samples / quant_mode" using integer ops.
    * See si_shader.h for how this works.
    */
   struct si_shader_args *args = s->args;
   nir_def *precision = GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION);
   nir_def *log_samples = GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES);

   if (lines)
      precision = nir_iadd(b, precision, log_samples);

   /* The final FP32 value is: 1/2^(15 - precision) */
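   /* (precision | 0x70) << 23 places (112 + precision) in the FP32 exponent
    * field with a zero mantissa, i.e. 2^(112 + precision - 127) = 2^(precision - 15).
    */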
   return nir_ishl_imm(b, nir_ior_imm(b, precision, 0x70), 23);
}

static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   struct si_shader *shader = s->shader;
   struct si_shader_args *args = s->args;
   struct si_shader_selector *sel = shader->selector;
   union si_shader_key *key = &shader->key;
   gl_shader_stage stage = b->shader->info.stage;

   b->cursor = nir_before_instr(instr);

   nir_def *replacement = NULL;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_base_vertex: {
      nir_def *indexed = GET_FIELD_NIR(VS_STATE_INDEXED);
      indexed = nir_i2b(b, indexed);

      nir_def *base_vertex = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      replacement = nir_bcsel(b, indexed, base_vertex, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_workgroup_size: {
      assert(b->shader->info.workgroup_size_variable && sel->info.uses_variable_block_size);

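      /* The variable block size is packed into one SGPR as three 10-bit
       * components: x | y << 10 | z << 20.
       */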
      nir_def *block_size = ac_nir_load_arg(b, &args->ac, args->block_size);
      nir_def *comp[] = {
         nir_ubfe_imm(b, block_size, 0, 10),
         nir_ubfe_imm(b, block_size, 10, 10),
         nir_ubfe_imm(b, block_size, 20, 10),
      };
      replacement = nir_vec(b, comp, 3);
      break;
   }
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_tess_level_inner_default: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_HS_CONST_DEFAULT_TESS_LEVELS, 4);
      unsigned num_components = intrin->def.num_components;
      unsigned offset =
         intrin->intrinsic == nir_intrinsic_load_tess_level_inner_default ? 16 : 0;
      replacement = nir_load_ubo(b, num_components, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5);
      else if (stage == MESA_SHADER_TESS_EVAL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
      } else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
      break;
   case nir_intrinsic_load_sample_mask_in:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage);
      break;
   case nir_intrinsic_load_lshs_vertex_stride_amd:
      if (stage == MESA_SHADER_VERTEX) {
         replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
      } else if (stage == MESA_SHADER_TESS_CTRL) {
         if (sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic) {
            replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
         } else {
            nir_def *num_ls_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 17, 6);
            nir_def *extra_dw = nir_bcsel(b, nir_ieq_imm(b, num_ls_out, 0), nir_imm_int(b, 0), nir_imm_int(b, 4));
            replacement = nir_iadd_nuw(b, nir_ishl_imm(b, num_ls_out, 4), extra_dw);
         }
      } else {
         unreachable("no nir_load_lshs_vertex_stride_amd");
      }
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
      } else {
         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
   case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
      nir_def *per_vtx_out_patch_size = NULL;

      if (stage == MESA_SHADER_TESS_CTRL) {
         const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
         const unsigned out_vtx_size = num_hs_out * 16;
         const unsigned out_vtx_per_patch = b->shader->info.tess.tcs_vertices_out;
         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
      } else {
         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
      }

      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
      break;
   }
   case nir_intrinsic_load_clip_half_line_width_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 2, addr, nir_imm_int(b, 32));
      break;
   }
   case nir_intrinsic_load_cull_triangle_viewport_xy_scale_and_offset_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_cull_line_viewport_xy_scale_and_offset_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, 16));
      break;
   }
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
      replacement = get_num_vertices_per_prim(b, s);
      break;
   case nir_intrinsic_load_cull_ccw_amd:
      /* radeonsi embeds the cw/ccw winding in the front/back face culling enables */
      replacement = nir_imm_false(b);
      break;
   case nir_intrinsic_load_cull_any_enabled_amd:
      /* If culling is enabled at compile time, it's always enabled at runtime. */
      assert(si_shader_culling_enabled(shader));
      replacement = nir_imm_true(b);
      break;
   case nir_intrinsic_load_cull_back_face_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_CULL_FACE_BACK));
      break;
   case nir_intrinsic_load_cull_front_face_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_CULL_FACE_FRONT));
      break;
   case nir_intrinsic_load_cull_small_triangle_precision_amd:
      replacement = get_small_prim_precision(b, s, false);
      break;
   case nir_intrinsic_load_cull_small_line_precision_amd:
      replacement = get_small_prim_precision(b, s, true);
      break;
   case nir_intrinsic_load_cull_small_triangles_enabled_amd:
      /* Triangles always have small primitive culling enabled. */
      replacement = nir_imm_bool(b, true);
      break;
   case nir_intrinsic_load_cull_small_lines_enabled_amd:
      replacement =
         nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT);
      break;
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
      replacement = nir_bcsel(b, nir_i2b(b, GET_FIELD_NIR(GS_STATE_PROVOKING_VTX_FIRST)),
                              nir_imm_int(b, 0),
                              nir_iadd_imm(b, get_num_vertices_per_prim(b, s), -1));
      break;
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_PIPELINE_STATS_EMU));
      break;
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_STREAMOUT_QUERY_ENABLED));
      break;
   case nir_intrinsic_load_clamp_vertex_color_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(VS_STATE_CLAMP_VERTEX_COLOR));
      break;
   case nir_intrinsic_load_user_clip_plane: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_VS_CONST_CLIP_PLANES, 4);
      unsigned offset = nir_intrinsic_ucp_id(intrin) * 16;
      replacement = nir_load_ubo(b, 4, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_streamout_buffer_amd: {
      unsigned slot = SI_VS_STREAMOUT_BUF0 + nir_intrinsic_base(intrin);
      replacement = si_nir_load_internal_binding(b, args, slot, 4);
      break;
   }
   case nir_intrinsic_load_xfb_state_address_gfx12_amd: {
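      /* The streamout state buffer lives in the 32-bit address space; combine
       * the loaded 32-bit address with address32_hi to form a 64-bit pointer.
       */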
      nir_def *address = si_nir_load_internal_binding(b, args, SI_STREAMOUT_STATE_BUF, 1);
      nir_def *address32_hi = nir_imm_int(b, s->shader->selector->screen->info.address32_hi);
      replacement = nir_pack_64_2x32_split(b, address, address32_hi);
      break;
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;

      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
       * which makes the pipeline statistic incorrect.
       */
      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);

      nir_def *buf =
         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_atomic_add_gen_prim_count_amd:
   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_GS_QUERY_BUF, 4);

      unsigned stream = nir_intrinsic_stream_id(intrin);
      unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) :
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives);

      nir_def *prim_count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_load_debug_log_desc_amd:
      replacement = si_nir_load_internal_binding(b, args, SI_RING_SHADER_LOG, 4);
      break;
   case nir_intrinsic_load_ring_attr_amd:
      replacement = build_attr_ring_desc(b, shader, args);
      break;
   case nir_intrinsic_load_force_vrs_rates_amd:
      if (sel->screen->info.gfx_level >= GFX11) {
         /* Bits [2:5] = VRS rate
          *
          * The range is [0, 15].
          *
          * If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
          */
         replacement = nir_imm_int(b, V_0283D0_VRS_SHADING_RATE_4X4 << 2);
      } else {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *   -1: 2x finer shading rate (sample shading, not directional)
          *   -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         replacement = nir_imm_int(b, (1 << 2) | (1 << 4));
      }
      break;
   case nir_intrinsic_load_sample_positions_amd: {
      /* Sample locations are packed in 2 user SGPRs, 4 bits per component. */
      nir_def *sample_id = intrin->src[0].ssa;
      nir_def *sample_locs =
         nir_pack_64_2x32_split(b, ac_nir_load_arg(b, &s->args->ac, s->args->sample_locs[0]),
                                ac_nir_load_arg(b, &s->args->ac, s->args->sample_locs[1]));
      sample_locs = nir_ushr(b, sample_locs, nir_imul_imm(b, sample_id, 8));
      sample_locs = nir_u2u32(b, sample_locs);
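      /* Each coordinate is a 4-bit fixed-point sub-pixel offset in 1/16ths of a
       * pixel; convert it to a float in [0, 1).
       */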
      nir_def *sample_pos = nir_vec2(b, nir_iand_imm(b, sample_locs, 0xf),
                                     nir_ubfe_imm(b, sample_locs, 4, 4));
      replacement = nir_fmul_imm(b, nir_u2f32(b, sample_pos), 1.0 / 16);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      assert(s->tess_offchip_ring);
      nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0);
      addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size);
      replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0);
      break;
   }
   case nir_intrinsic_load_alpha_reference_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
      break;
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1: {
      uint32_t colors_read = sel->info.colors_read;

      int start, offset;
      if (intrin->intrinsic == nir_intrinsic_load_color0) {
         start = 0;
         offset = 0;
      } else {
         start = 4;
         offset = util_bitcount(colors_read & 0xf);
      }

      nir_def *color[4];
      for (int i = 0; i < 4; i++) {
         if (colors_read & BITFIELD_BIT(start + i))
            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);
         else
            color[i] = nir_undef(b, 1, 32);
      }

      replacement = nir_vec(b, color, 4);
      break;
   }
   case nir_intrinsic_load_point_coord_maybe_flipped: {
      /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */
      nir_def *baryc = intrin->src[0].ssa;
      replacement = nir_load_interpolated_input(b, 2, 32, baryc, nir_imm_int(b, 0),
                                                .base = si_get_ps_num_interp(shader),
                                                .component = 2);
      break;
   }
   case nir_intrinsic_load_poly_line_smooth_enabled:
      replacement = nir_imm_bool(b, key->ps.mono.poly_line_smoothing);
      break;
   case nir_intrinsic_load_initial_edgeflags_amd: {
      unsigned output_prim = si_get_output_prim_simplified(sel, &shader->key);

      /* Points, lines, and rectangles don't need edge flags. */
      if (output_prim == MESA_PRIM_POINTS || output_prim == MESA_PRIM_LINES ||
          output_prim == SI_PRIM_RECTANGLE_LIST) {
         replacement = nir_imm_int(b, 0);
      } else if (stage == MESA_SHADER_VERTEX) {
         if (sel->screen->info.gfx_level >= GFX12) {
            replacement = nir_iand_imm(b, ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]),
                                       ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
         } else {
            /* Use the following trick to extract the edge flags:
             *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
             *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
             *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
             */
            nir_def *tmp = ac_nir_load_arg(b, &args->ac, args->ac.gs_invocation_id);
            tmp = nir_iand_imm(b, tmp, 0x700);
            tmp = nir_imul_imm(b, tmp, 0x80402);
            replacement = nir_iand_imm(b, tmp, 0x20080200);
         }
      } else {
         /* TES and GS: Edge flags are always enabled by the rasterizer state when polygon mode is
          * enabled, so set all edge flags to 1 for triangles.
          */
         replacement = nir_imm_int(b, ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
      }
      break;
   }
   case nir_intrinsic_load_ring_esgs_amd:
      assert(s->esgs_ring);
      replacement = s->esgs_ring;
      break;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      assert(s->tess_offchip_ring);
      replacement = s->tess_offchip_ring;
      break;
   case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_bool(b, key->ge.opt.tes_reads_tess_factors);
      } else {
         replacement = nir_ine_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 31, 1), 0);
      }
      break;
   case nir_intrinsic_load_tcs_primitive_mode_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.opt.tes_prim_mode);
      } else {
         if (b->shader->info.tess._primitive_mode != TESS_PRIMITIVE_UNSPECIFIED)
            replacement = nir_imm_int(b, b->shader->info.tess._primitive_mode);
         else
            replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 29, 2);
      }
      break;
   case nir_intrinsic_load_ring_gsvs_amd: {
      unsigned stream_id = nir_intrinsic_stream_id(intrin);
      /* Unused nir_load_ring_gsvs_amd may not be eliminated yet. */
      replacement = s->gsvs_ring[stream_id] ?
         s->gsvs_ring[stream_id] : nir_undef(b, 4, 32);
      break;
   }
   case nir_intrinsic_load_user_data_amd: {
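      /* Up to 8 dwords of user data are passed in two 4-dword SGPR groups;
       * pad the low vec4 to 8 components and append the high group if present.
       */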
      nir_def *low_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[0]);
      replacement = nir_pad_vector(b, low_vec4, 8);

      if (args->cs_user_data[1].used && intrin->def.num_components > 4) {
         nir_def *high_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[1]);
         for (unsigned i = 0; i < high_vec4->num_components; i++)
            replacement = nir_vector_insert_imm(b, replacement, nir_channel(b, high_vec4, i), 4 + i);
      }
      break;
   }
   case nir_intrinsic_load_fbfetch_image_fmask_desc_amd:
      STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);
      replacement = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0_FMASK, 8);
      break;
   case nir_intrinsic_load_fbfetch_image_desc_amd:
      STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
      replacement = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0, 8);
      break;
   case nir_intrinsic_load_polygon_stipple_buffer_amd:
      replacement = si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);
      break;
   default:
      return false;
   }

   if (replacement)
      nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(instr);
   nir_instr_free(instr);

   return true;
}

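/* Lower radeonsi-specific ABI intrinsics to loads from user SGPRs and the
 * driver's internal bindings.
 */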
bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
{
   struct lower_abi_state state = {
      .shader = shader,
      .args = args,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b = nir_builder_create(impl);

   preload_reusable_variables(&b, &state);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_intrinsic)
            progress |= lower_intrinsic(&b, instr, &state);
      }
   }

   nir_metadata preserved = progress ?
      nir_metadata_control_flow :
      nir_metadata_all;
   nir_metadata_preserve(impl, preserved);

   return progress;
}