/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"

#include "ac_nir.h"
#include "si_pipe.h"
#include "si_query.h"
#include "si_state.h"
#include "si_shader_internal.h"

struct lower_abi_state {
   struct si_shader *shader;
   struct si_shader_args *args;

   nir_def *esgs_ring;
   nir_def *tess_offchip_ring;
   nir_def *gsvs_ring[4];
};

#define GET_FIELD_NIR(field) \
   ac_nir_unpack_arg(b, &args->ac, args->vs_state_bits, \
                     field##__SHIFT, util_bitcount(field##__MASK))

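/* Load a descriptor from the internal bindings buffer. Each binding slot is
 * 16 bytes, so this reads "num_components" dwords at offset slot * 16.
 */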
nir_def *si_nir_load_internal_binding(nir_builder *b, struct si_shader_args *args,
                                      unsigned slot, unsigned num_components)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->internal_bindings);
   return nir_load_smem_amd(b, num_components, addr, nir_imm_int(b, slot * 16));
}

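/* Build the buffer descriptor for the attribute ring (GFX11 attribute exports).
 * The 32-bit base address comes from gs_attr_address, or from the last blit
 * SGPR for vertex shaders that use blit SGPRs; the stride is 16 bytes per
 * param export.
 */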
static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                     struct si_shader_args *args)
{
   struct si_shader_selector *sel = shader->selector;

   nir_def *attr_address =
      sel->stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd ?
      ac_nir_load_arg_at_offset(b, &args->ac, args->vs_blit_inputs,
                                sel->info.base.vs.blit_sgprs_amd - 1) :
      ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

   unsigned stride = 16 * shader->info.nr_param_exports;
   nir_def *comp[] = {
      attr_address,
      nir_imm_int(b, S_008F04_BASE_ADDRESS_HI(sel->screen->info.address32_hi) |
                     S_008F04_STRIDE(stride) |
                     S_008F04_SWIZZLE_ENABLE_GFX11(3) /* 16B */),
      nir_imm_int(b, 0xffffffff),
      nir_imm_int(b, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
                     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
                     S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) |
                     S_008F0C_INDEX_STRIDE(2) /* 32 elements */),
   };

   return nir_vec(b, comp, 4);
}

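/* Implement FBFETCH: load the current pixel (and sample/layer) from the bound
 * color buffer image. Before GFX11, unless FMASK is disabled, the sample
 * index is first remapped through the FMASK descriptor.
 */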
static nir_def *
fetch_framebuffer(nir_builder *b, struct si_shader_args *args,
                  struct si_shader_selector *sel, union si_shader_key *key)
{
   /* Load the image descriptor. */
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);

   nir_def *zero = nir_imm_zero(b, 1, 32);
   nir_def *undef = nir_undef(b, 1, 32);

   unsigned chan = 0;
   nir_def *vec[4] = {undef, undef, undef, undef};

   vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 16);

   if (!key->ps.mono.fbfetch_is_1D)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 16);

   /* Get the current render target layer index. */
   if (key->ps.mono.fbfetch_layered)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 11);

   nir_def *coords = nir_vec(b, vec, 4);

   enum glsl_sampler_dim dim;
   if (key->ps.mono.fbfetch_msaa)
      dim = GLSL_SAMPLER_DIM_MS;
   else if (key->ps.mono.fbfetch_is_1D)
      dim = GLSL_SAMPLER_DIM_1D;
   else
      dim = GLSL_SAMPLER_DIM_2D;

   nir_def *sample_id;
   if (key->ps.mono.fbfetch_msaa) {
      sample_id = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 8, 4);

      if (sel->screen->info.gfx_level < GFX11 &&
          !(sel->screen->debug_flags & DBG(NO_FMASK))) {
         nir_def *desc =
            si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0_FMASK, 8);

         nir_def *fmask =
            nir_bindless_image_fragment_mask_load_amd(
               b, desc, coords,
               .image_dim = dim,
               .image_array = key->ps.mono.fbfetch_layered,
               .access = ACCESS_CAN_REORDER);

         nir_def *offset = nir_ishl_imm(b, sample_id, 2);
         /* 3 for EQAA handling, see lower_image_to_fragment_mask_load() */
         nir_def *width = nir_imm_int(b, 3);
         sample_id = nir_ubfe(b, fmask, offset, width);
      }
   } else {
      sample_id = zero;
   }

   nir_def *desc = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0, 8);

   return nir_bindless_image_load(b, 4, 32, desc, coords, sample_id, zero,
                                  .image_dim = dim,
                                  .image_array = key->ps.mono.fbfetch_layered,
                                  .access = ACCESS_CAN_REORDER);
}

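/* Build the buffer descriptor for the tess offchip ring from the
 * tes_offchip_addr argument, using the raw 32-bit float format encoding
 * appropriate for the current gfx level.
 */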
static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
                                     struct si_shader_args *args)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->tes_offchip_addr);

   uint32_t rsrc3 =
      S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
      S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
      S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

   if (screen->info.gfx_level >= GFX11) {
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
   } else if (screen->info.gfx_level >= GFX10) {
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
               S_008F0C_RESOURCE_LEVEL(1);
   } else {
      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

   nir_def *comp[4] = {
      addr,
      nir_imm_int(b, S_008F04_BASE_ADDRESS_HI(screen->info.address32_hi)),
      nir_imm_int(b, 0xffffffff),
      nir_imm_int(b, rsrc3),
   };

   return nir_vec(b, comp, 4);
}

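/* Load the ESGS ring descriptor (only used on GFX8 and older). GS reads the
 * internal binding as-is; the ES stage overrides the descriptor for swizzled,
 * TID-indexed writes.
 */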
static nir_def *build_esgs_ring_desc(nir_builder *b, enum amd_gfx_level gfx_level,
                                     struct si_shader_args *args)
{
   nir_def *desc = si_nir_load_internal_binding(b, args, SI_RING_ESGS, 4);

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY)
      return desc;

   nir_def *vec[4];
   for (int i = 0; i < 4; i++)
      vec[i] = nir_channel(b, desc, i);

   vec[1] = nir_ior_imm(b, vec[1], S_008F04_SWIZZLE_ENABLE_GFX6(1));
   vec[3] = nir_ior_imm(b, vec[3],
                        S_008F0C_ELEMENT_SIZE(1) |
                        S_008F0C_INDEX_STRIDE(3) |
                        S_008F0C_ADD_TID_ENABLE(1));

   /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
   if (gfx_level == GFX8)
      vec[3] = nir_iand_imm(b, vec[3], C_008F0C_DATA_FORMAT);

   return nir_vec(b, vec, 4);
}

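/* Build the GSVS ring descriptors. The GS copy shader uses the internal
 * binding as-is; legacy (non-NGG) GS builds one swizzled descriptor per
 * vertex stream that has outputs.
 */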
static void build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   if (s->shader->is_gs_copy_shader) {
      s->gsvs_ring[0] = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 4);
   } else if (sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      nir_def *base_addr = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 2);
      base_addr = nir_pack_64_2x32(b, base_addr);

      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components = sel->info.num_stream_output_components[stream];
         if (!num_components)
            continue;

         nir_def *desc[4];
         desc[0] = nir_unpack_64_2x32_split_x(b, base_addr);
         desc[1] = nir_unpack_64_2x32_split_y(b, base_addr);

         unsigned stride = 4 * num_components * sel->info.base.gs.vertices_out;
         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         desc[1] = nir_ior_imm(
            b, desc[1], S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE_GFX6(1));

         unsigned num_records = s->shader->wave_size;
         desc[2] = nir_imm_int(b, num_records);

         uint32_t rsrc3 =
            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
            S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
            S_008F0C_ADD_TID_ENABLE(1);

         if (sel->screen->info.gfx_level >= GFX10) {
            rsrc3 |=
               S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
               S_008F0C_RESOURCE_LEVEL(1);
         } else {
            /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
            unsigned data_format =
               sel->screen->info.gfx_level == GFX8 || sel->screen->info.gfx_level == GFX9 ?
               0 : V_008F0C_BUF_DATA_FORMAT_32;

            rsrc3 |=
               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(data_format) |
               S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
         }

         desc[3] = nir_imm_int(b, rsrc3);

         s->gsvs_ring[stream] = nir_vec(b, desc, 4);

         /* next stream's desc addr */
         base_addr = nir_iadd_imm(b, base_addr, stride * num_records);
      }
   }
}

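/* Build the ring descriptors once at the top of the shader so they can be
 * reused by every lowered intrinsic below.
 */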
static void preload_reusable_variables(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   b->cursor = nir_before_impl(b->impl);

   if (sel->screen->info.gfx_level <= GFX8 && sel->stage <= MESA_SHADER_GEOMETRY &&
       (key->ge.as_es || sel->stage == MESA_SHADER_GEOMETRY)) {
      s->esgs_ring = build_esgs_ring_desc(b, sel->screen->info.gfx_level, s->args);
   }

   if (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_TESS_EVAL)
      s->tess_offchip_ring = build_tess_ring_desc(b, sel->screen, s->args);

   build_gsvs_ring_desc(b, s);
}

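/* Return the number of vertices per input primitive: a compile-time constant
 * when it is known, otherwise GS_STATE_OUTPRIM + 1 from the state bits.
 */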
static nir_def *get_num_vertices_per_prim(nir_builder *b, struct lower_abi_state *s)
{
   struct si_shader_args *args = s->args;
   unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(s->shader);

   if (num_vertices)
      return nir_imm_int(b, num_vertices);
   else
      return nir_iadd_imm(b, GET_FIELD_NIR(GS_STATE_OUTPRIM), 1);
}

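/* Lower one ABI intrinsic to loads from user SGPRs, system-value arguments,
 * internal bindings, or the preloaded ring descriptors. Returns true if the
 * instruction was replaced and removed.
 */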
static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   struct si_shader *shader = s->shader;
   struct si_shader_args *args = s->args;
   struct si_shader_selector *sel = shader->selector;
   union si_shader_key *key = &shader->key;
   gl_shader_stage stage = sel->stage;

   b->cursor = nir_before_instr(instr);

   nir_def *replacement = NULL;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_first_vertex:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      break;
   case nir_intrinsic_load_base_vertex: {
      nir_def *indexed = GET_FIELD_NIR(VS_STATE_INDEXED);
      indexed = nir_i2b(b, indexed);

      nir_def *base_vertex = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      replacement = nir_bcsel(b, indexed, base_vertex, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_workgroup_size: {
      assert(sel->info.base.workgroup_size_variable && sel->info.uses_variable_block_size);

      nir_def *block_size = ac_nir_load_arg(b, &args->ac, args->block_size);
      nir_def *comp[] = {
         nir_ubfe_imm(b, block_size, 0, 10),
         nir_ubfe_imm(b, block_size, 10, 10),
         nir_ubfe_imm(b, block_size, 20, 10),
      };
      replacement = nir_vec(b, comp, 3);
      break;
   }
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_tess_level_inner_default: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_HS_CONST_DEFAULT_TESS_LEVELS, 4);
      unsigned num_components = intrin->def.num_components;
      unsigned offset =
         intrin->intrinsic == nir_intrinsic_load_tess_level_inner_default ? 16 : 0;
      replacement = nir_load_ubo(b, num_components, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 5);
      else if (stage == MESA_SHADER_TESS_EVAL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
      } else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
      break;
   case nir_intrinsic_load_sample_mask_in:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage);
      break;
   case nir_intrinsic_load_lshs_vertex_stride_amd:
      if (stage == MESA_SHADER_VERTEX)
         replacement = nir_imm_int(b, sel->info.lshs_vertex_stride);
      else if (stage == MESA_SHADER_TESS_CTRL)
         replacement = sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic ?
            nir_imm_int(b, key->ge.part.tcs.ls->info.lshs_vertex_stride) :
            nir_ishl_imm(b, GET_FIELD_NIR(VS_STATE_LS_OUT_VERTEX_SIZE), 2);
      else
         unreachable("no nir_load_lshs_vertex_stride_amd");
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
      } else {
         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
   case nir_intrinsic_load_hs_out_patch_data_offset_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 16, 16);
      break;
   case nir_intrinsic_load_ring_tess_offchip_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset);
      break;
   case nir_intrinsic_load_ring_es2gs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.es2gs_offset);
      break;
   case nir_intrinsic_load_clip_half_line_width_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 2, addr, nir_imm_int(b, 32));
      break;
   }
   case nir_intrinsic_load_viewport_xy_scale_and_offset: {
      bool prim_is_lines = key->ge.opt.ngg_culling & SI_NGG_CULL_LINES;
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      unsigned offset = prim_is_lines ? 16 : 0;
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, offset));
      break;
   }
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
      replacement = get_num_vertices_per_prim(b, s);
      break;
   case nir_intrinsic_load_cull_ccw_amd:
      /* radeonsi embeds the cw/ccw info into the front/back face enabled bits */
      replacement = nir_imm_false(b);
      break;
   case nir_intrinsic_load_cull_any_enabled_amd:
      replacement = nir_imm_bool(b, !!key->ge.opt.ngg_culling);
      break;
   case nir_intrinsic_load_cull_back_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE);
      break;
   case nir_intrinsic_load_cull_front_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE);
      break;
   case nir_intrinsic_load_cull_small_prim_precision_amd: {
      nir_def *small_prim_precision =
         key->ge.opt.ngg_culling & SI_NGG_CULL_LINES ?
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION_NO_AA) :
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION);

      /* Extract the small prim precision. */
      small_prim_precision = nir_ior_imm(b, small_prim_precision, 0x70);
      replacement = nir_ishl_imm(b, small_prim_precision, 23);
      break;
   }
   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
      unsigned mask = SI_NGG_CULL_LINES | SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
      replacement = nir_imm_bool(b, (key->ge.opt.ngg_culling & mask) != SI_NGG_CULL_LINES);
      break;
   }
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
      replacement = nir_bcsel(b, nir_i2b(b, GET_FIELD_NIR(GS_STATE_PROVOKING_VTX_FIRST)),
                              nir_imm_int(b, 0),
                              nir_iadd_imm(b, get_num_vertices_per_prim(b, s), -1));
      break;
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_PIPELINE_STATS_EMU));
      break;
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_STREAMOUT_QUERY_ENABLED));
      break;
   case nir_intrinsic_load_clamp_vertex_color_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(VS_STATE_CLAMP_VERTEX_COLOR));
      break;
   case nir_intrinsic_load_user_clip_plane: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_VS_CONST_CLIP_PLANES, 4);
      unsigned offset = nir_intrinsic_ucp_id(intrin) * 16;
      replacement = nir_load_ubo(b, 4, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_streamout_buffer_amd: {
      unsigned slot = SI_VS_STREAMOUT_BUF0 + nir_intrinsic_base(intrin);
      replacement = si_nir_load_internal_binding(b, args, slot, 4);
      break;
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;

      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
       * which makes the pipeline statistic incorrect.
       */
      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);

      nir_def *buf =
         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_atomic_add_gen_prim_count_amd:
   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_GS_QUERY_BUF, 4);

      unsigned stream = nir_intrinsic_stream_id(intrin);
      unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) :
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives);

      nir_def *prim_count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_load_ring_attr_amd:
      replacement = build_attr_ring_desc(b, shader, args);
      break;
   case nir_intrinsic_load_ring_attr_offset_amd: {
      nir_def *offset = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_attr_offset, 0, 15);
      replacement = nir_ishl_imm(b, offset, 9);
      break;
   }
   case nir_intrinsic_load_ring_gs2vs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs2vs_offset);
      break;
   case nir_intrinsic_load_streamout_config_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_config);
      break;
   case nir_intrinsic_load_streamout_write_index_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_write_index);
      break;
   case nir_intrinsic_load_streamout_offset_amd:
      replacement =
         ac_nir_load_arg(b, &args->ac, args->ac.streamout_offset[nir_intrinsic_base(intrin)]);
      break;
   case nir_intrinsic_load_force_vrs_rates_amd:
      if (sel->screen->info.gfx_level >= GFX11) {
         /* Bits [2:5] = VRS rate
          *
          * The range is [0, 15].
          *
          * If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
          */
         replacement = nir_imm_int(b, V_0283D0_VRS_SHADING_RATE_4X4 << 2);
      } else {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *   -1: 2x finer shading rate (sample shading, not directional)
          *   -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         replacement = nir_imm_int(b, (1 << 2) | (1 << 4));
      }
      break;
   case nir_intrinsic_load_barycentric_at_sample: {
      unsigned mode = nir_intrinsic_interp_mode(intrin);

      if (key->ps.mono.interpolate_at_sample_force_center) {
         replacement = nir_load_barycentric_pixel(b, 32, .interp_mode = mode);
      } else {
         nir_def *sample_id = intrin->src[0].ssa;
         /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
         nir_def *offset = nir_ishl_imm(b, sample_id, 3);

         nir_def *buf = si_nir_load_internal_binding(b, args, SI_PS_CONST_SAMPLE_POSITIONS, 4);
         nir_def *sample_pos = nir_load_ubo(b, 2, 32, buf, offset, .range = ~0);

         sample_pos = nir_fadd_imm(b, sample_pos, -0.5);

         replacement = nir_load_barycentric_at_offset(b, 32, sample_pos, .interp_mode = mode);
      }
      break;
   }
   case nir_intrinsic_load_output: {
      nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

      /* not fbfetch */
      if (!(stage == MESA_SHADER_FRAGMENT && sem.fb_fetch_output))
         return false;

      /* Ignore src0, because KHR_blend_func_extended disallows multiple render targets. */

      replacement = fetch_framebuffer(b, args, sel, key);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      assert(s->tess_offchip_ring);
      nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0);
      addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size);
      replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tcs_factor_offset);
      break;
   case nir_intrinsic_load_alpha_reference_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
      break;
   case nir_intrinsic_load_front_face:
      if (!key->ps.opt.force_front_face_input)
         return false;
      replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1);
      break;
   case nir_intrinsic_load_barycentric_optimize_amd: {
      nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask);
      /* enabled when bit 31 is set */
      replacement = nir_ilt_imm(b, prim_mask, 0);
      break;
   }
   case nir_intrinsic_load_layer_id:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 13);
      break;
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1: {
      uint32_t colors_read = sel->info.colors_read;

      int start, offset;
      if (intrin->intrinsic == nir_intrinsic_load_color0) {
         start = 0;
         offset = 0;
      } else {
         start = 4;
         offset = util_bitcount(colors_read & 0xf);
      }

      nir_def *color[4];
      for (int i = 0; i < 4; i++) {
         if (colors_read & BITFIELD_BIT(start + i)) {
            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);

            nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr),
                                    SI_VECTOR_ARG_IS_COLOR |
                                    SI_VECTOR_ARG_COLOR_COMPONENT(start + i));
         } else {
            color[i] = nir_undef(b, 1, 32);
         }
      }

      replacement = nir_vec(b, color, 4);
      break;
   }
   case nir_intrinsic_load_point_coord_maybe_flipped: {
      nir_def *interp_param =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NONE);

      /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */
      replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0),
                                                .base = si_get_ps_num_interp(shader),
                                                .component = 2,
                                                /* This tells si_nir_scan_shader that it's PARAM_GEN */
                                                .io_semantics.no_varying = 1);
      break;
   }
   case nir_intrinsic_load_poly_line_smooth_enabled:
      replacement = nir_imm_bool(b, key->ps.mono.poly_line_smoothing);
      break;
   case nir_intrinsic_load_gs_vertex_offset_amd: {
      unsigned base = nir_intrinsic_base(intrin);
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[base]);
      break;
   }
   case nir_intrinsic_load_merged_wave_info_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.merged_wave_info);
      break;
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 12, 9);
      break;
   case nir_intrinsic_load_workgroup_num_input_primitives_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 22, 9);
      break;
   case nir_intrinsic_load_initial_edgeflags_amd:
      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES ||
          (shader->selector->stage == MESA_SHADER_VERTEX &&
           shader->selector->info.base.vs.blit_sgprs_amd)) {
         /* Line primitives and blits don't need edge flags. */
         replacement = nir_imm_int(b, 0);
      } else if (shader->selector->stage == MESA_SHADER_VERTEX) {
         /* Use the following trick to extract the edge flags:
          *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
          *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
          *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
          */
         nir_def *tmp = ac_nir_load_arg(b, &args->ac, args->ac.gs_invocation_id);
         tmp = nir_iand_imm(b, tmp, 0x700);
         tmp = nir_imul_imm(b, tmp, 0x80402);
         replacement = nir_iand_imm(b, tmp, 0x20080200);
      } else {
         /* Edge flags are always enabled when polygon mode is enabled, so we always have to
          * return valid edge flags if the primitive type is not lines and if we are not blitting
          * because the shader doesn't know when polygon mode is enabled.
          */
         replacement = nir_imm_int(b, ac_get_all_edge_flag_bits());
      }
      break;
   case nir_intrinsic_load_packed_passthrough_primitive_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]);
      break;
   case nir_intrinsic_load_ordered_id_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 0, 12);
      break;
   case nir_intrinsic_load_ring_esgs_amd:
      assert(s->esgs_ring);
      replacement = s->esgs_ring;
      break;
   case nir_intrinsic_load_tess_rel_patch_id_amd:
      /* LLVM needs to replace the patch id arg, so this has to be done in the LLVM backend. */
      if (!sel->screen->use_aco)
         return false;

      if (stage == MESA_SHADER_TESS_CTRL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_rel_ids, 0, 8);
      } else {
         assert(stage == MESA_SHADER_TESS_EVAL);
         replacement = ac_nir_load_arg(b, &args->ac, args->ac.tes_rel_patch_id);
      }
      break;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      assert(s->tess_offchip_ring);
      replacement = s->tess_offchip_ring;
      break;
   case nir_intrinsic_load_ring_gsvs_amd: {
      unsigned stream_id = nir_intrinsic_stream_id(intrin);
      /* Unused nir_load_ring_gsvs_amd may not be eliminated yet. */
      replacement = s->gsvs_ring[stream_id] ?
         s->gsvs_ring[stream_id] : nir_undef(b, 4, 32);
      break;
   }
   case nir_intrinsic_load_user_data_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->cs_user_data);
      replacement = nir_pad_vec4(b, replacement);
      break;
   default:
      return false;
   }

   if (replacement)
      nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(instr);
   nir_instr_free(instr);

   return true;
}

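/* For ACO on GFX8-9, clamp the shadow comparison value when the sampler
 * descriptor indicates that TC-compatible HTILE promoted the depth format
 * to Z32_FLOAT (see the spec quote in the function body).
 */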
static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_tex_instr *tex = nir_instr_as_tex(instr);
   const struct si_shader_selector *sel = s->shader->selector;
   enum amd_gfx_level gfx_level = sel->screen->info.gfx_level;

   b->cursor = nir_before_instr(instr);

   /* Section 8.23.1 (Depth Texture Comparison Mode) of the
    * OpenGL 4.5 spec says:
    *
    *    "If the texture’s internal format indicates a fixed-point
    *     depth texture, then D_t and D_ref are clamped to the
    *     range [0, 1]; otherwise no clamping is performed."
    *
    * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
    * so the depth comparison value isn't clamped for Z16 and
    * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
    * an explicitly clamped 32-bit float format.
    */

   /* LLVM keeps a non-uniform sampler as an index, so we can't do this in NIR. */
   if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->screen->use_aco) {
      int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
      assert(samp_index >= 0 && comp_index >= 0);

      nir_def *sampler = tex->src[samp_index].src.ssa;
      nir_def *compare = tex->src[comp_index].src.ssa;
      /* Must have been lowered to descriptor. */
      assert(sampler->num_components > 1);

      nir_def *upgraded = nir_channel(b, sampler, 3);
      upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));

      nir_def *clamped = nir_fsat(b, compare);
      compare = nir_bcsel(b, upgraded, clamped, compare);

      nir_src_rewrite(&tex->src[comp_index].src, compare);
      return true;
   }

   return false;
}

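/* Lower all radeonsi ABI intrinsics and texture instructions in the shader.
 * Returns true if any instruction was changed.
 */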
bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
{
   struct lower_abi_state state = {
      .shader = shader,
      .args = args,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b = nir_builder_create(impl);

   preload_reusable_variables(&b, &state);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_intrinsic)
            progress |= lower_intrinsic(&b, instr, &state);
         else if (instr->type == nir_instr_type_tex)
            progress |= lower_tex(&b, instr, &state);
      }
   }

   nir_metadata preserved = progress ?
      nir_metadata_dominance | nir_metadata_block_index :
      nir_metadata_all;
   nir_metadata_preserve(impl, preserved);

   return progress;
}