/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"

#include "ac_nir.h"
#include "si_pipe.h"
#include "si_query.h"
#include "si_state.h"
#include "si_shader_internal.h"

struct lower_abi_state {
   struct si_shader *shader;
   struct si_shader_args *args;

   nir_def *esgs_ring;
   nir_def *tess_offchip_ring;
   nir_def *gsvs_ring[4];
};

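/* Unpack a bitfield from the vs_state_bits user SGPR, using the FIELD__SHIFT
 * and FIELD__MASK register definitions to locate it.
 */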
#define GET_FIELD_NIR(field) \
   ac_nir_unpack_arg(b, &args->ac, args->vs_state_bits, \
                     field##__SHIFT, util_bitcount(field##__MASK))

nir_def *si_nir_load_internal_binding(nir_builder *b, struct si_shader_args *args,
                                      unsigned slot, unsigned num_components)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->internal_bindings);
   return nir_load_smem_amd(b, num_components, addr, nir_imm_int(b, slot * 16));
}

static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                     struct si_shader_args *args)
{
   struct si_shader_selector *sel = shader->selector;

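   /* Blit shaders pass the attribute ring address in their last blit input
    * SGPR; all other cases use the dedicated gs_attr_address argument.
    */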
   nir_def *attr_address =
      sel->stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd ?
      ac_nir_load_arg_at_offset(b, &args->ac, args->vs_blit_inputs,
                                sel->info.base.vs.blit_sgprs_amd - 1) :
      ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

   unsigned stride = 16 * shader->info.nr_param_exports;
   nir_def *comp[] = {
      attr_address,
      nir_imm_int(b, S_008F04_BASE_ADDRESS_HI(sel->screen->info.address32_hi) |
                  S_008F04_STRIDE(stride) |
                  S_008F04_SWIZZLE_ENABLE_GFX11(3) /* 16B */),
      nir_imm_int(b, 0xffffffff),
      nir_imm_int(b, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
                  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
                  S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) |
                  S_008F0C_INDEX_STRIDE(2) /* 32 elements */),
   };

   return nir_vec(b, comp, 4);
}

static nir_def *
fetch_framebuffer(nir_builder *b, struct si_shader_args *args,
                  struct si_shader_selector *sel, union si_shader_key *key)
{
   /* Load the image descriptor. */
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);

   nir_def *zero = nir_imm_zero(b, 1, 32);
   nir_def *undef = nir_undef(b, 1, 32);

   unsigned chan = 0;
   nir_def *vec[4] = {undef, undef, undef, undef};

   vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 16);

   if (!key->ps.mono.fbfetch_is_1D)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 16);

   /* Get the current render target layer index. */
   if (key->ps.mono.fbfetch_layered)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 11);

   nir_def *coords = nir_vec(b, vec, 4);

   enum glsl_sampler_dim dim;
   if (key->ps.mono.fbfetch_msaa)
      dim = GLSL_SAMPLER_DIM_MS;
   else if (key->ps.mono.fbfetch_is_1D)
      dim = GLSL_SAMPLER_DIM_1D;
   else
      dim = GLSL_SAMPLER_DIM_2D;

   nir_def *sample_id;
   if (key->ps.mono.fbfetch_msaa) {
      sample_id = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 8, 4);

      if (sel->screen->info.gfx_level < GFX11 &&
          !(sel->screen->debug_flags & DBG(NO_FMASK))) {
         nir_def *desc =
            si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0_FMASK, 8);

         nir_def *fmask =
            nir_bindless_image_fragment_mask_load_amd(
               b, desc, coords,
               .image_dim = dim,
               .image_array = key->ps.mono.fbfetch_layered,
               .access = ACCESS_CAN_REORDER);

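         /* The FMASK value packs a 4-bit fragment index per sample; select
          * the entry for this sample (only 3 bits are read, see the EQAA
          * note below).
          */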
         nir_def *offset = nir_ishl_imm(b, sample_id, 2);
         /* 3 for EQAA handling, see lower_image_to_fragment_mask_load() */
         nir_def *width = nir_imm_int(b, 3);
         sample_id = nir_ubfe(b, fmask, offset, width);
      }
   } else {
      sample_id = zero;
   }

   nir_def *desc = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0, 8);

   return nir_bindless_image_load(b, 4, 32, desc, coords, sample_id, zero,
                                  .image_dim = dim,
                                  .image_array = key->ps.mono.fbfetch_layered,
                                  .access = ACCESS_CAN_REORDER);
}

static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
                                     struct si_shader_args *args)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->tes_offchip_addr);

   uint32_t rsrc3 =
      S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
      S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
      S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

   if (screen->info.gfx_level >= GFX11) {
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
   } else if (screen->info.gfx_level >= GFX10) {
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
               S_008F0C_RESOURCE_LEVEL(1);
   } else {
      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

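   /* Raw buffer descriptor over the whole offchip ring: stride 0 and
    * num_records = 0xffffffff, i.e. effectively unbounded.
    */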
   nir_def *comp[4] = {
      addr,
      nir_imm_int(b, S_008F04_BASE_ADDRESS_HI(screen->info.address32_hi)),
      nir_imm_int(b, 0xffffffff),
      nir_imm_int(b, rsrc3),
   };

   return nir_vec(b, comp, 4);
}

static nir_def *build_esgs_ring_desc(nir_builder *b, enum amd_gfx_level gfx_level,
                                     struct si_shader_args *args)
{
   nir_def *desc = si_nir_load_internal_binding(b, args, SI_RING_ESGS, 4);

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY)
      return desc;

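   /* The ES side writes the ring swizzled across threads: 4-byte elements,
    * index stride 64, with the thread ID added to the index.
    */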
   nir_def *vec[4];
   for (int i = 0; i < 4; i++)
      vec[i] = nir_channel(b, desc, i);

   vec[1] = nir_ior_imm(b, vec[1], S_008F04_SWIZZLE_ENABLE_GFX6(1));
   vec[3] = nir_ior_imm(b, vec[3],
                        S_008F0C_ELEMENT_SIZE(1) |
                        S_008F0C_INDEX_STRIDE(3) |
                        S_008F0C_ADD_TID_ENABLE(1));

   /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
   if (gfx_level == GFX8)
      vec[3] = nir_iand_imm(b, vec[3], C_008F0C_DATA_FORMAT);

   return nir_vec(b, vec, 4);
}

static void build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   if (s->shader->is_gs_copy_shader) {
      s->gsvs_ring[0] = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 4);
   } else if (sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      nir_def *base_addr = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 2);
      base_addr = nir_pack_64_2x32(b, base_addr);

      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */

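      /* Build one descriptor per vertex stream; each stream's region starts
       * right after the previous one's (stride * num_records bytes).
       */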
      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components = sel->info.num_stream_output_components[stream];
         if (!num_components)
            continue;

         nir_def *desc[4];
         desc[0] = nir_unpack_64_2x32_split_x(b, base_addr);
         desc[1] = nir_unpack_64_2x32_split_y(b, base_addr);

         unsigned stride = 4 * num_components * sel->info.base.gs.vertices_out;
         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         desc[1] = nir_ior_imm(
            b, desc[1], S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE_GFX6(1));

         unsigned num_records = s->shader->wave_size;
         desc[2] = nir_imm_int(b, num_records);

         uint32_t rsrc3 =
            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
            S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
            S_008F0C_ADD_TID_ENABLE(1);

         if (sel->screen->info.gfx_level >= GFX10) {
            rsrc3 |=
               S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
               S_008F0C_RESOURCE_LEVEL(1);
         } else {
            /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
            unsigned data_format =
               sel->screen->info.gfx_level == GFX8 || sel->screen->info.gfx_level == GFX9 ?
               0 : V_008F0C_BUF_DATA_FORMAT_32;

            rsrc3 |=
               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(data_format) |
               S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
         }

         desc[3] = nir_imm_int(b, rsrc3);

         s->gsvs_ring[stream] = nir_vec(b, desc, 4);

         /* next stream's desc addr */
         base_addr = nir_iadd_imm(b, base_addr, stride * num_records);
      }
   }
}

static void preload_reusable_variables(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

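   /* Build the ring descriptors once at the top of the shader so every
    * lowered intrinsic can reuse them.
    */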
   b->cursor = nir_before_impl(b->impl);

   if (sel->screen->info.gfx_level <= GFX8 && sel->stage <= MESA_SHADER_GEOMETRY &&
       (key->ge.as_es || sel->stage == MESA_SHADER_GEOMETRY)) {
      s->esgs_ring = build_esgs_ring_desc(b, sel->screen->info.gfx_level, s->args);
   }

   if (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_TESS_EVAL)
      s->tess_offchip_ring = build_tess_ring_desc(b, sel->screen, s->args);

   build_gsvs_ring_desc(b, s);
}

static nir_def *get_num_vertices_per_prim(nir_builder *b, struct lower_abi_state *s)
{
   struct si_shader_args *args = s->args;
   unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(s->shader);

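   /* If the count isn't known at compile time, GS_STATE_OUTPRIM holds the
    * output primitive type (0 = points, 1 = lines, 2 = triangles), so the
    * vertex count is the type + 1.
    */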
   if (num_vertices)
      return nir_imm_int(b, num_vertices);
   else
      return nir_iadd_imm(b, GET_FIELD_NIR(GS_STATE_OUTPRIM), 1);
}

static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   struct si_shader *shader = s->shader;
   struct si_shader_args *args = s->args;
   struct si_shader_selector *sel = shader->selector;
   union si_shader_key *key = &shader->key;
   gl_shader_stage stage = sel->stage;

   b->cursor = nir_before_instr(instr);

   nir_def *replacement = NULL;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_first_vertex:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      break;
   case nir_intrinsic_load_base_vertex: {
      nir_def *indexed = GET_FIELD_NIR(VS_STATE_INDEXED);
      indexed = nir_i2b(b, indexed);

      nir_def *base_vertex = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      replacement = nir_bcsel(b, indexed, base_vertex, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_workgroup_size: {
      assert(sel->info.base.workgroup_size_variable && sel->info.uses_variable_block_size);

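      /* The variable workgroup size is packed into a single SGPR: 10 bits
       * per dimension.
       */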
      nir_def *block_size = ac_nir_load_arg(b, &args->ac, args->block_size);
      nir_def *comp[] = {
         nir_ubfe_imm(b, block_size, 0, 10),
         nir_ubfe_imm(b, block_size, 10, 10),
         nir_ubfe_imm(b, block_size, 20, 10),
      };
      replacement = nir_vec(b, comp, 3);
      break;
   }
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_tess_level_inner_default: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_HS_CONST_DEFAULT_TESS_LEVELS, 4);
      unsigned num_components = intrin->def.num_components;
      unsigned offset =
         intrin->intrinsic == nir_intrinsic_load_tess_level_inner_default ? 16 : 0;
      replacement = nir_load_ubo(b, num_components, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 5);
      else if (stage == MESA_SHADER_TESS_EVAL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
      else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
      break;
   case nir_intrinsic_load_sample_mask_in:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage);
      break;
   case nir_intrinsic_load_lshs_vertex_stride_amd:
      if (stage == MESA_SHADER_VERTEX)
         replacement = nir_imm_int(b, sel->info.lshs_vertex_stride);
      else if (stage == MESA_SHADER_TESS_CTRL)
         replacement = sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic ?
            nir_imm_int(b, key->ge.part.tcs.ls->info.lshs_vertex_stride) :
            nir_ishl_imm(b, GET_FIELD_NIR(VS_STATE_LS_OUT_VERTEX_SIZE), 2);
      else
         unreachable("no nir_load_lshs_vertex_stride_amd");
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
      } else {
         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
   case nir_intrinsic_load_hs_out_patch_data_offset_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 16, 16);
      break;
   case nir_intrinsic_load_ring_tess_offchip_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset);
      break;
   case nir_intrinsic_load_ring_es2gs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.es2gs_offset);
      break;
   case nir_intrinsic_load_clip_half_line_width_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 2, addr, nir_imm_int(b, 32));
      break;
   }
   case nir_intrinsic_load_viewport_xy_scale_and_offset: {
      bool prim_is_lines = key->ge.opt.ngg_culling & SI_NGG_CULL_LINES;
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      unsigned offset = prim_is_lines ? 16 : 0;
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, offset));
      break;
   }
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
      replacement = get_num_vertices_per_prim(b, s);
      break;
   case nir_intrinsic_load_cull_ccw_amd:
      /* radeonsi embeds the cw/ccw info into the front/back-face culling enables */
      replacement = nir_imm_false(b);
      break;
   case nir_intrinsic_load_cull_any_enabled_amd:
      replacement = nir_imm_bool(b, !!key->ge.opt.ngg_culling);
      break;
   case nir_intrinsic_load_cull_back_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE);
      break;
   case nir_intrinsic_load_cull_front_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE);
      break;
   case nir_intrinsic_load_cull_small_prim_precision_amd: {
      nir_def *small_prim_precision =
         key->ge.opt.ngg_culling & SI_NGG_CULL_LINES ?
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION_NO_AA) :
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION);

      /* The field holds the low 4 bits of the float exponent; OR in 0x70 and
       * shift into the exponent position to rebuild the precision as the
       * float 2^(field - 15).
       */
      small_prim_precision = nir_ior_imm(b, small_prim_precision, 0x70);
      replacement = nir_ishl_imm(b, small_prim_precision, 23);
      break;
   }
   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
      unsigned mask = SI_NGG_CULL_LINES | SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
      replacement = nir_imm_bool(b, (key->ge.opt.ngg_culling & mask) != SI_NGG_CULL_LINES);
      break;
   }
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
      replacement = nir_bcsel(b, nir_i2b(b, GET_FIELD_NIR(GS_STATE_PROVOKING_VTX_FIRST)),
                              nir_imm_int(b, 0),
                              nir_iadd_imm(b, get_num_vertices_per_prim(b, s), -1));
      break;
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_PIPELINE_STATS_EMU));
      break;
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_STREAMOUT_QUERY_ENABLED));
      break;
   case nir_intrinsic_load_clamp_vertex_color_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(VS_STATE_CLAMP_VERTEX_COLOR));
      break;
   case nir_intrinsic_load_user_clip_plane: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_VS_CONST_CLIP_PLANES, 4);
      unsigned offset = nir_intrinsic_ucp_id(intrin) * 16;
      replacement = nir_load_ubo(b, 4, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_streamout_buffer_amd: {
      unsigned slot = SI_VS_STREAMOUT_BUF0 + nir_intrinsic_base(intrin);
      replacement = si_nir_load_internal_binding(b, args, slot, 4);
      break;
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;

      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
       * which makes the pipeline statistic incorrect.
       */
      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);

      nir_def *buf =
         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_atomic_add_gen_prim_count_amd:
   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_GS_QUERY_BUF, 4);

      unsigned stream = nir_intrinsic_stream_id(intrin);
      unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) :
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives);

      nir_def *prim_count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_load_ring_attr_amd:
      replacement = build_attr_ring_desc(b, shader, args);
      break;
   case nir_intrinsic_load_ring_attr_offset_amd: {
      nir_def *offset = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_attr_offset, 0, 15);
      replacement = nir_ishl_imm(b, offset, 9);
      break;
   }
   case nir_intrinsic_load_ring_gs2vs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs2vs_offset);
      break;
   case nir_intrinsic_load_streamout_config_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_config);
      break;
   case nir_intrinsic_load_streamout_write_index_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_write_index);
      break;
   case nir_intrinsic_load_streamout_offset_amd:
      replacement =
         ac_nir_load_arg(b, &args->ac, args->ac.streamout_offset[nir_intrinsic_base(intrin)]);
      break;
   case nir_intrinsic_load_force_vrs_rates_amd:
      if (sel->screen->info.gfx_level >= GFX11) {
         /* Bits [2:5] = VRS rate
          *
          * The range is [0, 15].
          *
          * If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
          */
         replacement = nir_imm_int(b, V_0283D0_VRS_SHADING_RATE_4X4 << 2);
      } else {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *  -1: 2x finer shading rate (sample shading, not directional)
          *  -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         replacement = nir_imm_int(b, (1 << 2) | (1 << 4));
      }
      break;
   case nir_intrinsic_load_barycentric_at_sample: {
      unsigned mode = nir_intrinsic_interp_mode(intrin);

      if (key->ps.mono.interpolate_at_sample_force_center) {
         replacement = nir_load_barycentric_pixel(b, 32, .interp_mode = mode);
      } else {
         nir_def *sample_id = intrin->src[0].ssa;
         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
         nir_def *offset = nir_ishl_imm(b, sample_id, 3);

         nir_def *buf = si_nir_load_internal_binding(b, args, SI_PS_CONST_SAMPLE_POSITIONS, 4);
         nir_def *sample_pos = nir_load_ubo(b, 2, 32, buf, offset, .range = ~0);

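         /* Sample positions are in [0, 1]; recenter them around the pixel
          * center before interpolating at the offset.
          */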
         sample_pos = nir_fadd_imm(b, sample_pos, -0.5);

         replacement = nir_load_barycentric_at_offset(b, 32, sample_pos, .interp_mode = mode);
      }
      break;
   }
   case nir_intrinsic_load_output: {
      nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

      /* not fbfetch */
      if (!(stage == MESA_SHADER_FRAGMENT && sem.fb_fetch_output))
         return false;

      /* Ignore src0, because KHR_blend_func_extended disallows multiple render targets. */

      replacement = fetch_framebuffer(b, args, sel, key);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      assert(s->tess_offchip_ring);
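      /* The tess-factor ring is placed right after the offchip ring, so reuse
       * its descriptor with the base address (dword 0) bumped by the ring size.
       */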
      nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0);
      addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size);
      replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tcs_factor_offset);
      break;
   case nir_intrinsic_load_alpha_reference_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
      break;
   case nir_intrinsic_load_front_face:
      if (!key->ps.opt.force_front_face_input)
         return false;
      replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1);
      break;
   case nir_intrinsic_load_barycentric_optimize_amd: {
      nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask);
      /* enabled when bit 31 is set */
      replacement = nir_ilt_imm(b, prim_mask, 0);
      break;
   }
   case nir_intrinsic_load_layer_id:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 13);
      break;
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1: {
      uint32_t colors_read = sel->info.colors_read;

      int start, offset;
      if (intrin->intrinsic == nir_intrinsic_load_color0) {
         start = 0;
         offset = 0;
      } else {
         start = 4;
         offset = util_bitcount(colors_read & 0xf);
      }

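      /* Color inputs are packed: only the components that are actually read
       * occupy slots starting at color_start, with color1's components placed
       * right after color0's.
       */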
      nir_def *color[4];
      for (int i = 0; i < 4; i++) {
         if (colors_read & BITFIELD_BIT(start + i)) {
            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);

            nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr),
                                    SI_VECTOR_ARG_IS_COLOR |
                                    SI_VECTOR_ARG_COLOR_COMPONENT(start + i));
         } else {
            color[i] = nir_undef(b, 1, 32);
         }
      }

      replacement = nir_vec(b, color, 4);
      break;
   }
   case nir_intrinsic_load_point_coord_maybe_flipped: {
      nir_def *interp_param =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NONE);

      /* Load the point coordinates (x, y), which the hw writes after the interpolated inputs. */
      replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0),
                                                .base = si_get_ps_num_interp(shader),
                                                .component = 2,
                                                /* This tells si_nir_scan_shader that it's PARAM_GEN */
                                                .io_semantics.no_varying = 1);
      break;
   }
   case nir_intrinsic_load_poly_line_smooth_enabled:
      replacement = nir_imm_bool(b, key->ps.mono.poly_line_smoothing);
      break;
   case nir_intrinsic_load_gs_vertex_offset_amd: {
      unsigned base = nir_intrinsic_base(intrin);
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[base]);
      break;
   }
   case nir_intrinsic_load_merged_wave_info_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.merged_wave_info);
      break;
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 12, 9);
      break;
   case nir_intrinsic_load_workgroup_num_input_primitives_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 22, 9);
      break;
   case nir_intrinsic_load_initial_edgeflags_amd:
      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES ||
          (shader->selector->stage == MESA_SHADER_VERTEX &&
           shader->selector->info.base.vs.blit_sgprs_amd)) {
         /* Line primitives and blits don't need edge flags. */
         replacement = nir_imm_int(b, 0);
      } else if (shader->selector->stage == MESA_SHADER_VERTEX) {
         /* Use the following trick to extract the edge flags:
          *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
          *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
          *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
          */
         nir_def *tmp = ac_nir_load_arg(b, &args->ac, args->ac.gs_invocation_id);
         tmp = nir_iand_imm(b, tmp, 0x700);
         tmp = nir_imul_imm(b, tmp, 0x80402);
         replacement = nir_iand_imm(b, tmp, 0x20080200);
      } else {
         /* The shader doesn't know when polygon mode is enabled, and edge flags
          * are always used when it is, so we must return valid edge flags
          * whenever the primitive type isn't lines and we aren't blitting.
          */
         replacement = nir_imm_int(b, ac_get_all_edge_flag_bits());
      }
      break;
   case nir_intrinsic_load_packed_passthrough_primitive_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]);
      break;
   case nir_intrinsic_load_ordered_id_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 0, 12);
      break;
   case nir_intrinsic_load_ring_esgs_amd:
      assert(s->esgs_ring);
      replacement = s->esgs_ring;
      break;
   case nir_intrinsic_load_tess_rel_patch_id_amd:
      /* LLVM needs to replace the patch id arg, so this is handled in the LLVM backend instead. */
      if (!sel->screen->use_aco)
         return false;

      if (stage == MESA_SHADER_TESS_CTRL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_rel_ids, 0, 8);
      } else {
         assert(stage == MESA_SHADER_TESS_EVAL);
         replacement = ac_nir_load_arg(b, &args->ac, args->ac.tes_rel_patch_id);
      }
      break;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      assert(s->tess_offchip_ring);
      replacement = s->tess_offchip_ring;
      break;
   case nir_intrinsic_load_ring_gsvs_amd: {
      unsigned stream_id = nir_intrinsic_stream_id(intrin);
      /* Unused nir_load_ring_gsvs_amd may not have been eliminated yet. */
      replacement = s->gsvs_ring[stream_id] ?
         s->gsvs_ring[stream_id] : nir_undef(b, 4, 32);
      break;
   }
   case nir_intrinsic_load_user_data_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->cs_user_data);
      replacement = nir_pad_vec4(b, replacement);
      break;
   default:
      return false;
   }

   if (replacement)
      nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(instr);
   nir_instr_free(instr);

   return true;
}

static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_tex_instr *tex = nir_instr_as_tex(instr);
   const struct si_shader_selector *sel = s->shader->selector;
   enum amd_gfx_level gfx_level = sel->screen->info.gfx_level;

   b->cursor = nir_before_instr(instr);

   /* Section 8.23.1 (Depth Texture Comparison Mode) of the
    * OpenGL 4.5 spec says:
    *
    *    "If the texture's internal format indicates a fixed-point
    *     depth texture, then D_t and D_ref are clamped to the
    *     range [0, 1]; otherwise no clamping is performed."
    *
    * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
    * so the depth comparison value isn't clamped for Z16 and
    * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
    * an explicitly clamped 32-bit float format.
    */

   /* LLVM keeps non-uniform samplers as indices, so this can't be done in NIR there. */
   if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->screen->use_aco) {
      int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
      assert(samp_index >= 0 && comp_index >= 0);

      nir_def *sampler = tex->src[samp_index].src.ssa;
      nir_def *compare = tex->src[comp_index].src.ssa;
      /* Must have been lowered to a descriptor. */
      assert(sampler->num_components > 1);

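      /* The driver sets bit 29 of sampler dword 3 when the Z format was
       * promoted to Z32_FLOAT, i.e. when the comparison value must be clamped.
       */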
      nir_def *upgraded = nir_channel(b, sampler, 3);
      upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));

      nir_def *clamped = nir_fsat(b, compare);
      compare = nir_bcsel(b, upgraded, clamped, compare);

      nir_src_rewrite(&tex->src[comp_index].src, compare);
      return true;
   }

   return false;
}

bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
{
   struct lower_abi_state state = {
      .shader = shader,
      .args = args,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b = nir_builder_create(impl);

   preload_reusable_variables(&b, &state);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_intrinsic)
            progress |= lower_intrinsic(&b, instr, &state);
         else if (instr->type == nir_instr_type_tex)
            progress |= lower_tex(&b, instr, &state);
      }
   }

   nir_metadata preserved = progress ?
      nir_metadata_dominance | nir_metadata_block_index :
      nir_metadata_all;
   nir_metadata_preserve(impl, preserved);

   return progress;
}
800