/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "ac_nir.h"
#include "nir_builder.h"

/*
 * Lower NIR cross-stage I/O intrinsics into the memory accesses that actually happen on the HW.
 *
 * These HW stages are only used when a Geometry Shader is present.
 * The Export Shader (ES) runs the SW stage before the GS; it can be either a VS or a TES.
 *
 * * GFX6-8:
 *   ES and GS are separate HW stages.
 *   I/O is passed between them through VRAM.
 * * GFX9+:
 *   ES and GS are merged into a single HW stage.
 *   I/O is passed between them through LDS.
 *
 */

typedef struct {
   /* Which hardware generation we're dealing with */
   enum amd_gfx_level gfx_level;

   /* I/O semantic -> real location used by lowering. */
   ac_nir_map_io_driver_location map_io;

   /* Stride of an ES invocation's outputs in the ESGS ring, in bytes. */
   unsigned esgs_itemsize;

   /* Enable the fix for triangle strip adjacency in the geometry shader. */
   bool gs_triangle_strip_adjacency_fix;
} lower_esgs_io_state;

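/* Load a value from a buffer, split into several dword (or smaller) loads.
 *
 * Each 32-bit (or smaller) chunk is loaded separately, "component_stride" bytes apart,
 * and the results are bit-packed back into a "num_components" x "bit_size" vector.
 */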
static nir_ssa_def *
emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                       unsigned component_stride, unsigned num_components, unsigned bit_size)
{
   unsigned total_bytes = num_components * bit_size / 8u;
   unsigned full_dwords = total_bytes / 4u;
   unsigned remaining_bytes = total_bytes - full_dwords * 4u;

   /* Accommodate the max number of split 64-bit loads. */
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];

   /* Assume that a 1x32-bit load is better than a 1x16-bit + 1x8-bit load. */
   if (remaining_bytes == 3) {
      remaining_bytes = 0;
      full_dwords++;
   }

   for (unsigned i = 0; i < full_dwords; ++i)
      comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
                                           .base = component_stride * i, .memory_modes = nir_var_shader_in);

   if (remaining_bytes)
      comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
                                                     .base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);

   return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
}

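/* Store a value to a buffer, split into several dword (or smaller) stores.
 *
 * Only the components selected by "writemask" are written. Each consecutive range of
 * enabled components is broken into chunks of at most 4 bytes that never straddle a
 * dword boundary, and each chunk is written with its own buffer store.
 */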
static void
emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                        unsigned component_stride, unsigned num_components, unsigned bit_size,
                        unsigned writemask, bool swizzled, bool slc)
{
   while (writemask) {
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      assert(start >= 0 && count >= 0);

      unsigned bytes = count * bit_size / 8u;
      unsigned start_byte = start * bit_size / 8u;

      while (bytes) {
         unsigned store_bytes = MIN2(bytes, 4u);
         if ((start_byte % 4) == 1 || (start_byte % 4) == 3)
            store_bytes = MIN2(store_bytes, 1);
         else if ((start_byte % 4) == 2)
            store_bytes = MIN2(store_bytes, 2);

         nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
         nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
                                    .base = start_byte, .memory_modes = nir_var_shader_out);

         start_byte += store_bytes;
         bytes -= store_bytes;
      }
   }
}

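/* Lower a store_output intrinsic in the ES stage (VS or TES running before GS) to the
 * memory access that implements ES->GS I/O on the target GPU generation.
 */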
static bool
lower_es_output_store(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   /* The ARB_shader_viewport_layer_array spec contains the
    * following issue:
    *
    *    2) What happens if gl_ViewportIndex or gl_Layer is
    *    written in the vertex shader and a geometry shader is
    *    present?
    *
    *    RESOLVED: The value written by the last vertex processing
    *    stage is used. If the last vertex processing stage
    *    (vertex, tessellation evaluation or geometry) does not
    *    statically assign to gl_ViewportIndex or gl_Layer, index
    *    or layer zero is assumed.
    *
    * Vulkan spec 15.7 Built-In Variables:
    *
    *   The last active pre-rasterization shader stage (in pipeline order)
    *   controls the Layer that is used. Outputs in previous shader stages
    *   are not used, even if the last stage fails to write the Layer.
    *
    *   The last active pre-rasterization shader stage (in pipeline order)
    *   controls the ViewportIndex that is used. Outputs in previous shader
    *   stages are not used, even if the last stage fails to write the
    *   ViewportIndex.
    *
    * So writes to those outputs in ES are simply ignored.
    */
   unsigned semantic = nir_intrinsic_io_semantics(intrin).location;
   if (semantic == VARYING_SLOT_LAYER || semantic == VARYING_SLOT_VIEWPORT) {
      nir_instr_remove(instr);
      return true;
   }

   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *io_off = ac_nir_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u, st->map_io);

   if (st->gfx_level <= GFX8) {
      /* GFX6-8: ES is a separate HW stage, data is passed from ES to GS in VRAM. */
      nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
      nir_ssa_def *es2gs_off = nir_build_load_ring_es2gs_offset_amd(b);
      emit_split_buffer_store(b, intrin->src[0].ssa, ring, io_off, es2gs_off, 4u,
                              intrin->src[0].ssa->num_components, intrin->src[0].ssa->bit_size,
                              write_mask, true, true);
   } else {
      /* GFX9+: ES is merged into GS, data is passed through LDS. */
      nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
      nir_ssa_def *off = nir_iadd(b, nir_imul_imm(b, vertex_idx, st->esgs_itemsize), io_off);
      nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
                             .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
   }

   nir_instr_remove(instr);
   return true;
}

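/* Load the ES->GS vertex offset for one input vertex of the current GS primitive.
 *
 * When the triangle strip adjacency fix is enabled, odd primitives select their
 * vertex offset using a rotated vertex index instead of the original one.
 */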
static nir_ssa_def *
gs_get_vertex_offset(nir_builder *b, lower_esgs_io_state *st, unsigned vertex_index)
{
   nir_ssa_def *origin = nir_build_load_gs_vertex_offset_amd(b, .base = vertex_index);
   if (!st->gs_triangle_strip_adjacency_fix)
      return origin;

   unsigned fixed_index;
   if (st->gfx_level < GFX9) {
      /* Rotate the vertex index by 2. */
      fixed_index = (vertex_index + 4) % 6;
   } else {
      /* This issue has been fixed for GFX10+. */
      assert(st->gfx_level == GFX9);
      /* The 6 vertex offsets are packed into 3 VGPRs on GFX9+. */
      fixed_index = (vertex_index + 2) % 3;
   }
   nir_ssa_def *fixed = nir_build_load_gs_vertex_offset_amd(b, .base = fixed_index);

   nir_ssa_def *prim_id = nir_load_primitive_id(b);
   /* Odd primitive IDs use the fixed offset. */
   nir_ssa_def *cond = nir_i2b(b, nir_iand_imm(b, prim_id, 1));
   return nir_bcsel(b, cond, fixed, origin);
}

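/* GFX6-8: select the ES->GS vertex offset for a possibly non-constant vertex index.
 *
 * The offsets live in separate VGPRs, so a non-constant index is lowered into a
 * chain of bcsel instructions over all input vertices.
 */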
static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx6(nir_builder *b, lower_esgs_io_state *st,
                                       nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src))
      return gs_get_vertex_offset(b, st, nir_src_as_uint(*vertex_src));

   nir_ssa_def *vertex_offset = gs_get_vertex_offset(b, st, 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; ++i) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = gs_get_vertex_offset(b, st, i);
      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return vertex_offset;
}

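/* GFX9+: select the ES->GS vertex offset for a possibly non-constant vertex index.
 *
 * Two 16-bit vertex offsets are packed into each VGPR, so the selected value is
 * additionally unpacked from the low or high half of the 32-bit register.
 */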
static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx9(nir_builder *b, lower_esgs_io_state *st,
                                       nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src)) {
      unsigned vertex = nir_src_as_uint(*vertex_src);
      return nir_ubfe(b, gs_get_vertex_offset(b, st, vertex / 2u),
                      nir_imm_int(b, (vertex & 1u) * 16u), nir_imm_int(b, 16u));
   }

   nir_ssa_def *vertex_offset = gs_get_vertex_offset(b, st, 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; i++) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = gs_get_vertex_offset(b, st, i / 2u * 2u);
      if (i % 2u)
         elem = nir_ishr_imm(b, elem, 16u);

      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return nir_iand_imm(b, vertex_offset, 0xffffu);
}

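/* Calculate the offset at which a GS per-vertex input is found in memory.
 *
 * This is the sum of the vertex's ES->GS offset and the I/O offset of the addressed
 * slot, converted to bytes. On GFX6-8 the slot stride also includes the wave size,
 * because the ESGS ring in VRAM is swizzled per lane.
 */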
static nir_ssa_def *
gs_per_vertex_input_offset(nir_builder *b,
                           lower_esgs_io_state *st,
                           nir_intrinsic_instr *instr)
{
   nir_src *vertex_src = nir_get_io_arrayed_index_src(instr);
   nir_ssa_def *vertex_offset = st->gfx_level >= GFX9
      ? gs_per_vertex_input_vertex_offset_gfx9(b, st, vertex_src)
      : gs_per_vertex_input_vertex_offset_gfx6(b, st, vertex_src);

   unsigned base_stride = st->gfx_level >= GFX9 ? 1 : 64 /* Wave size on GFX6-8 */;
   nir_ssa_def *io_off = ac_nir_calc_io_offset(b, instr, nir_imm_int(b, base_stride * 4u), base_stride, st->map_io);
   nir_ssa_def *off = nir_iadd(b, io_off, vertex_offset);
   return nir_imul_imm(b, off, 4u);
}

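/* Lower a load_per_vertex_input intrinsic in the GS to an LDS load (GFX9+)
 * or a split load from the ESGS ring in VRAM (GFX6-8).
 */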
static nir_ssa_def *
lower_gs_per_vertex_input_load(nir_builder *b,
                               nir_instr *instr,
                               void *state)
{
   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   nir_ssa_def *off = gs_per_vertex_input_offset(b, st, intrin);

   if (st->gfx_level >= GFX9)
      return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
                                   .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);

   unsigned wave_size = 64u; /* GFX6-8 only support wave64 */
   nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
   return emit_split_buffer_load(b, ring, off, nir_imm_zero(b, 1, 32), 4u * wave_size,
                                 intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
}

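/* Filter for nir_shader_lower_instructions: match only load_per_vertex_input intrinsics. */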
static bool
filter_load_per_vertex_input(const nir_instr *instr, UNUSED const void *state)
{
   return instr->type == nir_instr_type_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_per_vertex_input;
}

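/* Lower all ES (VS or TES before GS) output stores to the memory accesses used for
 * ES->GS I/O: swizzled ESGS ring stores on GFX6-8, LDS stores on GFX9+.
 */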
void
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               unsigned esgs_itemsize)
{
   lower_esgs_io_state state = {
      .gfx_level = gfx_level,
      .esgs_itemsize = esgs_itemsize,
      .map_io = map,
   };

   nir_shader_instructions_pass(shader,
                                lower_es_output_store,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);
}

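/* Lower all GS per-vertex input loads to the memory accesses used for ES->GS I/O:
 * ESGS ring loads on GFX6-8, LDS loads on GFX9+.
 */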
void
ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              enum amd_gfx_level gfx_level,
                              bool triangle_strip_adjacency_fix)
{
   lower_esgs_io_state state = {
      .gfx_level = gfx_level,
      .map_io = map,
      .gs_triangle_strip_adjacency_fix = triangle_strip_adjacency_fix,
   };

   nir_shader_lower_instructions(shader,
                                 filter_load_per_vertex_input,
                                 lower_gs_per_vertex_input_load,
                                 &state);
}