/*
 * Copyright © 2021 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "nir_builder.h"

/*
 * These NIR passes are used to lower NIR cross-stage I/O intrinsics into the
 * memory accesses that actually happen on the HW.
 *
 * Each input and output has a 16-byte (4 dwords) slot reserved for it, and
 * can have up to 4 components. Each component is 32 bits.
 *
 * ## VS-TCS-TES I/O - Terminology:
 *
 * * patch - group of vertices, used instead of primitives in tessellation
 * * per-vertex - input or output which can be different for every vertex
 * * per-patch - input or output which applies to a patch (a group of vertices)
 *
 * ## VS-TCS-TES I/O - How it works:
 *
 * ```
 * SW model:    SW VS         SW TCS    tessellator    SW TES
 *                ┊             ┊             ┊          ┊
 *              ┌────┐        ┌────┐        ┌────┐    ┌─────┐
 * HW pipeline: │ LS │─╮   ╭─>│ HS │─╮   ╭─>│ FF │ ╭─>│VS/ES│
 *              └────┘ │   │  └────┘ │   │  └────┘ │  └─────┘
 * Memory:             ╰─>LDS<──╯    ╰─>VRAM───────╯
 * ```
 *
 * * SW VS runs as a HW LS (Local Shader, merged into HS on GFX9+),
 *   and SW TCS runs as HW HS (Hull Shader).
 *   SW TES runs as either HW VS or HW ES (Export Shader).
 * * LS and HS share the same LDS space.
 * * LS (SW VS) stores outputs to LDS to be read by HS (SW TCS).
 * * HS (SW TCS) stores outputs in LDS if the HS (SW TCS) reads them.
 * * HS (SW TCS) stores outputs in VRAM if the next stage (SW TES) reads them.
 *
 * Side note: some old HW supports having TES read from the same LDS space
 * where LS/HS write, but Mesa always stores HS outputs to VRAM to avoid
 * forcing TES waves to run on the same CU as the LS/HS waves.
 *
 * ### Passing VS-TCS I/O in registers
 *
 * On GPUs that run SW VS and SW TCS on the same HW stage (HS on GFX9+),
 * I/O can be passed through registers instead of LDS when the following
 * conditions are met:
 *
 * 1. TCS input and output patch sizes match
 * 2. Floating point execution modes in SW VS and SW TCS match
 * 3. The SW VS output is not written indirectly, and the corresponding
 *    SW TCS input is not read indirectly
 *
 * Some HS outputs could be passed through registers too, but this is a TODO.
 *
 * ### LDS layout used by VS-TCS:
 *
 * ```
 * TCS per-vertex inputs for patch 0  <─── 0
 * TCS per-vertex inputs for patch 1
 * TCS per-vertex inputs for patch 2  <─── hs_per_vertex_input_lds_offset (rel_patch_id = 2)
 * ...
 * TCS per-vertex outputs for patch 0 <─── output_patch0_offset
 * TCS per-patch outputs for patch 0  <─── output_patch0_patch_data_offset
 * TCS per-vertex outputs for patch 1
 * TCS per-patch outputs for patch 1
 * TCS per-vertex outputs for patch 2 <─── hs_output_lds_offset (rel_patch_id = 2, per-vertex)
 * TCS per-patch outputs for patch 2  <─── hs_output_lds_offset (rel_patch_id = 2, per-patch)
 * ...
 * ```
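 *
 * As a sketch (mirroring hs_per_vertex_input_lds_offset below), the LDS
 * address of a TCS per-vertex input is:
 *
 *    lds_offset = rel_patch_id * (tcs_in_vtxcnt * lshs_vertex_stride)
 *               + vertex_index * lshs_vertex_stride
 *               + io_offset    (16 bytes per attribute slot)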
 *
 * ### VRAM layout used by TCS-TES I/O:
 *
 * ```
 * attr 0 of patch 0 vertex 0   <─── "off-chip LDS" offset
 * attr 0 of patch 0 vertex 1
 * attr 0 of patch 0 vertex 2
 * ...
 * attr 0 of patch 1 vertex 0
 * attr 0 of patch 1 vertex 1
 * attr 0 of patch 1 vertex 2   <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
 * ...
 * attr 0 of patch 2 vertex 0
 * attr 0 of patch 2 vertex 1
 * attr 0 of patch 2 vertex 2
 * ...
 * attr 1 of patch 0 vertex 0
 * attr 1 of patch 0 vertex 1
 * attr 1 of patch 0 vertex 2
 * ...
 * ...
 * per-patch attr 0 of patch 0  <─── hs_out_patch_data_offset_amd
 * per-patch attr 0 of patch 1
 * per-patch attr 0 of patch 2  <─── hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
 * ...
 * per-patch attr 1 of patch 0
 * per-patch attr 1 of patch 1
 * per-patch attr 1 of patch 2
 * ...
 * ```
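 *
 * As a sketch (mirroring hs_per_vertex_output_vmem_offset below), the VRAM
 * offset of a TCS per-vertex output is:
 *
 *    attr_stride = tcs_num_patches * out_vertices_per_patch * 16
 *    vmem_offset = attr_slot * attr_stride
 *                + rel_patch_id * out_vertices_per_patch * 16
 *                + vertex_index * 16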
 *
 */

typedef struct {
   /* Which hardware generation we're dealing with */
   enum amd_gfx_level gfx_level;

   /* I/O semantic -> real location used by lowering. */
   ac_nir_map_io_driver_location map_io;

   /* True if the merged VS+TCS (on GFX9+) has matching
    * input and output patch sizes.
    */
   bool tcs_in_out_eq;

   /* Bit mask of TCS per-vertex inputs (VS outputs) which
    * are passed between the two stages only in temporaries (registers).
    */
   uint64_t tcs_temp_only_inputs;

   /* Bit mask of TCS outputs read by TES. */
   uint64_t tes_inputs_read;
   uint64_t tes_patch_inputs_read;

   /* Whether TES reads the tess factors. */
   bool tes_reads_tessfactors;

   unsigned tcs_num_reserved_outputs;
   unsigned tcs_num_reserved_patch_outputs;

   /* Location (slot) where tessellation levels are stored. */
   int tcs_tess_lvl_in_loc;
   int tcs_tess_lvl_out_loc;

   /* True if the output patch fits the subgroup, so all TCS outputs are always written by the
    * same subgroup that reads them.
    */
   bool tcs_out_patch_fits_subgroup;

   /* Set if all invocations will write to all tess factors, so tess factors
    * can be passed by register.
    */
   bool tcs_pass_tessfactors_by_reg;

   /* Whether all TCS inputs are accessed using gl_InvocationID and passed via VGPRs.
    * In that case, no LDS is allocated for TCS inputs.
    */
   bool tcs_no_inputs_in_lds;

   /* Save TCS tess factors for the tess factor writer. */
   nir_variable *tcs_tess_level_outer;
   nir_variable *tcs_tess_level_inner;
   unsigned tcs_tess_level_outer_base;
   unsigned tcs_tess_level_outer_mask;
   unsigned tcs_tess_level_inner_base;
   unsigned tcs_tess_level_inner_mask;
} lower_tess_io_state;

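/* Check whether the I/O slot accessed by an intrinsic is set in the given
 * bit mask. Indirectly indexed accesses can't be matched to a single slot,
 * so they are treated according to match_indirect.
 */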
static bool
match_mask(gl_shader_stage stage,
           nir_intrinsic_instr *intrin,
           uint64_t mask,
           bool match_indirect)
{
   bool indirect = !nir_src_is_const(*nir_get_io_offset_src(intrin));
   if (indirect)
      return match_indirect;

   uint64_t slot = nir_intrinsic_io_semantics(intrin).location;
   if (stage == MESA_SHADER_TESS_CTRL &&
       intrin->intrinsic != nir_intrinsic_load_per_vertex_input &&
       intrin->intrinsic != nir_intrinsic_store_per_vertex_output)
      slot -= VARYING_SLOT_PATCH0;

   return (UINT64_C(1) << slot) & mask;
}

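/* An HS (TCS) output must be stored to VMEM (the off-chip ring) when the
 * matching TES input actually reads it.
 */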
static bool
tcs_output_needs_vmem(nir_intrinsic_instr *intrin,
                      lower_tess_io_state *st)
{
   uint64_t mask = intrin->intrinsic == nir_intrinsic_store_per_vertex_output
                   ? st->tes_inputs_read
                   : st->tes_patch_inputs_read;

   /* no_varying indicates that TES doesn't read the output. */
   return !nir_intrinsic_io_semantics(intrin).no_varying &&
          match_mask(MESA_SHADER_TESS_CTRL, intrin, mask, true);
}

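/* An HS (TCS) output must be stored to LDS when the TCS itself reads it
 * back, as indicated by outputs_read / patch_outputs_read in shader_info.
 */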
static bool
tcs_output_needs_lds(nir_intrinsic_instr *intrin,
                     nir_shader *shader)
{
   uint64_t mask = intrin->intrinsic == nir_intrinsic_store_per_vertex_output
                   ? shader->info.outputs_read
                   : shader->info.patch_outputs_read;

   return match_mask(MESA_SHADER_TESS_CTRL, intrin, mask, true);
}

static bool
lower_ls_output_store(nir_builder *b,
                      nir_intrinsic_instr *intrin,
                      void *state)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   /* The ARB_shader_viewport_layer_array spec contains the
    * following issue:
    *
    *    2) What happens if gl_ViewportIndex or gl_Layer is
    *    written in the vertex shader and a geometry shader is
    *    present?
    *
    *    RESOLVED: The value written by the last vertex processing
    *    stage is used. If the last vertex processing stage
    *    (vertex, tessellation evaluation or geometry) does not
    *    statically assign to gl_ViewportIndex or gl_Layer, index
    *    or layer zero is assumed.
    *
    * So writes to those outputs in VS-as-LS are simply ignored.
    */
   unsigned semantic = nir_intrinsic_io_semantics(intrin).location;
   if (semantic == VARYING_SLOT_LAYER || semantic == VARYING_SLOT_VIEWPORT) {
      nir_instr_remove(&intrin->instr);
      return true;
   }

   lower_tess_io_state *st = (lower_tess_io_state *) state;

   /* If this is a temp-only TCS input, we don't need to use shared memory at all. */
   if (match_mask(MESA_SHADER_VERTEX, intrin, st->tcs_temp_only_inputs, false))
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *vertex_idx = nir_load_local_invocation_index(b);
   nir_def *base_off_var = nir_imul(b, vertex_idx, nir_load_lshs_vertex_stride_amd(b));

   nir_def *io_off = ac_nir_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u, st->map_io);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);

   nir_def *off = nir_iadd_nuw(b, base_off_var, io_off);
   nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask);

   /* NOTE: don't remove the store_output intrinsic on GFX9+ when tcs_in_out_eq,
    * it will be used by same-invocation TCS input loads.
    */
   if (!st->tcs_in_out_eq)
      nir_instr_remove(&intrin->instr);

   return true;
}

static bool
filter_load_tcs_per_vertex_input(const nir_instr *instr,
                                 const void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   lower_tess_io_state *st = (lower_tess_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input)
      return false;
   if (!st->tcs_in_out_eq)
      return true;

   /* tcs_in_out_eq: a same-invocation input load, without an indirect offset,
    * can use temporaries; no need to use shared memory.
    */
   nir_src *off_src = nir_get_io_offset_src(intrin);
   nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intrin);
   nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;

   bool can_use_temps = nir_src_is_const(*off_src) &&
                        vertex_index_instr->type == nir_instr_type_intrinsic &&
                        nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;

   return !can_use_temps;
}

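/* Compute the LDS address of a TCS per-vertex input: the input patch area
 * starts at LDS address 0, patches are laid out back to back, and within a
 * patch each vertex occupies lshs_vertex_stride bytes.
 */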
static nir_def *
hs_per_vertex_input_lds_offset(nir_builder *b,
                               lower_tess_io_state *st,
                               nir_intrinsic_instr *instr)
{
   nir_def *tcs_in_vtxcnt = nir_load_patch_vertices_in(b);
   nir_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
   nir_def *vertex_index = nir_get_io_arrayed_index_src(instr)->ssa;

   nir_def *stride = nir_load_lshs_vertex_stride_amd(b);
   nir_def *tcs_in_patch_stride = nir_imul(b, tcs_in_vtxcnt, stride);
   nir_def *vertex_index_off = nir_imul(b, vertex_index, stride);

   nir_def *tcs_in_current_patch_offset = nir_imul(b, rel_patch_id, tcs_in_patch_stride);

   nir_def *io_offset = ac_nir_calc_io_offset(b, instr, nir_imm_int(b, 16u), 4u, st->map_io);

   return nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset);
}

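/* Compute the LDS address of a TCS output. The output area follows the
 * input patch area (unless tcs_no_inputs_in_lds, in which case it starts
 * at 0); within each patch the per-vertex outputs come first, followed by
 * the per-patch outputs. With a NULL intrin, this returns the base address
 * of the per-patch outputs of the current patch.
 */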
static nir_def *
hs_output_lds_offset(nir_builder *b,
                     lower_tess_io_state *st,
                     nir_intrinsic_instr *intrin)
{
   bool per_vertex = intrin &&
                     (intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
                      intrin->intrinsic == nir_intrinsic_load_per_vertex_output);

   unsigned output_vertex_size = st->tcs_num_reserved_outputs * 16u;
   unsigned pervertex_output_patch_size = b->shader->info.tess.tcs_vertices_out * output_vertex_size;
   unsigned output_patch_stride = pervertex_output_patch_size + st->tcs_num_reserved_patch_outputs * 16u;

   nir_def *off = intrin
                    ? ac_nir_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u, st->map_io)
                    : nir_imm_int(b, 0);

   nir_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
   nir_def *patch_offset = nir_imul_imm(b, rel_patch_id, output_patch_stride);

   nir_def *output_patch_offset;
   if (st->tcs_no_inputs_in_lds)
      output_patch_offset = patch_offset;
   else {
      nir_def *tcs_in_vtxcnt = nir_load_patch_vertices_in(b);
      nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
      nir_def *input_patch_size =
         nir_imul(b, tcs_in_vtxcnt, nir_load_lshs_vertex_stride_amd(b));
      nir_def *output_patch0_offset = nir_imul(b, input_patch_size, tcs_num_patches);
      output_patch_offset = nir_iadd_nuw(b, patch_offset, output_patch0_offset);
   }

   if (per_vertex) {
      nir_def *vertex_index = nir_get_io_arrayed_index_src(intrin)->ssa;
      nir_def *vertex_index_off = nir_imul_imm(b, vertex_index, output_vertex_size);

      off = nir_iadd_nuw(b, off, vertex_index_off);
      return nir_iadd_nuw(b, off, output_patch_offset);
   } else {
      off = nir_iadd_imm_nuw(b, off, pervertex_output_patch_size);
      return nir_iadd_nuw(b, off, output_patch_offset);
   }
}

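/* Compute the VRAM offset of a TCS per-vertex output in the off-chip ring.
 * Attributes are laid out slot-major: all patches and vertices of attribute
 * slot 0 first, then slot 1, etc. (see the VRAM layout diagram above).
 */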
static nir_def *
hs_per_vertex_output_vmem_offset(nir_builder *b,
                                 lower_tess_io_state *st,
                                 nir_intrinsic_instr *intrin)
{
   nir_def *out_vertices_per_patch = b->shader->info.stage == MESA_SHADER_TESS_CTRL
                                         ? nir_imm_int(b, b->shader->info.tess.tcs_vertices_out)
                                         : nir_load_patch_vertices_in(b);

   nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
   nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
   nir_def *io_offset = ac_nir_calc_io_offset(b, intrin, attr_stride, 4u, st->map_io);

   nir_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
   nir_def *patch_offset = nir_imul(b, rel_patch_id, nir_imul_imm(b, out_vertices_per_patch, 16u));

   nir_def *vertex_index = nir_get_io_arrayed_index_src(intrin)->ssa;
   nir_def *vertex_index_off = nir_imul_imm(b, vertex_index, 16u);

   return nir_iadd_nuw(b, nir_iadd_nuw(b, patch_offset, vertex_index_off), io_offset);
}

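/* Compute the VRAM offset of a TCS per-patch output. The per-patch area
 * starts at hs_out_patch_data_offset_amd; each attribute slot occupies
 * 16 bytes per patch across all patches. With a NULL intrin, the slot's
 * byte offset is given by const_base_offset (used for the tess factors).
 */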
static nir_def *
hs_per_patch_output_vmem_offset(nir_builder *b,
                                lower_tess_io_state *st,
                                nir_intrinsic_instr *intrin,
                                unsigned const_base_offset)
{
   nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
   nir_def *per_patch_data_offset = nir_load_hs_out_patch_data_offset_amd(b);

   nir_def *off = intrin
                    ? ac_nir_calc_io_offset(b, intrin, nir_imul_imm(b, tcs_num_patches, 16u), 4u, st->map_io)
                    : nir_imm_int(b, 0);

   if (const_base_offset)
      off = nir_iadd_nuw(b, off, nir_imul_imm(b, tcs_num_patches, const_base_offset));

   nir_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
   nir_def *patch_offset = nir_imul_imm(b, rel_patch_id, 16u);
   off = nir_iadd_nuw(b, off, per_patch_data_offset);
   return nir_iadd_nuw(b, off, patch_offset);
}

static nir_def *
lower_hs_per_vertex_input_load(nir_builder *b,
                               nir_instr *instr,
                               void *state)
{
   lower_tess_io_state *st = (lower_tess_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   nir_def *off = hs_per_vertex_input_lds_offset(b, st, intrin);
   return nir_load_shared(b, intrin->def.num_components, intrin->def.bit_size, off);
}

static nir_def *
lower_hs_output_store(nir_builder *b,
                      nir_intrinsic_instr *intrin,
                      lower_tess_io_state *st)
{
   assert(intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
          intrin->intrinsic == nir_intrinsic_store_output);

   nir_io_semantics semantics = nir_intrinsic_io_semantics(intrin);
   nir_def *store_val = intrin->src[0].ssa;
   unsigned base = nir_intrinsic_base(intrin);
   unsigned component = nir_intrinsic_component(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   bool is_tess_factor = semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
                         semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER;
   bool write_to_vmem = !is_tess_factor && tcs_output_needs_vmem(intrin, st);
   bool write_to_lds = (is_tess_factor && !st->tcs_pass_tessfactors_by_reg) ||
      tcs_output_needs_lds(intrin, b->shader);

   /* Remember the tess factor location so that we can load the factors
    * from LDS and/or store them to VMEM in hs_emit_write_tess_factors().
    */
   if (is_tess_factor) {
      unsigned mapped_location =
         st->map_io ? st->map_io(semantics.location) : nir_intrinsic_base(intrin);

      if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER)
         st->tcs_tess_lvl_in_loc = mapped_location * 16u;
      else
         st->tcs_tess_lvl_out_loc = mapped_location * 16u;
   }

   if (write_to_vmem) {
      nir_def *vmem_off = intrin->intrinsic == nir_intrinsic_store_per_vertex_output
                            ? hs_per_vertex_output_vmem_offset(b, st, intrin)
                            : hs_per_patch_output_vmem_offset(b, st, intrin, 0);

      nir_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
      nir_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
      nir_def *zero = nir_imm_int(b, 0);
      nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, zero,
                           .write_mask = write_mask, .memory_modes = nir_var_shader_out,
                           .access = ACCESS_COHERENT);
   }

   if (write_to_lds) {
      nir_def *lds_off = hs_output_lds_offset(b, st, intrin);
      nir_store_shared(b, store_val, lds_off, .write_mask = write_mask);
   }

   /* Save the tess factors to be used by the tess factor writer, or to
    * reconstruct the store output instructions later.
    */
   if (is_tess_factor && st->tcs_pass_tessfactors_by_reg) {
      if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER) {
         st->tcs_tess_level_inner_base = base;
         st->tcs_tess_level_inner_mask |= write_mask << component;
         ac_nir_store_var_components(b, st->tcs_tess_level_inner, store_val,
                                     component, write_mask);
      } else {
         st->tcs_tess_level_outer_base = base;
         st->tcs_tess_level_outer_mask |= write_mask << component;
         ac_nir_store_var_components(b, st->tcs_tess_level_outer, store_val,
                                     component, write_mask);
      }
   }

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

static nir_def *
lower_hs_output_load(nir_builder *b,
                     nir_intrinsic_instr *intrin,
                     lower_tess_io_state *st)
{
   nir_def *off = hs_output_lds_offset(b, st, intrin);
   return nir_load_shared(b, intrin->def.num_components, intrin->def.bit_size, off);
}

static void
update_hs_barrier(nir_intrinsic_instr *intrin, lower_tess_io_state *st)
{
   /* Output loads and stores are lowered to shared memory access,
    * so we have to update the barriers to also reflect this.
    */
   unsigned mem_modes = nir_intrinsic_memory_modes(intrin);
   if (mem_modes & nir_var_shader_out) {
      mem_modes |= nir_var_mem_shared;
      mem_modes &= ~nir_var_shader_out;
   }
   nir_intrinsic_set_memory_modes(intrin, mem_modes);

   mesa_scope exec_scope = nir_intrinsic_execution_scope(intrin);
   if (exec_scope == SCOPE_WORKGROUP && st->tcs_out_patch_fits_subgroup)
      nir_intrinsic_set_execution_scope(intrin, SCOPE_SUBGROUP);

   mesa_scope mem_scope = nir_intrinsic_memory_scope(intrin);
   if (mem_scope == SCOPE_WORKGROUP && st->tcs_out_patch_fits_subgroup)
      nir_intrinsic_set_memory_scope(intrin, SCOPE_SUBGROUP);
}

static nir_def *
lower_hs_output_access(nir_builder *b,
                       nir_instr *instr,
                       void *state)
{
   lower_tess_io_state *st = (lower_tess_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic == nir_intrinsic_store_output ||
       intrin->intrinsic == nir_intrinsic_store_per_vertex_output) {
      return lower_hs_output_store(b, intrin, st);
   } else if (intrin->intrinsic == nir_intrinsic_load_output ||
              intrin->intrinsic == nir_intrinsic_load_per_vertex_output) {
      return lower_hs_output_load(b, intrin, st);
   } else if (intrin->intrinsic == nir_intrinsic_barrier) {
      update_hs_barrier(intrin, st);
      return NIR_LOWER_INSTR_PROGRESS;
   } else {
      unreachable("intrinsic not supported by lower_hs_output_access");
   }
}

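/* Append code to the end of the TCS that reads the tess factors back
 * (from registers or LDS), stores them to the tess factor ring, and also
 * stores them to the off-chip ring if TES reads them.
 */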
static void
hs_emit_write_tess_factors(nir_shader *shader,
                           lower_tess_io_state *st)
{
   unsigned outer_comps;
   unsigned inner_comps;

   switch (shader->info.tess._primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      outer_comps = 2;
      inner_comps = 0;
      break;
   case TESS_PRIMITIVE_TRIANGLES:
      outer_comps = 3;
      inner_comps = 1;
      break;
   case TESS_PRIMITIVE_QUADS:
      outer_comps = 4;
      inner_comps = 2;
      break;
   default:
      unreachable("invalid primitive mode");
      return;
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);
   nir_block *last_block = nir_impl_last_block(impl);
   assert(last_block);

   /* We assume there is always a single end block in the shader. */

   nir_builder builder = nir_builder_at(nir_after_block(last_block));
   nir_builder *b = &builder; /* This is to avoid the & */

   /* If the tess factors are loaded from LDS, wait for the previous LDS stores to finish. */
   if (!st->tcs_pass_tessfactors_by_reg) {
      mesa_scope scope = st->tcs_out_patch_fits_subgroup ?
                        SCOPE_SUBGROUP : SCOPE_WORKGROUP;

      nir_barrier(b, .execution_scope = scope, .memory_scope = scope,
                         .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared);
   }

   nir_def *invocation_id = nir_load_invocation_id(b);

   /* Only the 1st invocation of each patch needs to do this. */
   nir_if *invocation_id_zero = nir_push_if(b, nir_ieq_imm(b, invocation_id, 0));

   /* When the output patch size is <= 32, we can flatten the branch here
    * because we know for sure that at least 1 invocation in all waves will
    * take the branch.
    */
   if (shader->info.tess.tcs_vertices_out <= 32)
      invocation_id_zero->control = nir_selection_control_divergent_always_taken;

   const bool tess_lvl_in_written = st->tcs_tess_lvl_in_loc >= 0;
   const bool tess_lvl_out_written = st->tcs_tess_lvl_out_loc >= 0;

   nir_def *tessfactors_outer = NULL;
   nir_def *tessfactors_inner = NULL;
   if (st->tcs_pass_tessfactors_by_reg) {
      if (tess_lvl_out_written) {
         tessfactors_outer = nir_load_var(b, st->tcs_tess_level_outer);
         tessfactors_outer = nir_trim_vector(b, tessfactors_outer, outer_comps);
      }

      if (inner_comps && tess_lvl_in_written) {
         tessfactors_inner = nir_load_var(b, st->tcs_tess_level_inner);
         tessfactors_inner = nir_trim_vector(b, tessfactors_inner, inner_comps);
      }
   } else {
      /* Base LDS address of per-patch outputs in the current patch. */
      nir_def *lds_base = hs_output_lds_offset(b, st, NULL);

      /* Load all tessellation factors (aka tess levels) from LDS. */
      if (tess_lvl_out_written) {
         tessfactors_outer = nir_load_shared(b, outer_comps, 32, lds_base,
                                             .base = st->tcs_tess_lvl_out_loc);
      }

      if (inner_comps && tess_lvl_in_written) {
         tessfactors_inner = nir_load_shared(b, inner_comps, 32, lds_base,
                                             .base = st->tcs_tess_lvl_in_loc);
      }
   }

   /* Set the tess factors to zero if the user did not write them. */
   if (!tessfactors_outer)
      tessfactors_outer = nir_imm_zero(b, outer_comps, 32);
   if (inner_comps && !tessfactors_inner)
      tessfactors_inner = nir_imm_zero(b, inner_comps, 32);

   /* The descriptor where tess factors have to be stored by the shader. */
   nir_def *tessfactor_ring = nir_load_ring_tess_factors_amd(b);

   nir_def *zero = nir_imm_int(b, 0);
   nir_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
   nir_def *tess_factors_base = nir_load_ring_tess_factors_offset_amd(b);
   nir_def *tess_factors_offset = nir_imul_imm(b, rel_patch_id, (inner_comps + outer_comps) * 4u);
   unsigned tess_factors_const_offset = 0;

   if (st->gfx_level <= GFX8) {
      /* Store the dynamic HS control word. */
      nir_if *rel_patch_id_zero = nir_push_if(b, nir_ieq_imm(b, rel_patch_id, 0));
      nir_def *ctrlw = nir_imm_int(b, 0x80000000u);
      nir_store_buffer_amd(b, ctrlw, tessfactor_ring, zero, tess_factors_base, zero,
                           .access = ACCESS_COHERENT);
      tess_factors_const_offset += 4;
      nir_pop_if(b, rel_patch_id_zero);
   }

   /* Store tess factors for the tessellator. */
   if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
      /* LINES reversal */
      nir_def *t = nir_vec2(b, nir_channel(b, tessfactors_outer, 1), nir_channel(b, tessfactors_outer, 0));
      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
   } else if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
      nir_def *t = nir_vec4(b, nir_channel(b, tessfactors_outer, 0), nir_channel(b, tessfactors_outer, 1),
                                nir_channel(b, tessfactors_outer, 2), nir_channel(b, tessfactors_inner, 0));
      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
   } else {
      nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
      nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset + 4u * outer_comps, .access = ACCESS_COHERENT);
   }

   if (st->tes_reads_tessfactors) {
      /* Store to offchip for TES to read - only if TES actually reads them. */
      nir_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
      nir_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);

      if (tess_lvl_out_written) {
         nir_def *vmem_off_outer =
            hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);

         nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip,
                              vmem_off_outer, offchip_offset, zero,
                              .memory_modes = nir_var_shader_out,
                              .access = ACCESS_COHERENT);
      }

      if (inner_comps && tess_lvl_in_written) {
         nir_def *vmem_off_inner =
            hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);

         nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip,
                              vmem_off_inner, offchip_offset, zero,
                              .memory_modes = nir_var_shader_out,
                              .access = ACCESS_COHERENT);
      }
   }

   nir_pop_if(b, invocation_id_zero);

   nir_metadata_preserve(impl, nir_metadata_none);
}

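/* Load a TES input from the off-chip ring, using the same address
 * computation as the matching TCS output store.
 */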
static nir_def *
lower_tes_input_load(nir_builder *b,
                     nir_instr *instr,
                     void *state)
{
   lower_tess_io_state *st = (lower_tess_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   nir_def *offchip_ring = nir_load_ring_tess_offchip_amd(b);
   nir_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
   nir_def *off = intrin->intrinsic == nir_intrinsic_load_per_vertex_input
                    ? hs_per_vertex_output_vmem_offset(b, st, intrin)
                    : hs_per_patch_output_vmem_offset(b, st, intrin, 0);

   nir_def *zero = nir_imm_int(b, 0);

   return nir_load_buffer_amd(b, intrin->def.num_components,
                              intrin->def.bit_size, offchip_ring,
                              off, offchip_offset, zero,
                              .access = ACCESS_COHERENT);
}

static bool
filter_hs_output_access(const nir_instr *instr,
                        UNUSED const void *st)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_store_output ||
          intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
          intrin->intrinsic == nir_intrinsic_load_output ||
          intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
          intrin->intrinsic == nir_intrinsic_barrier;
}

static bool
filter_any_input_access(const nir_instr *instr,
                        UNUSED const void *st)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_input ||
          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
}

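/* Lower VS (as LS) output stores to LDS stores, so that the TCS running as
 * HS can read them. Temp-only inputs are kept in registers instead.
 */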
void
ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               bool tcs_in_out_eq,
                               uint64_t tcs_temp_only_inputs)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX);

   lower_tess_io_state state = {
      .tcs_in_out_eq = tcs_in_out_eq,
      .tcs_temp_only_inputs = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
      .map_io = map,
   };

   nir_shader_intrinsics_pass(shader, lower_ls_output_store,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);
}

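/* Lower TCS (as HS) per-vertex input loads to LDS loads from the area
 * written by the LS, except for same-invocation loads that can be read
 * back from temporaries when tcs_in_out_eq.
 */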
void
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              bool tcs_in_out_eq)
{
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);

   lower_tess_io_state state = {
      .tcs_in_out_eq = tcs_in_out_eq,
      .map_io = map,
   };

   nir_shader_lower_instructions(shader,
                                 filter_load_tcs_per_vertex_input,
                                 lower_hs_per_vertex_input_load,
                                 &state);
}

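/* Re-emit the tess factor nir_store_output intrinsics at the end of the
 * shader, reading the values back from the temporary variables, so that a
 * backend TCS epilog can consume them outside of any control flow.
 */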
static void
reconstruct_tess_factor_outputs(nir_shader *shader, lower_tess_io_state *st)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_builder builder = nir_builder_create(impl);
   nir_builder *b = &builder;
   b->cursor = nir_after_impl(impl);

   if (st->tcs_tess_level_outer_mask) {
      nir_def *val = nir_load_var(b, st->tcs_tess_level_outer);
      nir_store_output(b, val, nir_imm_int(b, 0),
                       .base = st->tcs_tess_level_outer_base,
                       .write_mask = st->tcs_tess_level_outer_mask,
                       .io_semantics.location = VARYING_SLOT_TESS_LEVEL_OUTER);
   }

   if (st->tcs_tess_level_inner_mask) {
      nir_def *val = nir_load_var(b, st->tcs_tess_level_inner);
      nir_store_output(b, val, nir_imm_int(b, 0),
                       .base = st->tcs_tess_level_inner_base,
                       .write_mask = st->tcs_tess_level_inner_mask,
                       .io_semantics.location = VARYING_SLOT_TESS_LEVEL_INNER);
   }
}

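/* Lower TCS (as HS) output stores and loads to LDS and/or off-chip VMEM
 * accesses, update barriers accordingly, and optionally emit the tess
 * factor writes at the end of the shader.
 */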
void
ac_nir_lower_hs_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               bool tes_reads_tessfactors,
                               uint64_t tes_inputs_read,
                               uint64_t tes_patch_inputs_read,
                               unsigned num_reserved_tcs_outputs,
                               unsigned num_reserved_tcs_patch_outputs,
                               unsigned wave_size,
                               bool no_inputs_in_lds,
                               bool pass_tessfactors_by_reg,
                               bool emit_tess_factor_write)
{
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);

   lower_tess_io_state state = {
      .gfx_level = gfx_level,
      .tes_reads_tessfactors = tes_reads_tessfactors,
      .tes_inputs_read = tes_inputs_read,
      .tes_patch_inputs_read = tes_patch_inputs_read,
      .tcs_num_reserved_outputs = num_reserved_tcs_outputs,
      .tcs_num_reserved_patch_outputs = num_reserved_tcs_patch_outputs,
      .tcs_out_patch_fits_subgroup = wave_size % shader->info.tess.tcs_vertices_out == 0,
      .tcs_pass_tessfactors_by_reg = pass_tessfactors_by_reg,
      .tcs_no_inputs_in_lds = no_inputs_in_lds,
      .tcs_tess_lvl_in_loc = -1,
      .tcs_tess_lvl_out_loc = -1,
      .map_io = map,
   };

   if (pass_tessfactors_by_reg) {
      nir_function_impl *impl = nir_shader_get_entrypoint(shader);
      state.tcs_tess_level_outer =
         nir_local_variable_create(impl, glsl_vec4_type(), "tess outer");
      state.tcs_tess_level_inner =
         nir_local_variable_create(impl, glsl_vec4_type(), "tess inner");
   }

   nir_shader_lower_instructions(shader,
                                 filter_hs_output_access,
                                 lower_hs_output_access,
                                 &state);

   if (emit_tess_factor_write) {
      hs_emit_write_tess_factors(shader, &state);
   } else if (pass_tessfactors_by_reg) {
      /* Reconstruct the tess factor nir_store_output instructions if the factors
       * are passed by register instead of LDS and we use a compiler backend TCS
       * epilog.
       *
       * TCS does not call nir_lower_io_to_temporaries(). This is not a problem
       * with LLVM because LLVM supports variables, but ACO does not, so we do
       * something similar to nir_lower_io_to_temporaries() to move the store
       * output instructions out of control flow.
       */
      reconstruct_tess_factor_outputs(shader, &state);
   }
}

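/* Lower TES input loads to loads from the off-chip ring, where the TCS
 * stored its outputs.
 */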
void
ac_nir_lower_tes_inputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map)
{
   assert(shader->info.stage == MESA_SHADER_TESS_EVAL);

   lower_tess_io_state state = {
      .map_io = map,
   };

   nir_shader_lower_instructions(shader,
                                 filter_any_input_access,
                                 lower_tes_input_load,
                                 &state);
}