/*
 * Copyright © 2021 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

7 #include "ac_nir.h"
8 #include "ac_nir_helpers.h"
9
10 #include "nir_builder.h"
11
typedef struct {
   /* Shadow copies of output values, written by store_output and consumed
    * (then cleared) by the next emit_vertex_with_counter.
    * Indexed [varying slot][component]. */
   nir_def *outputs[64][4];
   /* Same, for the dedicated 16-bit varying slots (VARYING_SLOT_VAR0_16BIT+),
    * split into low and high halves of each 32-bit component. */
   nir_def *outputs_16bit_lo[16][4];
   nir_def *outputs_16bit_hi[16][4];

   /* Per-output usage/stream masks supplied by the caller of
    * ac_nir_lower_legacy_gs; determines which components reach the GSVS ring. */
   ac_nir_gs_output_info *info;

   /* Per-stream vertex/primitive counts captured from
    * set_vertex_and_primitive_count; consumed by the shader query emission. */
   nir_def *vertex_count[4];
   nir_def *primitive_count[4];
} lower_legacy_gs_state;
22
23 static bool
lower_legacy_gs_store_output(nir_builder * b,nir_intrinsic_instr * intrin,lower_legacy_gs_state * s)24 lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
25 lower_legacy_gs_state *s)
26 {
27 /* Assume:
28 * - the shader used nir_lower_io_to_temporaries
29 * - 64-bit outputs are lowered
30 * - no indirect indexing is present
31 */
32 assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
33
34 b->cursor = nir_before_instr(&intrin->instr);
35
36 unsigned component = nir_intrinsic_component(intrin);
37 unsigned write_mask = nir_intrinsic_write_mask(intrin);
38 nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
39
40 nir_def **outputs;
41 if (sem.location < VARYING_SLOT_VAR0_16BIT) {
42 outputs = s->outputs[sem.location];
43 } else {
44 unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
45 if (sem.high_16bits)
46 outputs = s->outputs_16bit_hi[index];
47 else
48 outputs = s->outputs_16bit_lo[index];
49 }
50
51 nir_def *store_val = intrin->src[0].ssa;
52 /* 64bit output has been lowered to 32bit */
53 assert(store_val->bit_size <= 32);
54
55 /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
56 const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;
57
58 u_foreach_bit (i, write_mask) {
59 unsigned comp = component + i;
60 nir_def *store_component = nir_channel(b, store_val, i);
61
62 if (non_dedicated_16bit) {
63 if (sem.high_16bits) {
64 nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
65 outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component);
66 } else {
67 nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
68 outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi);
69 }
70 } else {
71 outputs[comp] = store_component;
72 }
73 }
74
75 nir_instr_remove(&intrin->instr);
76 return true;
77 }
78
/* Lower emit_vertex_with_counter: flush every output value saved for this
 * stream into the GSVS ring buffer, then signal vertex emission via sendmsg.
 *
 * The ring layout visible from the addressing below is attribute-major: each
 * enabled 32-bit component occupies gs.vertices_out consecutive dwords, so an
 * element's byte address is component_slot * vertices_out * 4 + vtxidx * 4
 * (relative to the swizzled-ring soffset).
 */
static bool
lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
                                         lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);

   unsigned stream = nir_intrinsic_stream_id(intrin);
   /* Index of the vertex being emitted, used to address the ring. */
   nir_def *vtxidx = intrin->src[0].ssa;

   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
   nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b);

   /* Running count of enabled component slots, across both loops below;
    * determines each component's base position in the ring. */
   unsigned offset = 0;
   u_foreach_bit64 (i, b->shader->info.outputs_written) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output = s->outputs[i][j];
         /* The next vertex emit needs new values, so reset all outputs. */
         s->outputs[i][j] = NULL;

         const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i];

         /* Skip components that are unused or belong to another stream.
          * (2 bits per component in the streams mask.) */
         if (!(usage_mask & (1 << j)) ||
             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
            continue;

         /* Byte offset of this component's run of vertices_out dwords.
          * Counted even when the value was never written, to keep the layout
          * stable. */
         unsigned base = offset * b->shader->info.gs.vertices_out * 4;
         offset++;

         /* No one set this output, skip the buffer store. */
         if (!output)
            continue;

         /* vtxidx * 4: dword index -> byte offset within the component run. */
         nir_def *voffset = nir_ishl_imm(b, vtxidx, 2);

         /* Extend 8/16 bit to 32 bit; 64 bit has been lowered. */
         nir_def *data = nir_u2uN(b, output, 32);

         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              .base = base,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive. */
                              .memory_modes = nir_var_shader_out);
      }
   }

   /* Dedicated 16-bit slots: the low and high halves of a component are
    * packed into one 32-bit ring element. */
   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output_lo = s->outputs_16bit_lo[i][j];
         nir_def *output_hi = s->outputs_16bit_hi[i][j];
         /* The next vertex emit needs new values, so reset all outputs. */
         s->outputs_16bit_lo[i][j] = NULL;
         s->outputs_16bit_hi[i][j] = NULL;

         bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) &&
                             ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
         bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) &&
                             ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
         if (!has_lo_16bit && !has_hi_16bit)
            continue;

         /* Dword (not byte) base here: it is added to vtxidx before the <<2
          * below, unlike the 32-bit loop which passes a byte .base. */
         unsigned base = offset * b->shader->info.gs.vertices_out;
         offset++;

         bool has_lo_16bit_out = has_lo_16bit && output_lo;
         bool has_hi_16bit_out = has_hi_16bit && output_hi;

         /* No one set a needed output, skip the buffer store. */
         if (!has_lo_16bit_out && !has_hi_16bit_out)
            continue;

         if (!has_lo_16bit_out)
            output_lo = nir_undef(b, 1, 16);

         if (!has_hi_16bit_out)
            output_hi = nir_undef(b, 1, 16);

         nir_def *voffset = nir_iadd_imm(b, vtxidx, base);
         voffset = nir_ishl_imm(b, voffset, 2);

         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive. */
                              .memory_modes = nir_var_shader_out);
      }
   }

   /* Signal vertex emission. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));

   nir_instr_remove(&intrin->instr);
   return true;
}
175
176 static bool
lower_legacy_gs_set_vertex_and_primitive_count(nir_builder * b,nir_intrinsic_instr * intrin,lower_legacy_gs_state * s)177 lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
178 lower_legacy_gs_state *s)
179 {
180 b->cursor = nir_before_instr(&intrin->instr);
181
182 unsigned stream = nir_intrinsic_stream_id(intrin);
183
184 s->vertex_count[stream] = intrin->src[0].ssa;
185 s->primitive_count[stream] = intrin->src[1].ssa;
186
187 nir_instr_remove(&intrin->instr);
188 return true;
189 }
190
191 static bool
lower_legacy_gs_end_primitive_with_counter(nir_builder * b,nir_intrinsic_instr * intrin,lower_legacy_gs_state * s)192 lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
193 lower_legacy_gs_state *s)
194 {
195 b->cursor = nir_before_instr(&intrin->instr);
196 const unsigned stream = nir_intrinsic_stream_id(intrin);
197
198 /* Signal primitive emission. */
199 nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
200 .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8));
201
202 nir_instr_remove(&intrin->instr);
203 return true;
204 }
205
206 static bool
lower_legacy_gs_intrinsic(nir_builder * b,nir_intrinsic_instr * intrin,void * state)207 lower_legacy_gs_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
208 {
209 lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;
210
211 if (intrin->intrinsic == nir_intrinsic_store_output)
212 return lower_legacy_gs_store_output(b, intrin, s);
213 else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
214 return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
215 else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter)
216 return lower_legacy_gs_end_primitive_with_counter(b, intrin, s);
217 else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
218 return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);
219
220 return false;
221 }
222
223 void
ac_nir_lower_legacy_gs(nir_shader * nir,bool has_gen_prim_query,bool has_pipeline_stats_query,ac_nir_gs_output_info * output_info)224 ac_nir_lower_legacy_gs(nir_shader *nir,
225 bool has_gen_prim_query,
226 bool has_pipeline_stats_query,
227 ac_nir_gs_output_info *output_info)
228 {
229 lower_legacy_gs_state s = {
230 .info = output_info,
231 };
232
233 unsigned num_vertices_per_primitive = 0;
234 switch (nir->info.gs.output_primitive) {
235 case MESA_PRIM_POINTS:
236 num_vertices_per_primitive = 1;
237 break;
238 case MESA_PRIM_LINE_STRIP:
239 num_vertices_per_primitive = 2;
240 break;
241 case MESA_PRIM_TRIANGLE_STRIP:
242 num_vertices_per_primitive = 3;
243 break;
244 default:
245 unreachable("Invalid GS output primitive.");
246 break;
247 }
248
249 nir_shader_intrinsics_pass(nir, lower_legacy_gs_intrinsic,
250 nir_metadata_control_flow, &s);
251
252 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
253
254 nir_builder builder = nir_builder_at(nir_after_impl(impl));
255 nir_builder *b = &builder;
256
257 /* Emit shader query for mix use legacy/NGG GS */
258 bool progress = ac_nir_gs_shader_query(b,
259 has_gen_prim_query,
260 has_pipeline_stats_query,
261 has_pipeline_stats_query,
262 num_vertices_per_primitive,
263 64,
264 s.vertex_count,
265 s.primitive_count);
266
267 /* Wait for all stores to finish. */
268 nir_barrier(b, .execution_scope = SCOPE_INVOCATION,
269 .memory_scope = SCOPE_DEVICE,
270 .memory_semantics = NIR_MEMORY_RELEASE,
271 .memory_modes = nir_var_shader_out | nir_var_mem_ssbo |
272 nir_var_mem_global | nir_var_image);
273
274 /* Signal that the GS is done. */
275 nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
276 .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE);
277
278 if (progress)
279 nir_metadata_preserve(impl, nir_metadata_none);
280 }
281