/*
 * Copyright © 2021 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "ac_nir_helpers.h"

#include "nir_builder.h"

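/* Per-shader lowering state: the most recent value stored to each output
 * component (reset on every vertex emit), the caller-provided output usage
 * info, and the per-stream vertex/primitive counts for shader queries.
 */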
typedef struct {
   nir_def *outputs[64][4];
   nir_def *outputs_16bit_lo[16][4];
   nir_def *outputs_16bit_hi[16][4];

   ac_nir_gs_output_info *info;

   nir_def *vertex_count[4];
   nir_def *primitive_count[4];
} lower_legacy_gs_state;

static bool
lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
                             lower_legacy_gs_state *s)
{
   /* Assume:
    * - the shader used nir_lower_io_to_temporaries
    * - 64-bit outputs are lowered
    * - no indirect indexing is present
    */
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));

   b->cursor = nir_before_instr(&intrin->instr);

   unsigned component = nir_intrinsic_component(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

   nir_def **outputs;
   if (sem.location < VARYING_SLOT_VAR0_16BIT) {
      outputs = s->outputs[sem.location];
   } else {
      unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
      if (sem.high_16bits)
         outputs = s->outputs_16bit_hi[index];
      else
         outputs = s->outputs_16bit_lo[index];
   }

   nir_def *store_val = intrin->src[0].ssa;
   /* 64-bit outputs have already been lowered to 32-bit. */
   assert(store_val->bit_size <= 32);

   /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
   const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;

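   /* For example: a high-half store of 0xAABB into a component whose low
    * half already holds 0x1122 produces the 32-bit value 0xAABB1122, since
    * nir_pack_32_2x16_split(b, lo, hi) packs lo into bits 0-15 and hi into
    * bits 16-31.
    */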
   u_foreach_bit (i, write_mask) {
      unsigned comp = component + i;
      nir_def *store_component = nir_channel(b, store_val, i);

      if (non_dedicated_16bit) {
         if (sem.high_16bits) {
            nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
            outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component);
         } else {
            nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
            outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi);
         }
      } else {
         outputs[comp] = store_component;
      }
   }

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
                                         lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);

   unsigned stream = nir_intrinsic_stream_id(intrin);
   nir_def *vtxidx = intrin->src[0].ssa;

   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
   nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b);

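   /* GSVS ring layout, as addressed below: each enabled output component is
    * stored as a contiguous array with one dword per emitted vertex, so a
    * store lands at base (the start of that component's array) plus the
    * vertex index scaled to bytes.
    */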
   unsigned offset = 0;
   u_foreach_bit64 (i, b->shader->info.outputs_written) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output = s->outputs[i][j];
         /* The next vertex emit needs new values, so reset all outputs. */
         s->outputs[i][j] = NULL;

         const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i];

         if (!(usage_mask & (1 << j)) ||
             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
            continue;

         unsigned base = offset * b->shader->info.gs.vertices_out * 4;
         offset++;

         /* No one set this output, so skip the buffer store. */
         if (!output)
            continue;

         nir_def *voffset = nir_ishl_imm(b, vtxidx, 2);

         /* Extend 8/16-bit to 32-bit; 64-bit outputs have already been lowered. */
         nir_def *data = nir_u2uN(b, output, 32);

         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              .base = base,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive. */
                              .memory_modes = nir_var_shader_out);
      }
   }

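   /* Dedicated 16-bit slots pack the low and high halves of each component
    * into a single dword, so one buffer store covers both 16-bit outputs.
    */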
   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output_lo = s->outputs_16bit_lo[i][j];
         nir_def *output_hi = s->outputs_16bit_hi[i][j];
         /* The next vertex emit needs new values, so reset all outputs. */
         s->outputs_16bit_lo[i][j] = NULL;
         s->outputs_16bit_hi[i][j] = NULL;

         bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) &&
                             ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
         bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) &&
                             ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
         if (!has_lo_16bit && !has_hi_16bit)
            continue;

         unsigned base = offset * b->shader->info.gs.vertices_out;
         offset++;

         bool has_lo_16bit_out = has_lo_16bit && output_lo;
         bool has_hi_16bit_out = has_hi_16bit && output_hi;

         /* None of the needed outputs were set, so skip the buffer store. */
         if (!has_lo_16bit_out && !has_hi_16bit_out)
            continue;

         if (!has_lo_16bit_out)
            output_lo = nir_undef(b, 1, 16);

         if (!has_hi_16bit_out)
            output_hi = nir_undef(b, 1, 16);

         nir_def *voffset = nir_iadd_imm(b, vtxidx, base);
         voffset = nir_ishl_imm(b, voffset, 2);

         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive. */
                              .memory_modes = nir_var_shader_out);
      }
   }

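   /* Note on the message encoding: the stream id sits in bits 8-9 of the
    * message word, hence the (stream << 8) below.
    */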
   /* Signal vertex emission. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
                                               lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);

   unsigned stream = nir_intrinsic_stream_id(intrin);

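   /* Just remember the per-stream counts here; they are consumed by the
    * shader query code emitted at the end of the shader.
    */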
   s->vertex_count[stream] = intrin->src[0].ssa;
   s->primitive_count[stream] = intrin->src[1].ssa;

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
                                           lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);
   const unsigned stream = nir_intrinsic_stream_id(intrin);

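   /* Ending a strip needs no buffer traffic with legacy GS; the hardware
    * tracks primitive boundaries from the CUT message alone.
    */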
   /* Signal primitive emission. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8));

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic == nir_intrinsic_store_output)
      return lower_legacy_gs_store_output(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
      return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter)
      return lower_legacy_gs_end_primitive_with_counter(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
      return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);

   return false;
}

void
ac_nir_lower_legacy_gs(nir_shader *nir,
                       bool has_gen_prim_query,
                       bool has_pipeline_stats_query,
                       ac_nir_gs_output_info *output_info)
{
   lower_legacy_gs_state s = {
      .info = output_info,
   };

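   /* GS output primitives can only be points, line strips or triangle
    * strips; the per-primitive vertex count feeds the shader query below.
    */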
   unsigned num_vertices_per_primitive = 0;
   switch (nir->info.gs.output_primitive) {
   case MESA_PRIM_POINTS:
      num_vertices_per_primitive = 1;
      break;
   case MESA_PRIM_LINE_STRIP:
      num_vertices_per_primitive = 2;
      break;
   case MESA_PRIM_TRIANGLE_STRIP:
      num_vertices_per_primitive = 3;
      break;
   default:
      unreachable("Invalid GS output primitive.");
      break;
   }

   nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic,
                                nir_metadata_control_flow, &s);

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_after_impl(impl));
   nir_builder *b = &builder;
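
   /* Everything below is appended after the shader body: query accounting,
    * a release barrier for the ring stores, and the GS_DONE message.
    */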

   /* Emit the shader query for mixed legacy/NGG GS use. */
   bool progress = ac_nir_gs_shader_query(b,
                                          has_gen_prim_query,
                                          has_pipeline_stats_query,
                                          has_pipeline_stats_query,
                                          num_vertices_per_primitive,
                                          64,
                                          s.vertex_count,
                                          s.primitive_count);

   /* Wait for all output stores to finish before signaling that the GS is done. */
   nir_barrier(b, .execution_scope = SCOPE_INVOCATION,
               .memory_scope = SCOPE_DEVICE,
               .memory_semantics = NIR_MEMORY_RELEASE,
               .memory_modes = nir_var_shader_out | nir_var_mem_ssbo |
                               nir_var_mem_global | nir_var_image);

   /* Signal that the GS is done. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE);

   if (progress)
      nir_metadata_preserve(impl, nir_metadata_none);
}