/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"
#include "vulkan/radv_descriptor_set.h"
#include "vulkan/radv_shader.h"
#include "nir_control_flow.h"
#include "sid.h"
#include "ac_exp_param.h"

namespace aco {

namespace {

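/* Maps a barycentric load intrinsic and its interpolation mode to the
 * corresponding PERSP/LINEAR enable bit of SPI_PS_INPUT_ENA, so that the
 * hardware supplies the needed interpolants to the pixel shader. */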
unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
{
   switch (interp) {
   case INTERP_MODE_SMOOTH:
   case INTERP_MODE_NONE:
      if (intrin == nir_intrinsic_load_barycentric_pixel ||
          intrin == nir_intrinsic_load_barycentric_at_sample ||
          intrin == nir_intrinsic_load_barycentric_at_offset)
         return S_0286CC_PERSP_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_PERSP_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_PERSP_SAMPLE_ENA(1);
      break;
   case INTERP_MODE_NOPERSPECTIVE:
      if (intrin == nir_intrinsic_load_barycentric_pixel)
         return S_0286CC_LINEAR_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_LINEAR_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_LINEAR_SAMPLE_ENA(1);
      break;
   default:
      break;
   }
   return 0;
}

/* If one side of a divergent IF ends in a branch and the other doesn't, we
 * might have to emit the contents of the side without the branch at the merge
 * block instead. This is so that we can use any SGPR live-out of the side
 * without the branch without creating a linear phi in the invert or merge block. */
bool
sanitize_if(nir_function_impl *impl, nir_if *nif)
{
   //TODO: skip this if the condition is uniform and there are no divergent breaks/continues?

   nir_block *then_block = nir_if_last_then_block(nif);
   nir_block *else_block = nir_if_last_else_block(nif);
   bool then_jump = nir_block_ends_in_jump(then_block) || nir_block_is_unreachable(then_block);
   bool else_jump = nir_block_ends_in_jump(else_block) || nir_block_is_unreachable(else_block);
   if (then_jump == else_jump)
      return false;

   /* If the continue from block is empty then return as there is nothing to
    * move.
    */
   if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list))
      return false;

   /* Even though this if statement has a jump on one side, we may still have
    * phis afterwards. Single-source phis can be produced by loop unrolling
    * or dead control-flow passes and are perfectly legal. Run a quick phi
    * removal on the block after the if to clean up any such phis.
    */
   nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));

   /* Finally, move the continue from branch after the if-statement. */
   nir_block *last_continue_from_blk = else_jump ? then_block : else_block;
   nir_block *first_continue_from_blk = else_jump ?
      nir_if_first_then_block(nif) : nir_if_first_else_block(nif);

   nir_cf_list tmp;
   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
                  nir_after_block(last_continue_from_blk));
   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));

   /* nir_cf_extract() invalidates dominance metadata, but it should still be
    * correct because of the specific type of transformation we did. Block
    * indices are not valid except for block_0's, which is all we care about for
    * nir_block_is_unreachable(). */
   impl->valid_metadata = impl->valid_metadata | nir_metadata_dominance | nir_metadata_block_index;

   return true;
}

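/* Recursively applies sanitize_if() to every if-statement in the given CF
 * list (descending into loop bodies) and returns whether any progress was made. */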
bool
sanitize_cf_list(nir_function_impl *impl, struct exec_list *cf_list)
{
   bool progress = false;
   foreach_list_typed(nir_cf_node, cf_node, node, cf_list) {
      switch (cf_node->type) {
      case nir_cf_node_block:
         break;
      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(cf_node);
         progress |= sanitize_cf_list(impl, &nif->then_list);
         progress |= sanitize_cf_list(impl, &nif->else_list);
         progress |= sanitize_if(impl, nif);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(cf_node);
         progress |= sanitize_cf_list(impl, &loop->body);
         break;
      }
      case nir_cf_node_function:
         unreachable("Invalid cf type");
      }
   }

   return progress;
}

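/* Collects per-buffer-resource access information: which SSBO bindings are
 * declared restrict and which resources are accessed with GLC or non-GLC
 * VMEM loads/stores. flags[0] is reserved for resources that can't be
 * attributed to a specific binding (aliased resources). */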
void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
{
   radv_pipeline_layout *pipeline_layout = ctx->options->layout;

   unsigned resource_flag_count = 1; /* +1 to reserve flags[0] for aliased resources */
   for (unsigned i = 0; i < pipeline_layout->num_sets; i++) {
      radv_descriptor_set_layout *layout = pipeline_layout->set[i].layout;
      ctx->resource_flag_offsets[i] = resource_flag_count;
      resource_flag_count += layout->binding_count;
   }
   ctx->buffer_resource_flags = std::vector<uint8_t>(resource_flag_count);

   nir_foreach_variable_with_modes(var, impl->function->shader, nir_var_mem_ssbo) {
      if (var->data.access & ACCESS_RESTRICT) {
         uint32_t offset = ctx->resource_flag_offsets[var->data.descriptor_set];
         ctx->buffer_resource_flags[offset + var->data.binding] |= buffer_is_restrict;
      }
   }

   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (!nir_intrinsic_has_access(intrin))
            continue;

         nir_ssa_def *res = NULL;
         unsigned access = nir_intrinsic_access(intrin);
         unsigned flags = 0;
         bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_ssbo: {
            if (nir_dest_is_divergent(intrin->dest) && (!glc || ctx->program->chip_class >= GFX8))
               flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
            res = intrin->src[0].ssa;
            break;
         }
         case nir_intrinsic_ssbo_atomic_add:
         case nir_intrinsic_ssbo_atomic_imin:
         case nir_intrinsic_ssbo_atomic_umin:
         case nir_intrinsic_ssbo_atomic_imax:
         case nir_intrinsic_ssbo_atomic_umax:
         case nir_intrinsic_ssbo_atomic_and:
         case nir_intrinsic_ssbo_atomic_or:
         case nir_intrinsic_ssbo_atomic_xor:
         case nir_intrinsic_ssbo_atomic_exchange:
         case nir_intrinsic_ssbo_atomic_comp_swap:
            flags |= has_glc_vmem_load | has_glc_vmem_store;
            res = intrin->src[0].ssa;
            break;
         case nir_intrinsic_store_ssbo:
            if (nir_src_is_divergent(intrin->src[2]) ||
                ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
                (intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
               flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
            res = intrin->src[1].ssa;
            break;
         case nir_intrinsic_load_global:
            if (!(access & ACCESS_NON_WRITEABLE))
               flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
            break;
         case nir_intrinsic_store_global:
            flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
            break;
         case nir_intrinsic_global_atomic_add:
         case nir_intrinsic_global_atomic_imin:
         case nir_intrinsic_global_atomic_umin:
         case nir_intrinsic_global_atomic_imax:
         case nir_intrinsic_global_atomic_umax:
         case nir_intrinsic_global_atomic_and:
         case nir_intrinsic_global_atomic_or:
         case nir_intrinsic_global_atomic_xor:
         case nir_intrinsic_global_atomic_exchange:
         case nir_intrinsic_global_atomic_comp_swap:
            flags |= has_glc_vmem_load | has_glc_vmem_store;
            break;
         case nir_intrinsic_image_deref_load:
            res = intrin->src[0].ssa;
            flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
            break;
         case nir_intrinsic_image_deref_store:
            res = intrin->src[0].ssa;
            flags |= (glc || ctx->program->chip_class == GFX6) ? has_glc_vmem_store : has_nonglc_vmem_store;
            break;
         case nir_intrinsic_image_deref_atomic_add:
         case nir_intrinsic_image_deref_atomic_umin:
         case nir_intrinsic_image_deref_atomic_imin:
         case nir_intrinsic_image_deref_atomic_umax:
         case nir_intrinsic_image_deref_atomic_imax:
         case nir_intrinsic_image_deref_atomic_and:
         case nir_intrinsic_image_deref_atomic_or:
         case nir_intrinsic_image_deref_atomic_xor:
         case nir_intrinsic_image_deref_atomic_exchange:
         case nir_intrinsic_image_deref_atomic_comp_swap:
            res = intrin->src[0].ssa;
            flags |= has_glc_vmem_load | has_glc_vmem_store;
            break;
         default:
            continue;
         }

         uint8_t *flags_ptr;
         uint32_t count;
         get_buffer_resource_flags(ctx, res, access, &flags_ptr, &count);

         for (unsigned i = 0; i < count; i++)
            flags_ptr[i] |= flags;
      }
   }
}

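/* If NIR range analysis can prove that the given iadd cannot wrap around
 * 2^32, mark it as no-unsigned-wrap so that later offset handling can treat
 * the addition as a safe unsigned offset calculation. */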
void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa)
{
   nir_ssa_scalar scalar;
   scalar.def = ssa;
   scalar.comp = 0;

   if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != nir_op_iadd)
      return;

   nir_alu_instr *add = nir_instr_as_alu(ssa->parent_instr);

   if (add->no_unsigned_wrap)
      return;

   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);

   if (nir_ssa_scalar_is_const(src0)) {
      nir_ssa_scalar tmp = src0;
      src0 = src1;
      src1 = tmp;
   }

   uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht,
                                               src1, &ctx->ub_config);
   add->no_unsigned_wrap =
      !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub,
                                   &ctx->ub_config);
}

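/* Walks all constant/UBO/SSBO access intrinsics and tries to mark their
 * uniform offsets as no-unsigned-wrap using the range analysis above. */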
void apply_nuw_to_offsets(isel_context *ctx, nir_function_impl *impl)
{
   nir_metadata_require(impl, nir_metadata_dominance);

   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_load_constant:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            if (!nir_src_is_divergent(intrin->src[0]))
               apply_nuw_to_ssa(ctx, intrin->src[0].ssa);
            break;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
            if (!nir_src_is_divergent(intrin->src[1]))
               apply_nuw_to_ssa(ctx, intrin->src[1].ssa);
            break;
         case nir_intrinsic_store_ssbo:
            if (!nir_src_is_divergent(intrin->src[2]))
               apply_nuw_to_ssa(ctx, intrin->src[2].ssa);
            break;
         default:
            break;
         }
      }
   }
}

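/* Chooses the register class for an SSA def: 1-bit booleans live in an SGPR
 * lane mask (one bit per lane), everything else gets an SGPR or VGPR class
 * sized in bytes. */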
RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, unsigned bitsize)
{
   if (bitsize == 1)
      return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components);
   else
      return RegClass::get(type, components * bitsize / 8u);
}

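/* Determines which VS/TES outputs are exported as parameters (and at which
 * offsets) and how many position exports are needed, filling in the
 * radv_vs_output_info consumed by the driver. */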
void
setup_vs_output_info(isel_context *ctx, nir_shader *nir,
                     bool export_prim_id, bool export_clip_dists,
                     radv_vs_output_info *outinfo)
{
   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
          sizeof(outinfo->vs_output_param_offset));

   outinfo->param_exports = 0;
   int pos_written = 0x1;
   if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
      pos_written |= 1 << 1;

   uint64_t mask = nir->info.outputs_written;
   while (mask) {
      int idx = u_bit_scan64(&mask);
      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER ||
          idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT ||
          ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
         if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
            outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
      }
   }
   if (outinfo->writes_layer &&
       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
      /* when ctx->options->key.has_multiview_view_index = true, the layer
       * variable isn't declared in NIR and it's isel's job to get the layer */
      outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
   }

   if (export_prim_id) {
      assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
      outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
   }

   ctx->export_clip_dists = export_clip_dists;
   ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
   ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);

   assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);

   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
      pos_written |= 1 << 2;
   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
      pos_written |= 1 << 3;

   outinfo->pos_exports = util_bitcount(pos_written);
}

void
setup_vs_variables(isel_context *ctx, nir_shader *nir)
{
   if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) {
      radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);

      /* TODO: check if the shader writes edge flags (not in Vulkan) */
      ctx->ngg_nogs_early_prim_export = true;
   } else if (ctx->stage == vertex_ls) {
      ctx->tcs_num_inputs = ctx->program->info->vs.num_linked_outputs;
   }

   if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
      /* We need to store the primitive IDs in LDS */
      unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
      ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) /
                                       ctx->program->lds_alloc_granule;
   }
}

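/* Sets up GS-specific state: LDS sizes and offsets for legacy and NGG GS,
 * and the ES stage type that feeds the GS. */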
void setup_gs_variables(isel_context *ctx, nir_shader *nir)
{
   if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
      ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */
   } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) {
      radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, false,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      unsigned ngg_gs_scratch_bytes = ctx->args->shader_info->so.num_outputs ? (44u * 4u) : (8u * 4u);
      unsigned ngg_emit_bytes = ctx->args->shader_info->ngg_info.ngg_emit_size * 4u;
      unsigned esgs_ring_bytes = ctx->args->shader_info->ngg_info.esgs_ring_size;

      ctx->ngg_gs_primflags_offset = ctx->args->shader_info->gs.gsvs_vertex_size;
      ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u;
      ctx->ngg_gs_emit_addr = esgs_ring_bytes;
      ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes;

      unsigned total_lds_bytes = esgs_ring_bytes + ngg_emit_bytes + ngg_gs_scratch_bytes;
      assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
      assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
      ctx->program->config->lds_size = (total_lds_bytes + ctx->program->lds_alloc_granule - 1) / ctx->program->lds_alloc_granule;

      /* Make sure we have enough room for emitted GS vertices */
      assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);

      /* See if the number of vertices and primitives are compile-time known */
      nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
      ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
   }

   if (ctx->stage.has(SWStage::VS))
      ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
   else if (ctx->stage.has(SWStage::TES))
      ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
}

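/* Computes TCS-related information: whether LS and HS invocations map 1:1
 * (tcs_in_out_eq), the number of patches per threadgroup and the LDS space
 * the tessellation I/O needs. */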
void
setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs)
{
   /* When the number of TCS input and output vertices are the same (typically 3):
    * - There is an equal number of LS and HS invocations
    * - In case of merged LSHS shaders, the LS and HS halves of the shader
    *   always process the exact same vertex. We can use this knowledge to optimize them.
    *
    * We don't set tcs_in_out_eq if the float controls differ because that might
    * involve different float modes for the same block and our optimizer
    * doesn't handle an instruction dominating another with a different mode.
    */
   ctx->tcs_in_out_eq =
      ctx->stage == vertex_tess_control_hs &&
      ctx->args->options->key.tcs.input_vertices == nir->info.tess.tcs_vertices_out &&
      vs->info.float_controls_execution_mode == nir->info.float_controls_execution_mode;

   if (ctx->tcs_in_out_eq) {
      ctx->tcs_temp_only_inputs = ~nir->info.tess.tcs_cross_invocation_inputs_read &
                                  ~nir->info.inputs_read_indirectly &
                                  nir->info.inputs_read;
   }

   ctx->tcs_num_inputs = ctx->program->info->tcs.num_linked_inputs;
   ctx->tcs_num_outputs = ctx->program->info->tcs.num_linked_outputs;
   ctx->tcs_num_patch_outputs = ctx->program->info->tcs.num_linked_patch_outputs;

   ctx->tcs_num_patches = get_tcs_num_patches(
                             ctx->args->options->key.tcs.input_vertices,
                             nir->info.tess.tcs_vertices_out,
                             ctx->tcs_num_inputs,
                             ctx->tcs_num_outputs,
                             ctx->tcs_num_patch_outputs,
                             ctx->args->options->tess_offchip_block_dw_size,
                             ctx->args->options->chip_class,
                             ctx->args->options->family);
   unsigned lds_size = calculate_tess_lds_size(
                          ctx->args->options->chip_class,
                          ctx->args->options->key.tcs.input_vertices,
                          nir->info.tess.tcs_vertices_out,
                          ctx->tcs_num_inputs,
                          ctx->tcs_num_patches,
                          ctx->tcs_num_outputs,
                          ctx->tcs_num_patch_outputs);

   ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches;
   ctx->args->shader_info->tcs.num_lds_blocks = lds_size;
   ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) /
                                    ctx->program->lds_alloc_granule;
}

void
setup_tes_variables(isel_context *ctx, nir_shader *nir)
{
   ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches;
   ctx->tcs_num_outputs = ctx->program->info->tes.num_linked_inputs;

   if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) {
      radv_vs_output_info *outinfo = &ctx->program->info->tes.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);

      /* Tess eval shaders can't write edge flags, so this can always be true. */
      ctx->ngg_nogs_early_prim_export = true;
   }
}

void
setup_variables(isel_context *ctx, nir_shader *nir)
{
   switch (nir->info.stage) {
   case MESA_SHADER_FRAGMENT: {
      break;
   }
   case MESA_SHADER_COMPUTE: {
      ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
                                       ctx->program->lds_alloc_granule;
      break;
   }
   case MESA_SHADER_VERTEX: {
      setup_vs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_GEOMETRY: {
      setup_gs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_TESS_CTRL: {
      break;
   }
   case MESA_SHADER_TESS_EVAL: {
      setup_tes_variables(ctx, nir);
      break;
   }
   default:
      unreachable("Unhandled shader stage.");
   }

   /* Make sure we fit the available LDS space. */
   assert((ctx->program->config->lds_size * ctx->program->lds_alloc_granule) <= ctx->program->lds_limit);
}

void
setup_nir(isel_context *ctx, nir_shader *nir)
{
   /* the variable setup has to be done before lower_io / CSE */
   setup_variables(ctx, nir);

   nir_convert_to_lcssa(nir, true, false);
   nir_lower_phis_to_scalar(nir);

   nir_function_impl *func = nir_shader_get_entrypoint(nir);
   nir_index_ssa_defs(func);
}

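/* XNACK (replay of memory instructions after a page fault) is enabled on the
 * GFX8/GFX9 APUs listed below. Presumably the backend needs to know this
 * because it constrains how registers written by still-outstanding loads may
 * be reused; that rationale is an inference, not taken from this file. */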
void
setup_xnack(Program *program)
{
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUs */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR:
      program->xnack_enabled = true;
      break;
   default:
      break;
   }
}

} /* end namespace */

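/* Prepares the isel_context for a single NIR shader: runs range and
 * divergence analysis, sanitizes control flow, assigns an ACO register class
 * to every NIR SSA def, and collects the PS input enable bits and constant
 * data. */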
void init_context(isel_context *ctx, nir_shader *shader)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   unsigned lane_mask_size = ctx->program->lane_mask.size();

   ctx->shader = shader;

   /* Init NIR range analysis. */
   ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
   ctx->ub_config.min_subgroup_size = 64;
   ctx->ub_config.max_subgroup_size = 64;
   if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) {
      ctx->ub_config.min_subgroup_size = ctx->options->key.cs.subgroup_size;
      ctx->ub_config.max_subgroup_size = ctx->options->key.cs.subgroup_size;
   }
   ctx->ub_config.max_work_group_invocations = 2048;
   ctx->ub_config.max_work_group_count[0] = 65535;
   ctx->ub_config.max_work_group_count[1] = 65535;
   ctx->ub_config.max_work_group_count[2] = 65535;
   ctx->ub_config.max_work_group_size[0] = 2048;
   ctx->ub_config.max_work_group_size[1] = 2048;
   ctx->ub_config.max_work_group_size[2] = 2048;
   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
      unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[i];
      unsigned dfmt = attrib_format & 0xf;
      unsigned nfmt = (attrib_format >> 4) & 0x7;

      uint32_t max = UINT32_MAX;
      if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
         max = 0x3f800000u;
      } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
                 nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) {
         bool uscaled = nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED;
         switch (dfmt) {
         case V_008F0C_BUF_DATA_FORMAT_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
            max = uscaled ? 0x437f0000u : UINT8_MAX;
            break;
         case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
         case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
            max = uscaled ? 0x447fc000u : 1023;
            break;
         case V_008F0C_BUF_DATA_FORMAT_10_11_11:
         case V_008F0C_BUF_DATA_FORMAT_11_11_10:
            max = uscaled ? 0x44ffe000u : 2047;
            break;
         case V_008F0C_BUF_DATA_FORMAT_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
            max = uscaled ? 0x477fff00u : UINT16_MAX;
            break;
         case V_008F0C_BUF_DATA_FORMAT_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
            max = uscaled ? 0x4f800000u : UINT32_MAX;
            break;
         }
      }
      ctx->ub_config.vertex_attrib_max[i] = max;
   }

   nir_divergence_analysis(shader);
   nir_opt_uniform_atomics(shader);

   fill_desc_set_info(ctx, impl);

   apply_nuw_to_offsets(ctx, impl);

   /* sanitize control flow */
   nir_metadata_require(impl, nir_metadata_dominance);
   sanitize_cf_list(impl, &impl->body);
   nir_metadata_preserve(impl, ~nir_metadata_block_index);

   /* we'll need this for isel */
   nir_metadata_require(impl, nir_metadata_block_index);

   if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
      fprintf(stderr, "NIR shader before instruction selection:\n");
      nir_print_shader(shader, stderr);
   }

   ctx->first_temp_id = ctx->program->peekAllocationId();
   ctx->program->allocateRange(impl->ssa_alloc);
   RegClass *regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;

   unsigned spi_ps_inputs = 0;

   std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};

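   /* Iterate to a fixed point: a phi can use defs whose register class isn't
    * known yet, so keep re-running the pass until no class changes anymore. */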
   /* TODO: make this recursive to improve compile times and merge with fill_desc_set_info() */
   bool done = false;
   while (!done) {
      done = true;
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            switch(instr->type) {
            case nir_instr_type_alu: {
               nir_alu_instr *alu_instr = nir_instr_as_alu(instr);
               RegType type = RegType::sgpr;
               switch(alu_instr->op) {
               case nir_op_fmul:
               case nir_op_fadd:
               case nir_op_fsub:
               case nir_op_fmax:
               case nir_op_fmin:
               case nir_op_fneg:
               case nir_op_fabs:
               case nir_op_fsat:
               case nir_op_fsign:
               case nir_op_frcp:
               case nir_op_frsq:
               case nir_op_fsqrt:
               case nir_op_fexp2:
               case nir_op_flog2:
               case nir_op_ffract:
               case nir_op_ffloor:
               case nir_op_fceil:
               case nir_op_ftrunc:
               case nir_op_fround_even:
               case nir_op_fsin:
               case nir_op_fcos:
               case nir_op_f2f16:
               case nir_op_f2f16_rtz:
               case nir_op_f2f16_rtne:
               case nir_op_f2f32:
               case nir_op_f2f64:
               case nir_op_u2f16:
               case nir_op_u2f32:
               case nir_op_u2f64:
               case nir_op_i2f16:
               case nir_op_i2f32:
               case nir_op_i2f64:
               case nir_op_pack_half_2x16_split:
               case nir_op_unpack_half_2x16_split_x:
               case nir_op_unpack_half_2x16_split_y:
               case nir_op_fddx:
               case nir_op_fddy:
               case nir_op_fddx_fine:
               case nir_op_fddy_fine:
               case nir_op_fddx_coarse:
               case nir_op_fddy_coarse:
               case nir_op_fquantize2f16:
               case nir_op_ldexp:
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_face_index:
               case nir_op_cube_face_coord:
                  type = RegType::vgpr;
                  break;
               case nir_op_f2i16:
               case nir_op_f2u16:
               case nir_op_f2i32:
               case nir_op_f2u32:
               case nir_op_f2i64:
               case nir_op_f2u64:
               case nir_op_b2i8:
               case nir_op_b2i16:
               case nir_op_b2i32:
               case nir_op_b2i64:
               case nir_op_b2b32:
               case nir_op_b2f16:
               case nir_op_b2f32:
               case nir_op_mov:
                  type = nir_dest_is_divergent(alu_instr->dest.dest) ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_op_bcsel:
                  type = nir_dest_is_divergent(alu_instr->dest.dest) ? RegType::vgpr : RegType::sgpr;
                  /* fallthrough */
               default:
                  for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                     if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }

               RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components, alu_instr->dest.dest.ssa.bit_size);
               regclasses[alu_instr->dest.dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_load_const: {
               unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_intrinsic: {
               nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
               if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
                  break;
               RegType type = RegType::sgpr;
               switch(intrinsic->intrinsic) {
               case nir_intrinsic_load_push_constant:
               case nir_intrinsic_load_work_group_id:
               case nir_intrinsic_load_num_work_groups:
               case nir_intrinsic_load_subgroup_id:
               case nir_intrinsic_load_num_subgroups:
               case nir_intrinsic_load_first_vertex:
               case nir_intrinsic_load_base_instance:
               case nir_intrinsic_get_ssbo_size:
               case nir_intrinsic_vote_all:
               case nir_intrinsic_vote_any:
               case nir_intrinsic_read_first_invocation:
               case nir_intrinsic_read_invocation:
               case nir_intrinsic_first_invocation:
               case nir_intrinsic_ballot:
                  type = RegType::sgpr;
                  break;
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_sample_mask_in:
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_output:
               case nir_intrinsic_load_input_vertex:
               case nir_intrinsic_load_per_vertex_input:
               case nir_intrinsic_load_per_vertex_output:
               case nir_intrinsic_load_vertex_id:
               case nir_intrinsic_load_vertex_id_zero_base:
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_model:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset:
               case nir_intrinsic_load_interpolated_input:
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos:
               case nir_intrinsic_load_layer_id:
               case nir_intrinsic_load_local_invocation_id:
               case nir_intrinsic_load_local_invocation_index:
               case nir_intrinsic_load_subgroup_invocation:
               case nir_intrinsic_load_tess_coord:
               case nir_intrinsic_write_invocation_amd:
               case nir_intrinsic_mbcnt_amd:
               case nir_intrinsic_load_instance_id:
               case nir_intrinsic_ssbo_atomic_add:
               case nir_intrinsic_ssbo_atomic_imin:
               case nir_intrinsic_ssbo_atomic_umin:
               case nir_intrinsic_ssbo_atomic_imax:
               case nir_intrinsic_ssbo_atomic_umax:
               case nir_intrinsic_ssbo_atomic_and:
               case nir_intrinsic_ssbo_atomic_or:
               case nir_intrinsic_ssbo_atomic_xor:
               case nir_intrinsic_ssbo_atomic_exchange:
               case nir_intrinsic_ssbo_atomic_comp_swap:
               case nir_intrinsic_global_atomic_add:
               case nir_intrinsic_global_atomic_imin:
               case nir_intrinsic_global_atomic_umin:
               case nir_intrinsic_global_atomic_imax:
               case nir_intrinsic_global_atomic_umax:
               case nir_intrinsic_global_atomic_and:
               case nir_intrinsic_global_atomic_or:
               case nir_intrinsic_global_atomic_xor:
               case nir_intrinsic_global_atomic_exchange:
               case nir_intrinsic_global_atomic_comp_swap:
               case nir_intrinsic_image_deref_atomic_add:
               case nir_intrinsic_image_deref_atomic_umin:
               case nir_intrinsic_image_deref_atomic_imin:
               case nir_intrinsic_image_deref_atomic_umax:
               case nir_intrinsic_image_deref_atomic_imax:
               case nir_intrinsic_image_deref_atomic_and:
               case nir_intrinsic_image_deref_atomic_or:
               case nir_intrinsic_image_deref_atomic_xor:
               case nir_intrinsic_image_deref_atomic_exchange:
               case nir_intrinsic_image_deref_atomic_comp_swap:
               case nir_intrinsic_image_deref_size:
               case nir_intrinsic_shared_atomic_add:
               case nir_intrinsic_shared_atomic_imin:
               case nir_intrinsic_shared_atomic_umin:
               case nir_intrinsic_shared_atomic_imax:
               case nir_intrinsic_shared_atomic_umax:
               case nir_intrinsic_shared_atomic_and:
               case nir_intrinsic_shared_atomic_or:
               case nir_intrinsic_shared_atomic_xor:
               case nir_intrinsic_shared_atomic_exchange:
               case nir_intrinsic_shared_atomic_comp_swap:
               case nir_intrinsic_shared_atomic_fadd:
               case nir_intrinsic_load_scratch:
               case nir_intrinsic_load_invocation_id:
               case nir_intrinsic_load_primitive_id:
                  type = RegType::vgpr;
                  break;
               case nir_intrinsic_shuffle:
               case nir_intrinsic_quad_broadcast:
               case nir_intrinsic_quad_swap_horizontal:
               case nir_intrinsic_quad_swap_vertical:
               case nir_intrinsic_quad_swap_diagonal:
               case nir_intrinsic_quad_swizzle_amd:
               case nir_intrinsic_masked_swizzle_amd:
               case nir_intrinsic_inclusive_scan:
               case nir_intrinsic_exclusive_scan:
               case nir_intrinsic_reduce:
               case nir_intrinsic_load_ubo:
               case nir_intrinsic_load_ssbo:
               case nir_intrinsic_load_global:
               case nir_intrinsic_vulkan_resource_index:
               case nir_intrinsic_load_shared:
                  type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_intrinsic_load_view_index:
                  type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
                  break;
               default:
                  for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) {
                     if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }
               RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, intrinsic->dest.ssa.bit_size);
               regclasses[intrinsic->dest.ssa.index] = rc;

               switch(intrinsic->intrinsic) {
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset: {
                  glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
                  spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode);
                  break;
               }
               case nir_intrinsic_load_barycentric_model:
                  spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1);
                  break;
               case nir_intrinsic_load_front_face:
                  spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1);
                  break;
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos: {
                  uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
                  for (unsigned i = 0; i < 4; i++) {
                     if (mask & (1 << i))
                        spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
                  }
                  break;
               }
               case nir_intrinsic_load_sample_id:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  break;
               case nir_intrinsic_load_sample_mask_in:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
                  break;
               default:
                  break;
               }
               break;
            }
            case nir_instr_type_tex: {
               nir_tex_instr* tex = nir_instr_as_tex(instr);
               RegType type = nir_dest_is_divergent(tex->dest) ? RegType::vgpr : RegType::sgpr;

               if (tex->op == nir_texop_texture_samples) {
                  assert(!tex->dest.ssa.divergent);
               }

               RegClass rc = get_reg_class(ctx, type, tex->dest.ssa.num_components,
                                           tex->dest.ssa.bit_size);
               regclasses[tex->dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_parallel_copy: {
               nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) {
                  regclasses[entry->dest.ssa.index] = regclasses[entry->src.ssa->index];
               }
               break;
            }
            case nir_instr_type_ssa_undef: {
               unsigned num_components = nir_instr_as_ssa_undef(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_ssa_undef(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_ssa_undef(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_phi: {
               nir_phi_instr* phi = nir_instr_as_phi(instr);
               RegType type;
               unsigned size = phi->dest.ssa.num_components;

               if (phi->dest.ssa.bit_size == 1) {
                  assert(size == 1 && "multiple components not yet supported on boolean phis.");
                  type = RegType::sgpr;
                  size *= lane_mask_size;
                  regclasses[phi->dest.ssa.index] = RegClass(type, size);
                  break;
               }

               if (nir_dest_is_divergent(phi->dest)) {
                  type = RegType::vgpr;
               } else {
                  type = RegType::sgpr;
                  nir_foreach_phi_src (src, phi) {
                     if (regclasses[src->src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                     if (regclasses[src->src.ssa->index].type() == RegType::none)
                        done = false;
                  }
               }

               RegClass rc = get_reg_class(ctx, type, phi->dest.ssa.num_components, phi->dest.ssa.bit_size);
               if (rc != regclasses[phi->dest.ssa.index]) {
                  done = false;
               } else {
                  nir_foreach_phi_src(src, phi)
                     assert(regclasses[src->src.ssa->index].size() == rc.size());
               }
               regclasses[phi->dest.ssa.index] = rc;
               break;
            }
            default:
               break;
            }
         }
      }
   }

   if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) {
      /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   if (!(spi_ps_inputs & 0x7F)) {
      /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   ctx->program->config->spi_ps_input_ena = spi_ps_inputs;
   ctx->program->config->spi_ps_input_addr = spi_ps_inputs;

   ctx->cf_info.nir_to_aco.reset(nir_to_aco.release());

   /* align and copy constant data */
   while (ctx->program->constant_data.size() % 4u)
      ctx->program->constant_data.push_back(0);
   ctx->constant_data_offset = ctx->program->constant_data.size();
   ctx->program->constant_data.insert(ctx->program->constant_data.end(),
                                      (uint8_t*)shader->constant_data,
                                      (uint8_t*)shader->constant_data + shader->constant_data_size);
}

void cleanup_context(isel_context *ctx)
{
   _mesa_hash_table_destroy(ctx->range_ht, NULL);
}

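/* Creates and initializes the isel_context for a (possibly merged) pipeline
 * stage: determines the SW/HW stage combination, the workgroup size and a
 * few per-family features before instruction selection starts. */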
isel_context
setup_isel_context(Program* program,
                   unsigned shader_count,
                   struct nir_shader *const *shaders,
                   ac_shader_config* config,
                   struct radv_shader_args *args,
                   bool is_gs_copy_shader)
{
   SWStage sw_stage = SWStage::None;
   for (unsigned i = 0; i < shader_count; i++) {
      switch (shaders[i]->info.stage) {
      case MESA_SHADER_VERTEX:
         sw_stage = sw_stage | SWStage::VS;
         break;
      case MESA_SHADER_TESS_CTRL:
         sw_stage = sw_stage | SWStage::TCS;
         break;
      case MESA_SHADER_TESS_EVAL:
         sw_stage = sw_stage | SWStage::TES;
         break;
      case MESA_SHADER_GEOMETRY:
         sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
         break;
      case MESA_SHADER_FRAGMENT:
         sw_stage = sw_stage | SWStage::FS;
         break;
      case MESA_SHADER_COMPUTE:
         sw_stage = sw_stage | SWStage::CS;
         break;
      default:
         unreachable("Shader stage not implemented");
      }
   }
   bool gfx9_plus = args->options->chip_class >= GFX9;
   bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
   HWStage hw_stage { };
   if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
      hw_stage = HWStage::ES;
   else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: VS without GS uses the HW GS stage */
   else if (sw_stage == SWStage::GS)
      hw_stage = HWStage::GS;
   else if (sw_stage == SWStage::FS)
      hw_stage = HWStage::FS;
   else if (sw_stage == SWStage::CS)
      hw_stage = HWStage::CS;
   else if (sw_stage == SWStage::GSCopy)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::VS_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: VS+GS merged into an NGG GS */
   else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
      hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
   else if (sw_stage == SWStage::TCS)
      hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
   else if (sw_stage == SWStage::VS_TCS)
      hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: TES without GS */
   else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
   else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: TES+GS merged into an NGG GS */
   else
      unreachable("Shader stage not implemented");

   init_program(program, Stage { hw_stage, sw_stage }, args->shader_info,
                args->options->chip_class, args->options->family, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
      /* PS and legacy VS have separate waves, no workgroups */
      program->workgroup_size = program->wave_size;
   } else if (program->stage == compute_cs) {
      /* CS sets the workgroup size explicitly */
      program->workgroup_size = shaders[0]->info.cs.local_size[0] *
                                shaders[0]->info.cs.local_size[1] *
                                shaders[0]->info.cs.local_size[2];
   } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
      /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */
      program->workgroup_size = program->wave_size;
   } else if (program->stage.hw == HWStage::GS) {
      /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
      assert(program->chip_class >= GFX9);
      uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t gs_instr_prims_in_subgrp = G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp);
      program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1);
   } else if (program->stage == vertex_ls) {
      /* Unmerged LS operates in workgroups */
      program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */
   } else if (program->stage == tess_control_hs) {
      /* Unmerged HS operates in workgroups, size is determined by the output vertices */
      setup_tcs_info(&ctx, shaders[0], NULL);
      program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out;
   } else if (program->stage == vertex_tess_control_hs) {
      /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
      setup_tcs_info(&ctx, shaders[1], shaders[0]);
      program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
   } else if (program->stage.hw == HWStage::NGG) {
      gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;
      unsigned num_gs_invocations = (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1;

      /* Max ES (SW VS/TES) threads */
      uint32_t max_esverts = ngg_info.hw_max_esverts;
      /* Max GS input primitives = max GS threads */
      uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations;
      /* Maximum output vertices -- each thread can export only 1 vertex */
      uint32_t max_out_vtx = ngg_info.max_out_verts;
      /* Maximum output primitives -- each thread can export only 1 or 0 primitives */
      uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor;

      program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm);
   } else {
      unreachable("Unsupported shader stage.");
   }

   calc_min_waves(program);
   program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
   program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);

   unsigned scratch_size = 0;
   if (program->stage == gs_copy_vs) {
      assert(shader_count == 1);
      setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
   } else {
      for (unsigned i = 0; i < shader_count; i++) {
         nir_shader *nir = shaders[i];
         setup_nir(&ctx, nir);
      }

      for (unsigned i = 0; i < shader_count; i++)
         scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
   }

   ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->loop_nest_depth = 0;
   ctx.block->kind = block_kind_top_level;

   setup_xnack(program);
   program->sram_ecc_enabled = args->options->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->has_fast_fma32 = program->chip_class >= GFX9;
   if (args->options->family == CHIP_TAHITI || args->options->family == CHIP_CARRIZO || args->options->family == CHIP_HAWAII)
      program->has_fast_fma32 = true;

   return ctx;
}

}