/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "common/ac_nir.h"
#include "common/sid.h"

#include "nir_control_flow.h"

#include <vector>

namespace aco {

namespace {

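/* Return true if the given block is the header (first) block of a loop. */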
bool
is_loop_header_block(nir_block* block)
{
   return block->cf_node.parent->type == nir_cf_node_loop &&
          block == nir_loop_first_block(nir_cf_node_as_loop(block->cf_node.parent));
}

/* similar to nir_block_is_unreachable(), but does not require dominance information */
bool
is_block_reachable(nir_function_impl* impl, nir_block* known_reachable, nir_block* block)
{
   if (block == nir_start_block(impl) || block == known_reachable)
      return true;

   /* skip loop back-edges */
   if (is_loop_header_block(block)) {
      nir_loop* loop = nir_cf_node_as_loop(block->cf_node.parent);
      nir_block* preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop));
      return is_block_reachable(impl, known_reachable, preheader);
   }

   set_foreach (block->predecessors, entry) {
      if (is_block_reachable(impl, known_reachable, (nir_block*)entry->key))
         return true;
   }

   return false;
}

/* Check whether the given SSA def is only used by cross-lane instructions. */
bool
only_used_by_cross_lane_instrs(nir_def* ssa, bool follow_phis = true)
{
   nir_foreach_use (src, ssa) {
      switch (nir_src_parent_instr(src)->type) {
      case nir_instr_type_alu: {
         nir_alu_instr* alu = nir_instr_as_alu(nir_src_parent_instr(src));
         if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y)
            return false;
         if (!only_used_by_cross_lane_instrs(&alu->def, follow_phis))
            return false;

         continue;
      }
      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
         if (intrin->intrinsic != nir_intrinsic_read_invocation &&
             intrin->intrinsic != nir_intrinsic_read_first_invocation &&
             intrin->intrinsic != nir_intrinsic_lane_permute_16_amd)
            return false;

         continue;
      }
      case nir_instr_type_phi: {
         /* Don't follow more than one phi; this avoids infinite loops. */
         if (!follow_phis)
            return false;

         nir_phi_instr* phi = nir_instr_as_phi(nir_src_parent_instr(src));
         if (!only_used_by_cross_lane_instrs(&phi->def, false))
            return false;

         continue;
      }
      default: return false;
      }
   }

   return true;
}

/* If one side of a divergent IF ends in a branch and the other doesn't, we
 * might have to emit the contents of the side without the branch at the merge
 * block instead. This is so that we can use any SGPR live-out of the side
 * without the branch, without creating a linear phi in the invert or merge block. */
bool
sanitize_if(nir_function_impl* impl, nir_if* nif)
{
   // TODO: skip this if the condition is uniform and there are no divergent breaks/continues?

   nir_block* then_block = nir_if_last_then_block(nif);
   nir_block* else_block = nir_if_last_else_block(nif);
   bool then_jump = nir_block_ends_in_jump(then_block) ||
                    !is_block_reachable(impl, nir_if_first_then_block(nif), then_block);
   bool else_jump = nir_block_ends_in_jump(else_block) ||
                    !is_block_reachable(impl, nir_if_first_else_block(nif), else_block);
   if (then_jump == else_jump)
      return false;

   /* If the continue-from block is empty, there is nothing to move, so return. */
   if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list))
      return false;

   /* Even though this if statement has a jump on one side, we may still have
    * phis afterwards. Single-source phis can be produced by loop unrolling
    * or dead control-flow passes and are perfectly legal. Run a quick phi
    * removal on the block after the if to clean up any such phis.
    */
   nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));

   /* Finally, move the continue-from branch after the if-statement. */
   nir_block* last_continue_from_blk = else_jump ? then_block : else_block;
   nir_block* first_continue_from_blk =
      else_jump ? nir_if_first_then_block(nif) : nir_if_first_else_block(nif);

   nir_cf_list tmp;
   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
                  nir_after_block(last_continue_from_blk));
   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));

   return true;
}

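/* Recursively apply sanitize_if() to every if-statement in the given
 * control-flow list. Returns true if any control flow was changed. */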
bool
sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list)
{
   bool progress = false;
   foreach_list_typed (nir_cf_node, cf_node, node, cf_list) {
      switch (cf_node->type) {
      case nir_cf_node_block: break;
      case nir_cf_node_if: {
         nir_if* nif = nir_cf_node_as_if(cf_node);
         progress |= sanitize_cf_list(impl, &nif->then_list);
         progress |= sanitize_cf_list(impl, &nif->else_list);
         progress |= sanitize_if(impl, nif);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop* loop = nir_cf_node_as_loop(cf_node);
         assert(!nir_loop_has_continue_construct(loop));
         progress |= sanitize_cf_list(impl, &loop->body);
         break;
      }
      case nir_cf_node_function: unreachable("Invalid cf type");
      }
   }

   return progress;
}

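/* If the given def is an iadd, use NIR's range analysis to set the
 * no_unsigned_wrap flag when the addition provably cannot overflow. */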
void
apply_nuw_to_ssa(isel_context* ctx, nir_def* ssa)
{
   nir_scalar scalar;
   scalar.def = ssa;
   scalar.comp = 0;

   if (!nir_scalar_is_alu(scalar) || nir_scalar_alu_op(scalar) != nir_op_iadd)
      return;

   nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr);

   if (add->no_unsigned_wrap)
      return;

   nir_scalar src0 = nir_scalar_chase_alu_src(scalar, 0);
   nir_scalar src1 = nir_scalar_chase_alu_src(scalar, 1);

   if (nir_scalar_is_const(src0)) {
      nir_scalar tmp = src0;
      src0 = src1;
      src1 = tmp;
   }

   uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config);
   add->no_unsigned_wrap =
      !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config);
}

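/* Apply the no-unsigned-wrap flag to the offset sources of memory access intrinsics. */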
void
apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl)
{
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_load_constant:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            if (!nir_src_is_divergent(intrin->src[0]))
               apply_nuw_to_ssa(ctx, intrin->src[0].ssa);
            break;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
            if (!nir_src_is_divergent(intrin->src[1]))
               apply_nuw_to_ssa(ctx, intrin->src[1].ssa);
            break;
         case nir_intrinsic_store_ssbo:
            if (!nir_src_is_divergent(intrin->src[2]))
               apply_nuw_to_ssa(ctx, intrin->src[2].ssa);
            break;
         case nir_intrinsic_load_scratch: apply_nuw_to_ssa(ctx, intrin->src[0].ssa); break;
         case nir_intrinsic_store_scratch:
         case nir_intrinsic_load_smem_amd: apply_nuw_to_ssa(ctx, intrin->src[1].ssa); break;
         default: break;
         }
      }
   }
}

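/* Choose the register class for a value of the given type, component count and
 * bit size. 1-bit booleans are stored as lane masks in SGPRs. */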
RegClass
get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize)
{
   if (bitsize == 1)
      return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components);
   else
      return RegClass::get(type, components * bitsize / 8u);
}

void
setup_tcs_info(isel_context* ctx)
{
   ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq;
   ctx->tcs_temp_only_inputs = ctx->program->info.vs.tcs_temp_only_input_mask;
}

void
setup_lds_size(isel_context* ctx, nir_shader* nir)
{
   /* TCS and GFX9 GS are special cases, already in units of the allocation granule. */
   if (ctx->stage.has(SWStage::TCS))
      ctx->program->config->lds_size = ctx->program->info.tcs.num_lds_blocks;
   else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->options->gfx_level >= GFX9)
      ctx->program->config->lds_size = ctx->program->info.gfx9_gs_ring_lds_size;
   else
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);

   /* Make sure we fit within the available LDS space. */
   assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <=
          ctx->program->dev.lds_limit);
}

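/* Run the NIR passes required right before instruction selection: convert to
 * LCSSA, scalarize phis and re-index SSA definitions. */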
void
setup_nir(isel_context* ctx, nir_shader* nir)
{
   nir_convert_to_lcssa(nir, true, false);
   nir_lower_phis_to_scalar(nir, true);

   nir_function_impl* func = nir_shader_get_entrypoint(nir);
   nir_index_ssa_defs(func);
}

} /* end namespace */

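/* Initialize the instruction selection context for the given shader: set up NIR
 * range analysis, run divergence analysis, sanitize control flow and determine a
 * register class for every SSA definition. */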
void
init_context(isel_context* ctx, nir_shader* shader)
{
   nir_function_impl* impl = nir_shader_get_entrypoint(shader);
   ctx->shader = shader;

   /* Init NIR range analysis. */
   ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
   ctx->ub_config.min_subgroup_size = ctx->program->wave_size;
   ctx->ub_config.max_subgroup_size = ctx->program->wave_size;
   ctx->ub_config.max_workgroup_invocations = 2048;
   ctx->ub_config.max_workgroup_count[0] = 65535;
   ctx->ub_config.max_workgroup_count[1] = 65535;
   ctx->ub_config.max_workgroup_count[2] = 65535;
   ctx->ub_config.max_workgroup_size[0] = 2048;
   ctx->ub_config.max_workgroup_size[1] = 2048;
   ctx->ub_config.max_workgroup_size[2] = 2048;

   nir_divergence_analysis(shader);
   if (nir_opt_uniform_atomics(shader) && nir_lower_int64(shader))
      nir_divergence_analysis(shader);

   apply_nuw_to_offsets(ctx, impl);

   /* sanitize control flow */
   sanitize_cf_list(impl, &impl->body);
   nir_metadata_preserve(impl, nir_metadata_none);

   /* we'll need these for isel */
   nir_metadata_require(impl, nir_metadata_block_index);

   if (ctx->options->dump_preoptir) {
      fprintf(stderr, "NIR shader before instruction selection:\n");
      nir_print_shader(shader, stderr);
   }

   ctx->first_temp_id = ctx->program->peekAllocationId();
   ctx->program->allocateRange(impl->ssa_alloc);
   RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;

   std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};

   /* TODO: make this recursive to improve compile times */
   bool done = false;
   while (!done) {
      done = true;
      nir_foreach_block (block, impl) {
         nir_foreach_instr (instr, block) {
            switch (instr->type) {
            case nir_instr_type_alu: {
               nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
               RegType type = alu_instr->def.divergent ? RegType::vgpr : RegType::sgpr;
               switch (alu_instr->op) {
               case nir_op_fmul:
               case nir_op_fmulz:
               case nir_op_fadd:
               case nir_op_fsub:
               case nir_op_ffma:
               case nir_op_ffmaz:
               case nir_op_fmax:
               case nir_op_fmin:
               case nir_op_fneg:
               case nir_op_fabs:
               case nir_op_fsat:
               case nir_op_fsign:
               case nir_op_frcp:
               case nir_op_frsq:
               case nir_op_fsqrt:
               case nir_op_fexp2:
               case nir_op_flog2:
               case nir_op_ffract:
               case nir_op_ffloor:
               case nir_op_fceil:
               case nir_op_ftrunc:
               case nir_op_fround_even:
               case nir_op_fsin_amd:
               case nir_op_fcos_amd:
               case nir_op_f2f16:
               case nir_op_f2f16_rtz:
               case nir_op_f2f16_rtne:
               case nir_op_f2f32:
               case nir_op_f2f64:
               case nir_op_u2f16:
               case nir_op_u2f32:
               case nir_op_u2f64:
               case nir_op_i2f16:
               case nir_op_i2f32:
               case nir_op_i2f64:
               case nir_op_pack_half_2x16_rtz_split:
               case nir_op_pack_half_2x16_split:
               case nir_op_pack_unorm_2x16:
               case nir_op_pack_snorm_2x16:
               case nir_op_pack_uint_2x16:
               case nir_op_pack_sint_2x16:
               case nir_op_unpack_half_2x16_split_x:
               case nir_op_unpack_half_2x16_split_y:
               case nir_op_fddx:
               case nir_op_fddy:
               case nir_op_fddx_fine:
               case nir_op_fddy_fine:
               case nir_op_fddx_coarse:
               case nir_op_fddy_coarse:
               case nir_op_fquantize2f16:
               case nir_op_ldexp:
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_amd:
               case nir_op_msad_4x8:
               case nir_op_udot_4x8_uadd:
               case nir_op_sdot_4x8_iadd:
               case nir_op_sudot_4x8_iadd:
               case nir_op_udot_4x8_uadd_sat:
               case nir_op_sdot_4x8_iadd_sat:
               case nir_op_sudot_4x8_iadd_sat:
               case nir_op_udot_2x16_uadd:
               case nir_op_sdot_2x16_iadd:
               case nir_op_udot_2x16_uadd_sat:
               case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break;
               case nir_op_f2i16:
               case nir_op_f2u16:
               case nir_op_f2i32:
               case nir_op_f2u32:
               case nir_op_b2i8:
               case nir_op_b2i16:
               case nir_op_b2i32:
               case nir_op_b2b32:
               case nir_op_b2f16:
               case nir_op_b2f32:
               case nir_op_mov: break;
               case nir_op_iabs:
               case nir_op_iadd:
               case nir_op_iadd_sat:
               case nir_op_uadd_sat:
               case nir_op_isub:
               case nir_op_isub_sat:
               case nir_op_usub_sat:
               case nir_op_imul:
               case nir_op_imin:
               case nir_op_imax:
               case nir_op_umin:
               case nir_op_umax:
               case nir_op_ishl:
               case nir_op_ishr:
               case nir_op_ushr:
                  /* packed 16-bit instructions have to be VGPR */
                  type = alu_instr->def.num_components == 2 ? RegType::vgpr : type;
                  FALLTHROUGH;
               default:
                  for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                     if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }

               RegClass rc =
                  get_reg_class(ctx, type, alu_instr->def.num_components, alu_instr->def.bit_size);
               regclasses[alu_instr->def.index] = rc;
               break;
            }
            case nir_instr_type_load_const: {
               unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_intrinsic: {
               nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
               if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
                  break;
               if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) {
                  regclasses[intrinsic->def.index] =
                     RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 +
                                                     nir_intrinsic_base(intrinsic))
                        .as_linear();
                  break;
               }
               RegType type = RegType::sgpr;
               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_push_constant:
               case nir_intrinsic_load_workgroup_id:
               case nir_intrinsic_load_num_workgroups:
               case nir_intrinsic_load_ray_launch_size:
               case nir_intrinsic_load_sbt_base_amd:
               case nir_intrinsic_load_subgroup_id:
               case nir_intrinsic_load_num_subgroups:
               case nir_intrinsic_load_first_vertex:
               case nir_intrinsic_load_base_instance:
               case nir_intrinsic_vote_all:
               case nir_intrinsic_vote_any:
               case nir_intrinsic_read_first_invocation:
               case nir_intrinsic_as_uniform:
               case nir_intrinsic_read_invocation:
               case nir_intrinsic_first_invocation:
               case nir_intrinsic_ballot:
               case nir_intrinsic_ballot_relaxed:
               case nir_intrinsic_bindless_image_samples:
               case nir_intrinsic_load_scalar_arg_amd:
               case nir_intrinsic_load_lds_ngg_scratch_base_amd:
               case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
               case nir_intrinsic_load_smem_amd: type = RegType::sgpr; break;
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_output:
               case nir_intrinsic_load_input_vertex:
               case nir_intrinsic_load_per_vertex_input:
               case nir_intrinsic_load_per_vertex_output:
               case nir_intrinsic_load_vertex_id_zero_base:
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_model:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_offset:
               case nir_intrinsic_load_interpolated_input:
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_frag_shading_rate:
               case nir_intrinsic_load_sample_pos:
               case nir_intrinsic_load_local_invocation_id:
               case nir_intrinsic_load_local_invocation_index:
               case nir_intrinsic_load_subgroup_invocation:
               case nir_intrinsic_load_ray_launch_id:
               case nir_intrinsic_load_tess_coord:
               case nir_intrinsic_write_invocation_amd:
               case nir_intrinsic_mbcnt_amd:
               case nir_intrinsic_lane_permute_16_amd:
               case nir_intrinsic_load_instance_id:
               case nir_intrinsic_ssbo_atomic:
               case nir_intrinsic_ssbo_atomic_swap:
               case nir_intrinsic_global_atomic_amd:
               case nir_intrinsic_global_atomic_swap_amd:
               case nir_intrinsic_bindless_image_atomic:
               case nir_intrinsic_bindless_image_atomic_swap:
               case nir_intrinsic_bindless_image_size:
               case nir_intrinsic_shared_atomic:
               case nir_intrinsic_shared_atomic_swap:
               case nir_intrinsic_load_scratch:
               case nir_intrinsic_load_invocation_id:
               case nir_intrinsic_load_primitive_id:
               case nir_intrinsic_load_typed_buffer_amd:
               case nir_intrinsic_load_buffer_amd:
               case nir_intrinsic_load_initial_edgeflags_amd:
               case nir_intrinsic_gds_atomic_add_amd:
               case nir_intrinsic_bvh64_intersect_ray_amd:
               case nir_intrinsic_load_vector_arg_amd:
               case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
               case nir_intrinsic_ordered_xfb_counter_add_amd:
               case nir_intrinsic_cmat_muladd_amd: type = RegType::vgpr; break;
               case nir_intrinsic_load_shared:
               case nir_intrinsic_load_shared2_amd:
                  /* When the result of these loads is only used by cross-lane instructions,
                   * it is beneficial to use a VGPR destination. This allows the s_waitcnt
                   * to be placed further down, which decreases latency.
                   */
                  if (only_used_by_cross_lane_instrs(&intrinsic->def)) {
                     type = RegType::vgpr;
                     break;
                  }
                  FALLTHROUGH;
               case nir_intrinsic_shuffle:
               case nir_intrinsic_quad_broadcast:
               case nir_intrinsic_quad_swap_horizontal:
               case nir_intrinsic_quad_swap_vertical:
               case nir_intrinsic_quad_swap_diagonal:
               case nir_intrinsic_quad_swizzle_amd:
               case nir_intrinsic_masked_swizzle_amd:
               case nir_intrinsic_rotate:
               case nir_intrinsic_inclusive_scan:
               case nir_intrinsic_exclusive_scan:
               case nir_intrinsic_reduce:
               case nir_intrinsic_load_ubo:
               case nir_intrinsic_load_ssbo:
               case nir_intrinsic_load_global_amd:
                  type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_intrinsic_load_view_index:
                  type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
                  break;
               default:
                  for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
                       i++) {
                     if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }
               RegClass rc =
                  get_reg_class(ctx, type, intrinsic->def.num_components, intrinsic->def.bit_size);
               regclasses[intrinsic->def.index] = rc;
               break;
            }
            case nir_instr_type_tex: {
               nir_tex_instr* tex = nir_instr_as_tex(instr);
               RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr;

               if (tex->op == nir_texop_texture_samples) {
                  assert(!tex->def.divergent);
               }

               RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size);
               regclasses[tex->def.index] = rc;
               break;
            }
            case nir_instr_type_undef: {
               unsigned num_components = nir_instr_as_undef(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_undef(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_phi: {
               nir_phi_instr* phi = nir_instr_as_phi(instr);
               RegType type = RegType::sgpr;
               unsigned num_components = phi->def.num_components;
               assert((phi->def.bit_size != 1 || num_components == 1) &&
                      "Multiple components not supported on boolean phis.");

               if (phi->def.divergent) {
                  type = RegType::vgpr;
               } else {
                  nir_foreach_phi_src (src, phi) {
                     if (regclasses[src->src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
               }

               RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size);
               if (rc != regclasses[phi->def.index])
                  done = false;
               regclasses[phi->def.index] = rc;
               break;
            }
            default: break;
            }
         }
      }
   }

   ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena;
   ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;

   ctx->cf_info.nir_to_aco = std::move(nir_to_aco);

   /* align and copy constant data */
   while (ctx->program->constant_data.size() % 4u)
      ctx->program->constant_data.push_back(0);
   ctx->constant_data_offset = ctx->program->constant_data.size();
   ctx->program->constant_data.insert(ctx->program->constant_data.end(),
                                      (uint8_t*)shader->constant_data,
                                      (uint8_t*)shader->constant_data + shader->constant_data_size);
}

void
cleanup_context(isel_context* ctx)
{
   _mesa_hash_table_destroy(ctx->range_ht, NULL);
}

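/* Create the isel_context for the given shaders: determine the combined SW stage,
 * initialize the program, set up LDS and scratch sizes and create the top-level block. */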
isel_context
setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                   ac_shader_config* config, const struct aco_compiler_options* options,
                   const struct aco_shader_info* info, const struct ac_shader_args* args,
                   SWStage sw_stage)
{
   for (unsigned i = 0; i < shader_count; i++) {
      switch (shaders[i]->info.stage) {
      case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break;
      case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break;
      case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break;
      case MESA_SHADER_GEOMETRY: sw_stage = sw_stage | SWStage::GS; break;
      case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break;
      case MESA_SHADER_KERNEL:
      case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break;
      case MESA_SHADER_TASK: sw_stage = sw_stage | SWStage::TS; break;
      case MESA_SHADER_MESH: sw_stage = sw_stage | SWStage::MS; break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_CALLABLE:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_ANY_HIT: sw_stage = SWStage::RT; break;
      default: unreachable("Shader stage not implemented");
      }
   }

   init_program(program, Stage{info->hw_stage, sw_stage}, info, options->gfx_level, options->family,
                options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = options;
   ctx.stage = program->stage;

   program->workgroup_size = program->info.workgroup_size;
   assert(program->workgroup_size);

   /* Mesh shading only works on GFX10.3+. */
   ASSERTED bool mesh_shading = ctx.stage.has(SWStage::TS) || ctx.stage.has(SWStage::MS);
   assert(!mesh_shading || ctx.program->gfx_level >= GFX10_3);

   setup_tcs_info(&ctx);

   calc_min_waves(program);

   unsigned scratch_size = 0;
   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader* nir = shaders[i];
      setup_nir(&ctx, nir);
      setup_lds_size(&ctx, nir);
   }

   for (unsigned i = 0; i < shader_count; i++)
      scratch_size = std::max(scratch_size, shaders[i]->scratch_size);

   ctx.program->config->scratch_bytes_per_wave = scratch_size * ctx.program->wave_size;

   unsigned nir_num_blocks = 0;
   for (unsigned i = 0; i < shader_count; i++)
      nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
   ctx.program->blocks.reserve(nir_num_blocks * 2);
   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   return ctx;
}

} // namespace aco