/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_instruction_selection.h"

#include "common/nir/ac_nir.h"
#include "common/sid.h"

#include "nir_control_flow.h"
#include "nir_builder.h"

#include <vector>

namespace aco {

namespace {

/* Check whether the given SSA def is only used by cross-lane instructions. */
bool
only_used_by_cross_lane_instrs(nir_def* ssa, bool follow_phis = true)
{
   nir_foreach_use (src, ssa) {
      switch (nir_src_parent_instr(src)->type) {
      case nir_instr_type_alu: {
         nir_alu_instr* alu = nir_instr_as_alu(nir_src_parent_instr(src));
         if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y)
            return false;
         if (!only_used_by_cross_lane_instrs(&alu->def, follow_phis))
            return false;

         continue;
      }
      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
         if (intrin->intrinsic != nir_intrinsic_read_invocation &&
             intrin->intrinsic != nir_intrinsic_read_first_invocation &&
             intrin->intrinsic != nir_intrinsic_lane_permute_16_amd)
            return false;

         continue;
      }
      case nir_instr_type_phi: {
         /* Don't follow more than one phi; this avoids infinite loops. */
         if (!follow_phis)
            return false;

         nir_phi_instr* phi = nir_instr_as_phi(nir_src_parent_instr(src));
         if (!only_used_by_cross_lane_instrs(&phi->def, false))
            return false;

         continue;
      }
      default: return false;
      }
   }

   return true;
}

/* If one side of a divergent IF ends in a branch and the other doesn't, we
 * might have to emit the contents of the side without the branch at the merge
 * block instead. This is so that we can use any SGPR live-out of the side
 * without the branch without creating a linear phi in the invert or merge block.
 *
 * This also removes any unreachable merge blocks.
 */
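/* Illustrative sketch of the transformation below (pseudocode, not NIR syntax):
 *    if (cond) { A; break; } else { B; }  C;
 * becomes
 *    if (cond) { A; break; }  B; C;
 * so SGPRs written in B remain usable in C without a linear phi.
 */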
bool
sanitize_if(nir_function_impl* impl, nir_if* nif)
{
   nir_block* then_block = nir_if_last_then_block(nif);
   nir_block* else_block = nir_if_last_else_block(nif);
   bool then_jump = nir_block_ends_in_jump(then_block);
   bool else_jump = nir_block_ends_in_jump(else_block);
   if (!then_jump && !else_jump)
      return false;

   /* If the continue-from block is empty, return early as there is nothing to
    * move.
    */
   if (nir_cf_list_is_empty_block(then_jump ? &nif->else_list : &nif->then_list))
      return false;

   /* Even though this if statement has a jump on one side, we may still have
    * phis afterwards.  Single-source phis can be produced by loop unrolling
    * or dead control-flow passes and are perfectly legal.  Run a quick phi
    * removal on the block after the if to clean up any such phis.
    */
   nir_remove_single_src_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));

   /* Finally, move the continue-from branch after the if-statement. */
   nir_block* last_continue_from_blk = then_jump ? else_block : then_block;
   nir_block* first_continue_from_blk =
      then_jump ? nir_if_first_else_block(nif) : nir_if_first_then_block(nif);

   /* We don't need to repair SSA. nir_remove_after_cf_node() replaces any uses with undef. */
   if (then_jump && else_jump)
      nir_remove_after_cf_node(&nif->cf_node);

   nir_cf_list tmp;
   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
                  nir_after_block(last_continue_from_blk));
   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));

   return true;
}

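/* Recursively sanitize all control flow in the given CF list: run sanitize_if() on every if, and
 * give loops whose exit block has no predecessors a dummy break so the exit becomes reachable.
 * Returns true if any control flow was changed.
 */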
bool
sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list)
{
   bool progress = false;
   foreach_list_typed (nir_cf_node, cf_node, node, cf_list) {
      switch (cf_node->type) {
      case nir_cf_node_block: break;
      case nir_cf_node_if: {
         nir_if* nif = nir_cf_node_as_if(cf_node);
         progress |= sanitize_cf_list(impl, &nif->then_list);
         progress |= sanitize_cf_list(impl, &nif->else_list);
         progress |= sanitize_if(impl, nif);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop* loop = nir_cf_node_as_loop(cf_node);
         assert(!nir_loop_has_continue_construct(loop));
         progress |= sanitize_cf_list(impl, &loop->body);

         /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
          * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
          */
         if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
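            /* This is equivalent to appending "if (false) { break; }" at the end of the loop's
             * last block, which gives the loop exit a predecessor without changing behavior.
             */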
            nir_builder b = nir_builder_create(impl);
            b.cursor = nir_after_block_before_jump(nir_loop_last_block(loop));

            nir_def* cond = nir_imm_false(&b);
            /* We don't use block divergence information, so this alone is enough. */
            cond->divergent = false;

            nir_push_if(&b, cond);
            nir_jump(&b, nir_jump_break);
            nir_pop_if(&b, NULL);

            progress = true;
         }
         break;
      }
      case nir_cf_node_function: unreachable("Invalid cf type");
      }
   }

   return progress;
}

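/* Mark an iadd feeding an offset/address calculation as no-unsigned-wrap when NIR's range
 * analysis proves that the unsigned addition cannot overflow.
 */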
void
apply_nuw_to_ssa(isel_context* ctx, nir_def* ssa)
{
   nir_scalar scalar;
   scalar.def = ssa;
   scalar.comp = 0;

   if (!nir_scalar_is_alu(scalar) || nir_scalar_alu_op(scalar) != nir_op_iadd)
      return;

   nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr);

   if (add->no_unsigned_wrap)
      return;

   nir_scalar src0 = nir_scalar_chase_alu_src(scalar, 0);
   nir_scalar src1 = nir_scalar_chase_alu_src(scalar, 1);

   if (nir_scalar_is_const(src0)) {
      nir_scalar tmp = src0;
      src0 = src1;
      src1 = tmp;
   }

   uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config);
   add->no_unsigned_wrap =
      !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config);
}

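/* Walk all memory intrinsics in the shader and try to mark their offset calculations as
 * no-unsigned-wrap, see apply_nuw_to_ssa() above.
 */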
void
apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl)
{
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_load_constant:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            if (!nir_src_is_divergent(&intrin->src[0]))
               apply_nuw_to_ssa(ctx, intrin->src[0].ssa);
            break;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
            if (!nir_src_is_divergent(&intrin->src[1]))
               apply_nuw_to_ssa(ctx, intrin->src[1].ssa);
            break;
         case nir_intrinsic_store_ssbo:
            if (!nir_src_is_divergent(&intrin->src[2]))
               apply_nuw_to_ssa(ctx, intrin->src[2].ssa);
            break;
         case nir_intrinsic_load_scratch: apply_nuw_to_ssa(ctx, intrin->src[0].ssa); break;
         case nir_intrinsic_store_scratch:
         case nir_intrinsic_load_smem_amd: apply_nuw_to_ssa(ctx, intrin->src[1].ssa); break;
         default: break;
         }
      }
   }
}

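/* Translate a NIR value (component count and bit size) into an ACO register class. 1-bit
 * booleans become lane masks (a wave-sized SGPR mask per component); everything else is sized
 * in bytes.
 */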
RegClass
get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize)
{
   if (bitsize == 1)
      return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components);
   else
      return RegClass::get(type, components * bitsize / 8u);
}

void
setup_tcs_info(isel_context* ctx)
{
   ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq;
   ctx->any_tcs_inputs_via_lds = ctx->program->info.vs.any_tcs_inputs_via_lds;
}

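/* Set the LDS allocation size for this shader, in units of the LDS encoding granule. */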
void
setup_lds_size(isel_context* ctx, nir_shader* nir)
{
   /* TCS and GFX9 GS are special cases, already in units of the allocation granule. */
   if (ctx->stage.has(SWStage::TCS))
      ctx->program->config->lds_size = ctx->program->info.tcs.num_lds_blocks;
   else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->options->gfx_level >= GFX9)
      ctx->program->config->lds_size = ctx->program->info.gfx9_gs_ring_lds_size;
   else
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);

   /* Make sure we fit the available LDS space. */
   assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <=
          ctx->program->dev.lds_limit);
}

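/* Final NIR preparation before selection: convert to LCSSA, scalarize phis, and re-index SSA
 * defs so they map 1:1 onto ACO temporary IDs.
 */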
void
setup_nir(isel_context* ctx, nir_shader* nir)
{
   nir_convert_to_lcssa(nir, true, false);
   if (nir_lower_phis_to_scalar(nir, true)) {
      nir_copy_prop(nir);
      nir_opt_dce(nir);
   }

   nir_function_impl* func = nir_shader_get_entrypoint(nir);
   nir_index_ssa_defs(func);
}

/* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent,
 * and so is only safe if the inconsistency it introduces into the divergence analysis won't break
 * code generation. If we unsafely skip uniformization, later instructions (such as SSBO loads,
 * some subgroup intrinsics and certain conversions) can use divergence analysis information which
 * is no longer correct.
 */
bool
skip_uniformize_merge_phi(nir_def* ssa, unsigned depth)
{
   if (depth >= 16)
      return false;

   nir_foreach_use (src, ssa) {
      switch (nir_src_parent_instr(src)->type) {
      case nir_instr_type_alu: {
         nir_alu_instr* alu = nir_instr_as_alu(nir_src_parent_instr(src));
         if (alu->def.divergent)
            break;

         switch (alu->op) {
         case nir_op_f2i16:
         case nir_op_f2u16:
         case nir_op_f2i32:
         case nir_op_f2u32:
         case nir_op_b2i8:
         case nir_op_b2i16:
         case nir_op_b2i32:
         case nir_op_b2b32:
         case nir_op_b2f16:
         case nir_op_b2f32:
         case nir_op_b2f64:
         case nir_op_mov:
            /* These opcodes p_as_uniform or vote_any() the source, so fail immediately. We don't
             * need to do this for non-nir_op_b2 if we know we'll move it back into a VGPR,
             * in which case the p_as_uniform would be eliminated. This would be way too fragile,
             * though.
             */
            return false;
         default:
            if (!skip_uniformize_merge_phi(&alu->def, depth + 1))
               return false;
            break;
         }
         break;
      }
      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
         unsigned src_idx = src - intrin->src;
         /* nir_intrinsic_lane_permute_16_amd is only safe because we don't use divergence analysis
          * for its instruction selection. We use that intrinsic for NGG culling. All others are
          * stores with VGPR sources.
          */
         if (intrin->intrinsic == nir_intrinsic_lane_permute_16_amd ||
             intrin->intrinsic == nir_intrinsic_export_amd ||
             intrin->intrinsic == nir_intrinsic_export_dual_src_blend_amd ||
             (intrin->intrinsic == nir_intrinsic_export_row_amd && src_idx == 0) ||
             (intrin->intrinsic == nir_intrinsic_store_buffer_amd && src_idx == 0) ||
             (intrin->intrinsic == nir_intrinsic_store_ssbo && src_idx == 0) ||
             (intrin->intrinsic == nir_intrinsic_store_global && src_idx == 0) ||
             (intrin->intrinsic == nir_intrinsic_store_scratch && src_idx == 0) ||
             (intrin->intrinsic == nir_intrinsic_store_shared && src_idx == 0))
            break;
         return false;
      }
      case nir_instr_type_phi: {
         nir_phi_instr* phi = nir_instr_as_phi(nir_src_parent_instr(src));
         if (phi->def.divergent || skip_uniformize_merge_phi(&phi->def, depth + 1))
            break;
         return false;
      }
      case nir_instr_type_tex: {
         /* This is either used as a VGPR source or it's a (potentially undef) descriptor. */
         break;
      }
      default: {
         return false;
      }
      }
   }

   return true;
}

} /* end namespace */

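/* Per-shader setup for instruction selection: initialize range analysis, run divergence analysis,
 * sanitize the NIR control flow, and assign an ACO RegClass to every NIR SSA def (iterated until
 * the phi register classes reach a fixed point).
 */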
void
init_context(isel_context* ctx, nir_shader* shader)
{
   nir_function_impl* impl = nir_shader_get_entrypoint(shader);
   ctx->shader = shader;

   /* Init NIR range analysis. */
   ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
   ctx->ub_config.min_subgroup_size = ctx->program->wave_size;
   ctx->ub_config.max_subgroup_size = ctx->program->wave_size;
   ctx->ub_config.max_workgroup_invocations = 2048;
   ctx->ub_config.max_workgroup_count[0] = 4294967295;
   ctx->ub_config.max_workgroup_count[1] = 65535;
   ctx->ub_config.max_workgroup_count[2] = 65535;
   ctx->ub_config.max_workgroup_size[0] = 1024;
   ctx->ub_config.max_workgroup_size[1] = 1024;
   ctx->ub_config.max_workgroup_size[2] = 1024;

   uint32_t options =
      shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
   nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
   shader->info.divergence_analysis_run = true;

   apply_nuw_to_offsets(ctx, impl);
   ac_nir_flag_smem_for_loads(shader, ctx->program->gfx_level, false, true);

   /* sanitize control flow */
   sanitize_cf_list(impl, &impl->body);
   nir_metadata_preserve(impl, nir_metadata_none);

   /* we'll need these for isel */
   nir_metadata_require(impl, nir_metadata_block_index);

   if (ctx->options->dump_preoptir) {
      fprintf(stderr, "NIR shader before instruction selection:\n");
      nir_print_shader(shader, stderr);
   }

   ctx->first_temp_id = ctx->program->peekAllocationId();
   ctx->program->allocateRange(impl->ssa_alloc);
   RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;

   /* TODO: make this recursive to improve compile times */
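   /* Iterate to a fixed point: a phi's register class depends on its sources, which may include
    * phis in blocks that haven't been processed yet.
    */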
   bool done = false;
   while (!done) {
      done = true;
      nir_foreach_block (block, impl) {
         nir_foreach_instr (instr, block) {
            switch (instr->type) {
            case nir_instr_type_alu: {
               nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
               RegType type = RegType::sgpr;

               /* packed 16-bit instructions have to be VGPR */
               if (alu_instr->def.num_components == 2 &&
                   nir_op_infos[alu_instr->op].output_size == 0)
                  type = RegType::vgpr;

               switch (alu_instr->op) {
               case nir_op_f2i16:
               case nir_op_f2u16:
               case nir_op_f2i32:
               case nir_op_f2u32:
               case nir_op_mov:
                  if (alu_instr->def.divergent &&
                      regclasses[alu_instr->src[0].src.ssa->index].type() == RegType::vgpr)
                     type = RegType::vgpr;
                  break;
               case nir_op_fmulz:
               case nir_op_ffmaz:
               case nir_op_f2f64:
               case nir_op_u2f64:
               case nir_op_i2f64:
               case nir_op_pack_unorm_2x16:
               case nir_op_pack_snorm_2x16:
               case nir_op_pack_uint_2x16:
               case nir_op_pack_sint_2x16:
               case nir_op_ldexp:
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_amd:
               case nir_op_msad_4x8:
               case nir_op_mqsad_4x8:
               case nir_op_udot_4x8_uadd:
               case nir_op_sdot_4x8_iadd:
               case nir_op_sudot_4x8_iadd:
               case nir_op_udot_4x8_uadd_sat:
               case nir_op_sdot_4x8_iadd_sat:
               case nir_op_sudot_4x8_iadd_sat:
               case nir_op_udot_2x16_uadd:
               case nir_op_sdot_2x16_iadd:
               case nir_op_udot_2x16_uadd_sat:
               case nir_op_sdot_2x16_iadd_sat:
               case nir_op_alignbyte_amd: type = RegType::vgpr; break;
               case nir_op_fmul:
               case nir_op_ffma:
               case nir_op_fadd:
               case nir_op_fsub:
               case nir_op_fmax:
               case nir_op_fmin:
               case nir_op_fsat:
               case nir_op_fneg:
               case nir_op_fabs:
               case nir_op_fsign:
               case nir_op_i2f16:
               case nir_op_i2f32:
               case nir_op_u2f16:
               case nir_op_u2f32:
               case nir_op_f2f16:
               case nir_op_f2f16_rtz:
               case nir_op_f2f16_rtne:
               case nir_op_f2f32:
               case nir_op_fquantize2f16:
               case nir_op_ffract:
               case nir_op_ffloor:
               case nir_op_fceil:
               case nir_op_ftrunc:
               case nir_op_fround_even:
               case nir_op_frcp:
               case nir_op_frsq:
               case nir_op_fsqrt:
               case nir_op_fexp2:
               case nir_op_flog2:
               case nir_op_fsin_amd:
               case nir_op_fcos_amd:
               case nir_op_pack_half_2x16_rtz_split:
               case nir_op_pack_half_2x16_split:
               case nir_op_unpack_half_2x16_split_x:
               case nir_op_unpack_half_2x16_split_y: {
                  if (ctx->program->gfx_level < GFX11_5 ||
                      alu_instr->src[0].src.ssa->bit_size > 32) {
                     type = RegType::vgpr;
                     break;
                  }
                  FALLTHROUGH;
               }
               default:
                  for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                     if (alu_instr->src[i].src.ssa->bit_size == 1
                            ? nir_src_is_divergent(&alu_instr->src[i].src)
                            : regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }

               RegClass rc =
                  get_reg_class(ctx, type, alu_instr->def.num_components, alu_instr->def.bit_size);
               regclasses[alu_instr->def.index] = rc;
               break;
            }
            case nir_instr_type_load_const: {
               unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_intrinsic: {
               nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
               if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
                  break;
               if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) {
                  regclasses[intrinsic->def.index] =
                     RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 +
                                                     nir_intrinsic_base(intrinsic))
                        .as_linear();
                  break;
               }
               RegType type = RegType::sgpr;
               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_push_constant:
               case nir_intrinsic_load_workgroup_id:
               case nir_intrinsic_load_num_workgroups:
               case nir_intrinsic_load_sbt_base_amd:
               case nir_intrinsic_load_subgroup_id:
               case nir_intrinsic_load_num_subgroups:
               case nir_intrinsic_vote_all:
               case nir_intrinsic_vote_any:
               case nir_intrinsic_read_first_invocation:
               case nir_intrinsic_as_uniform:
               case nir_intrinsic_read_invocation:
               case nir_intrinsic_first_invocation:
               case nir_intrinsic_ballot:
               case nir_intrinsic_ballot_relaxed:
               case nir_intrinsic_bindless_image_samples:
               case nir_intrinsic_load_scalar_arg_amd:
               case nir_intrinsic_load_lds_ngg_scratch_base_amd:
               case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
               case nir_intrinsic_load_smem_amd:
               case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break;
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_per_primitive_input:
               case nir_intrinsic_load_output:
               case nir_intrinsic_load_input_vertex:
               case nir_intrinsic_load_per_vertex_input:
               case nir_intrinsic_load_per_vertex_output:
               case nir_intrinsic_load_interpolated_input:
               case nir_intrinsic_write_invocation_amd:
               case nir_intrinsic_mbcnt_amd:
               case nir_intrinsic_lane_permute_16_amd:
               case nir_intrinsic_dpp16_shift_amd:
               case nir_intrinsic_ssbo_atomic:
               case nir_intrinsic_ssbo_atomic_swap:
               case nir_intrinsic_global_atomic_amd:
               case nir_intrinsic_global_atomic_swap_amd:
               case nir_intrinsic_bindless_image_atomic:
               case nir_intrinsic_bindless_image_atomic_swap:
               case nir_intrinsic_bindless_image_size:
               case nir_intrinsic_shared_atomic:
               case nir_intrinsic_shared_atomic_swap:
               case nir_intrinsic_load_scratch:
               case nir_intrinsic_load_typed_buffer_amd:
               case nir_intrinsic_load_buffer_amd:
               case nir_intrinsic_load_initial_edgeflags_amd:
               case nir_intrinsic_gds_atomic_add_amd:
               case nir_intrinsic_bvh64_intersect_ray_amd:
               case nir_intrinsic_load_vector_arg_amd:
               case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
               case nir_intrinsic_cmat_muladd_amd:
               case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break;
               case nir_intrinsic_load_shared:
               case nir_intrinsic_load_shared2_amd:
                  /* When the result of these loads is only used by cross-lane instructions,
                   * it is beneficial to use a VGPR destination. This allows the s_waitcnt to
                   * be placed further down, which decreases latency.
                   */
                  if (only_used_by_cross_lane_instrs(&intrinsic->def)) {
                     type = RegType::vgpr;
                     break;
                  }
                  FALLTHROUGH;
               case nir_intrinsic_shuffle:
               case nir_intrinsic_quad_broadcast:
               case nir_intrinsic_quad_swap_horizontal:
               case nir_intrinsic_quad_swap_vertical:
               case nir_intrinsic_quad_swap_diagonal:
               case nir_intrinsic_quad_swizzle_amd:
               case nir_intrinsic_masked_swizzle_amd:
               case nir_intrinsic_rotate:
               case nir_intrinsic_inclusive_scan:
               case nir_intrinsic_exclusive_scan:
               case nir_intrinsic_reduce:
               case nir_intrinsic_load_ubo:
               case nir_intrinsic_load_ssbo:
               case nir_intrinsic_load_global_amd:
                  type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_intrinsic_ddx:
               case nir_intrinsic_ddy:
               case nir_intrinsic_ddx_fine:
               case nir_intrinsic_ddy_fine:
               case nir_intrinsic_ddx_coarse:
               case nir_intrinsic_ddy_coarse: type = RegType::vgpr; break;
               default:
                  for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
                       i++) {
                     if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }
               RegClass rc =
                  get_reg_class(ctx, type, intrinsic->def.num_components, intrinsic->def.bit_size);
               regclasses[intrinsic->def.index] = rc;
               break;
            }
            case nir_instr_type_tex: {
               nir_tex_instr* tex = nir_instr_as_tex(instr);
               RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr;

               if (tex->op == nir_texop_texture_samples) {
                  assert(!tex->def.divergent);
               }

               RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size);
               regclasses[tex->def.index] = rc;
               break;
            }
            case nir_instr_type_undef: {
               unsigned num_components = nir_instr_as_undef(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_undef(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_phi: {
               nir_phi_instr* phi = nir_instr_as_phi(instr);
               RegType type = RegType::sgpr;
               unsigned num_components = phi->def.num_components;
               assert((phi->def.bit_size != 1 || num_components == 1) &&
                      "Multiple components not supported on boolean phis.");

               if (phi->def.divergent) {
                  type = RegType::vgpr;
               } else {
                  bool vgpr_src = false;
                  nir_foreach_phi_src (src, phi)
                     vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr;

                  if (vgpr_src) {
                     type = RegType::vgpr;

                     /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */
                     bool divergent_merge = false;
                     if (nir_cf_node_prev(&block->cf_node) &&
                         nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) {
                        nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node));
                        divergent_merge = nir_src_is_divergent(&nif->condition);
                     }

                     /* In case of uniform phis after divergent merges, ensure that the dst is an
                      * SGPR and does not contain undefined values for some invocations.
                      */
                     if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0))
                        type = RegType::sgpr;
                  }
               }

               RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size);
               if (rc != regclasses[phi->def.index])
                  done = false;
               regclasses[phi->def.index] = rc;
               break;
            }
            default: break;
            }
         }
      }
   }

   ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena;
   ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;

   /* align and copy constant data */
   while (ctx->program->constant_data.size() % 4u)
      ctx->program->constant_data.push_back(0);
   ctx->constant_data_offset = ctx->program->constant_data.size();
   ctx->program->constant_data.insert(ctx->program->constant_data.end(),
                                      (uint8_t*)shader->constant_data,
                                      (uint8_t*)shader->constant_data + shader->constant_data_size);

   BITSET_CLEAR_RANGE(ctx->output_args, 0, BITSET_SIZE(ctx->output_args));
}

void
cleanup_context(isel_context* ctx)
{
   _mesa_hash_table_destroy(ctx->range_ht, NULL);
}

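/* Build the isel_context for a compile: derive the combined SW stage from the input shaders,
 * initialize the Program, run per-shader NIR setup, and compute LDS/scratch sizes and the
 * initial top-level block.
 */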
isel_context
setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                   ac_shader_config* config, const struct aco_compiler_options* options,
                   const struct aco_shader_info* info, const struct ac_shader_args* args,
                   SWStage sw_stage)
{
   for (unsigned i = 0; i < shader_count; i++) {
      switch (shaders[i]->info.stage) {
      case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break;
      case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break;
      case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break;
      case MESA_SHADER_GEOMETRY: sw_stage = sw_stage | SWStage::GS; break;
      case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break;
      case MESA_SHADER_KERNEL:
      case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break;
      case MESA_SHADER_TASK: sw_stage = sw_stage | SWStage::TS; break;
      case MESA_SHADER_MESH: sw_stage = sw_stage | SWStage::MS; break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_CALLABLE:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_ANY_HIT: sw_stage = SWStage::RT; break;
      default: unreachable("Shader stage not implemented");
      }
   }

   init_program(program, Stage{info->hw_stage, sw_stage}, info, options->gfx_level, options->family,
                options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = options;
   ctx.stage = program->stage;

   program->workgroup_size = program->info.workgroup_size;
   assert(program->workgroup_size);

   /* Mesh shading only works on GFX10.3+. */
   ASSERTED bool mesh_shading = ctx.stage.has(SWStage::TS) || ctx.stage.has(SWStage::MS);
   assert(!mesh_shading || ctx.program->gfx_level >= GFX10_3);

   setup_tcs_info(&ctx);

   calc_min_waves(program);

   unsigned scratch_size = 0;
   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader* nir = shaders[i];
      setup_nir(&ctx, nir);
      setup_lds_size(&ctx, nir);
   }

   for (unsigned i = 0; i < shader_count; i++)
      scratch_size = std::max(scratch_size, shaders[i]->scratch_size);

   ctx.program->config->scratch_bytes_per_wave = scratch_size * ctx.program->wave_size;

   unsigned nir_num_blocks = 0;
   for (unsigned i = 0; i < shader_count; i++)
      nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
   ctx.program->blocks.reserve(nir_num_blocks * 2);
   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   return ctx;
}

} // namespace aco