1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "nir.h"
26
/* This pass computes, for each SSA definition, whether it is uniform,
 * i.e. whether the value is the same for all invocations
 * of the group.
 *
 * This divergence analysis pass expects the shader to be in LCSSA form.
 *
 * This algorithm implements "The Simple Divergence Analysis" from
 * Diogo Sampaio, Rafael De Souza, Sylvain Collange, Fernando Magno Quintão Pereira.
 * Divergence Analysis. ACM Transactions on Programming Languages and Systems (TOPLAS),
 * ACM, 2013, 35 (4), pp.13:1-13:36. <10.1145/2523815>. <hal-00909072v2>
 */
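
/* Illustrative example (not taken from the pass itself): in a compute shader,
 * assuming nir_divergence_multiple_workgroup_per_compute_subgroup is not set,
 * the analysis marks values as follows:
 *
 *    ssa_1 = load_workgroup_id          ; uniform (same for the whole subgroup)
 *    ssa_2 = load_local_invocation_id   ; divergent (differs per invocation)
 *    ssa_3 = iadd ssa_1.x, ssa_2.x      ; divergent (an ALU result is divergent
 *                                       ;  as soon as any source is divergent)
 */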
38
struct divergence_state {
   const gl_shader_stage stage;
   nir_shader *shader;
   nir_divergence_options options;
   nir_loop *loop;

   /* Whether the caller requested vertex divergence (meaning between vertices
    * of the same primitive) instead of subgroup invocation divergence
    * (between invocations of the same subgroup). For example, patch input
    * loads are always convergent, while subgroup intrinsics are divergent
    * because vertices of the same primitive can be processed by different
    * subgroups.
    */
   bool vertex_divergence;

   /** current control flow state (see the example after this struct) */
   /* True if some loop-active invocations might take a different control-flow path.
    * A divergent break does not cause subsequent control-flow to be considered
    * divergent because those invocations are no longer active in the loop.
    * For a divergent if, both sides are considered divergent flow because
    * the other side is still loop-active. */
   bool divergent_loop_cf;
   /* True if a divergent continue happened since the loop header */
   bool divergent_loop_continue;
   /* True if a divergent break happened since the loop header */
   bool divergent_loop_break;

   /* True if we visit the block for the first time */
   bool first_visit;
   /* True if we visit a block that is dominated by a loop with a divergent break */
   bool consider_loop_invariance;
};
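
/* Illustrative sketch (assumed IR, not taken from the pass) of how the
 * loop-related state above evolves:
 *
 *    loop {
 *       if (divergent_cond) break;      ; sets divergent_loop_break, but CF after
 *                                       ; this if is NOT considered divergent:
 *                                       ; the invocations that broke are no
 *                                       ; longer loop-active
 *       if (divergent_cond2) continue;  ; sets divergent_loop_continue and makes
 *                                       ; all following loop CF divergent, since
 *                                       ; some active invocations skip it
 *    }
 */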
71
static bool
visit_cf_list(struct exec_list *list, struct divergence_state *state);

bool
nir_src_is_divergent(nir_src *src)
{
78 if (src->ssa->divergent)
79 return true;
80
81 nir_cf_node *use_node = nir_src_get_block(src)->cf_node.parent;
82 nir_cf_node *def_node = src->ssa->parent_instr->block->cf_node.parent;
83
84 /* Short-cut the common case. */
85 if (def_node == use_node)
86 return false;
87
88 /* If the source was computed in a divergent loop, and is not
89 * loop-invariant, then it must also be considered divergent.
90 */
91 bool loop_invariant = src->ssa->loop_invariant;
92 while (def_node) {
93 if (def_node->type == nir_cf_node_loop) {
94 /* Check whether the use is inside this loop. */
95 for (nir_cf_node *node = use_node; node != NULL; node = node->parent) {
96 if (def_node == node)
97 return false;
98 }
99
100 /* Because the use is outside of this loop, it is divergent. */
101 if (nir_cf_node_as_loop(def_node)->divergent_break && !loop_invariant)
102 return true;
103
104 /* For outer loops, consider this variable not loop invariant. */
105 loop_invariant = false;
106 }
107
108 def_node = def_node->parent;
109 }
110
111 return false;
112 }
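
/* Illustrative example (assumed IR): with a divergent break, a value computed
 * inside the loop is divergent when used outside of it, unless it is
 * loop-invariant:
 *
 *    loop {
 *       ssa_5 = phi ssa_0, ssa_6         ; uniform inside the loop
 *       ssa_6 = iadd ssa_5, 1
 *       if (divergent_cond) break;
 *    }
 *    ssa_9 = iadd ssa_6, ssa_8           ; ssa_6 is divergent here: invocations
 *                                        ; leave the loop in different iterations
 *                                        ; and therefore observe different values
 *
 * A loop-invariant value (e.g. one computed only from values defined before
 * the loop) stays uniform even across the divergent break.
 */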
113
static inline bool
src_divergent(nir_src src, struct divergence_state *state)
{
117 if (!state->consider_loop_invariance)
118 return src.ssa->divergent;
119
120 return nir_src_is_divergent(&src);
121 }
122
static inline bool
src_invariant(nir_src *src, void *loop)
{
126 nir_block *first_block = nir_loop_first_block(loop);
127
128 /* Invariant if SSA is defined before the current loop. */
129 if (src->ssa->parent_instr->block->index < first_block->index)
130 return true;
131
132 if (!src->ssa->loop_invariant)
133 return false;
134
135 /* The value might be defined in a nested loop. */
136 nir_cf_node *cf_node = src->ssa->parent_instr->block->cf_node.parent;
137 while (cf_node->type != nir_cf_node_loop)
138 cf_node = cf_node->parent;
139
140 return nir_cf_node_as_loop(cf_node) == loop;
141 }
142
static bool
visit_alu(nir_alu_instr *instr, struct divergence_state *state)
{
146 if (instr->def.divergent)
147 return false;
148
149 unsigned num_src = nir_op_infos[instr->op].num_inputs;
150
151 for (unsigned i = 0; i < num_src; i++) {
152 if (src_divergent(instr->src[i].src, state)) {
153 instr->def.divergent = true;
154 return true;
155 }
156 }
157
158 return false;
159 }
160
161
/* On some hardware, a uniform load can "tear" when there is a pending
 * store/atomic from another wave: different invocations may observe the
 * pre-store value and the post-store value even though they load from the
 * same location. This means we have to assume the result is not uniform
 * unless the access is readonly.
 *
 * TODO The Vulkan memory model is much more strict here and requires an
 * atomic or volatile load for the data race to be valid, which could allow us
 * to do better if it's in use, however we currently don't have that
 * information plumbed through.
 */
static bool
load_may_tear(struct divergence_state *state, nir_intrinsic_instr *instr)
{
175 return (state->options & nir_divergence_uniform_load_tears) &&
176 !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE);
177 }
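
/* Illustrative sketch (assumed situation): two invocations of the same wave
 * load one SSBO location while another wave stores to it. With
 * nir_divergence_uniform_load_tears set, the result must be treated as
 * divergent even though both invocations use the same address:
 *
 *    wave A:  store_ssbo ..., offset=0x40
 *    wave B:  invocation 0: load_ssbo offset=0x40   ; may see the old value
 *             invocation 1: load_ssbo offset=0x40   ; may see the new value
 *
 * Only ACCESS_NON_WRITEABLE loads are exempt, since no store can race with them.
 */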
178
static bool
visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
{
182 if (!nir_intrinsic_infos[instr->intrinsic].has_dest)
183 return false;
184
185 if (instr->def.divergent)
186 return false;
187
188 nir_divergence_options options = state->options;
189 gl_shader_stage stage = state->stage;
190 bool is_divergent = false;
191 switch (instr->intrinsic) {
192 case nir_intrinsic_shader_clock:
193 case nir_intrinsic_ballot:
194 case nir_intrinsic_ballot_relaxed:
195 case nir_intrinsic_as_uniform:
196 case nir_intrinsic_read_invocation:
197 case nir_intrinsic_read_first_invocation:
198 case nir_intrinsic_read_invocation_cond_ir3:
199 case nir_intrinsic_read_getlast_ir3:
200 case nir_intrinsic_vote_any:
201 case nir_intrinsic_vote_all:
202 case nir_intrinsic_vote_feq:
203 case nir_intrinsic_vote_ieq:
204 case nir_intrinsic_first_invocation:
205 case nir_intrinsic_last_invocation:
206 case nir_intrinsic_load_subgroup_id:
207 case nir_intrinsic_shared_append_amd:
208 case nir_intrinsic_shared_consume_amd:
209 /* VS/TES/GS invocations of the same primitive can be in different
210 * subgroups, so subgroup ops are always divergent between vertices of
211 * the same primitive.
212 */
213 is_divergent = state->vertex_divergence;
214 break;
215
216 /* Intrinsics which are always uniform */
217 case nir_intrinsic_load_preamble:
218 case nir_intrinsic_load_push_constant:
219 case nir_intrinsic_load_push_constant_zink:
220 case nir_intrinsic_load_work_dim:
221 case nir_intrinsic_load_num_workgroups:
222 case nir_intrinsic_load_workgroup_size:
223 case nir_intrinsic_load_num_subgroups:
224 case nir_intrinsic_load_ray_launch_size:
225 case nir_intrinsic_load_sbt_base_amd:
226 case nir_intrinsic_load_subgroup_size:
227 case nir_intrinsic_load_subgroup_id_shift_ir3:
228 case nir_intrinsic_load_base_instance:
229 case nir_intrinsic_load_base_vertex:
230 case nir_intrinsic_load_first_vertex:
231 case nir_intrinsic_load_draw_id:
232 case nir_intrinsic_load_is_indexed_draw:
233 case nir_intrinsic_load_viewport_scale:
234 case nir_intrinsic_load_user_clip_plane:
235 case nir_intrinsic_load_viewport_x_scale:
236 case nir_intrinsic_load_viewport_y_scale:
237 case nir_intrinsic_load_viewport_z_scale:
238 case nir_intrinsic_load_viewport_offset:
239 case nir_intrinsic_load_viewport_x_offset:
240 case nir_intrinsic_load_viewport_y_offset:
241 case nir_intrinsic_load_viewport_z_offset:
242 case nir_intrinsic_load_cull_triangle_viewport_xy_scale_and_offset_amd:
243 case nir_intrinsic_load_cull_line_viewport_xy_scale_and_offset_amd:
244 case nir_intrinsic_load_blend_const_color_a_float:
245 case nir_intrinsic_load_blend_const_color_b_float:
246 case nir_intrinsic_load_blend_const_color_g_float:
247 case nir_intrinsic_load_blend_const_color_r_float:
248 case nir_intrinsic_load_blend_const_color_rgba:
249 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
250 case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
251 case nir_intrinsic_load_line_width:
252 case nir_intrinsic_load_aa_line_width:
253 case nir_intrinsic_load_xfb_address:
254 case nir_intrinsic_load_num_vertices:
255 case nir_intrinsic_load_fb_layers_v3d:
256 case nir_intrinsic_load_fep_w_v3d:
257 case nir_intrinsic_load_tcs_num_patches_amd:
258 case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
259 case nir_intrinsic_load_tcs_primitive_mode_amd:
260 case nir_intrinsic_load_patch_vertices_in:
261 case nir_intrinsic_load_ring_tess_factors_amd:
262 case nir_intrinsic_load_ring_tess_offchip_amd:
263 case nir_intrinsic_load_ring_tess_factors_offset_amd:
264 case nir_intrinsic_load_ring_tess_offchip_offset_amd:
265 case nir_intrinsic_load_ring_mesh_scratch_amd:
266 case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
267 case nir_intrinsic_load_ring_esgs_amd:
268 case nir_intrinsic_load_ring_es2gs_offset_amd:
269 case nir_intrinsic_load_ring_task_draw_amd:
270 case nir_intrinsic_load_ring_task_payload_amd:
271 case nir_intrinsic_load_sample_positions_amd:
272 case nir_intrinsic_load_rasterization_samples_amd:
273 case nir_intrinsic_load_ring_gsvs_amd:
274 case nir_intrinsic_load_ring_gs2vs_offset_amd:
275 case nir_intrinsic_load_streamout_config_amd:
276 case nir_intrinsic_load_streamout_write_index_amd:
277 case nir_intrinsic_load_streamout_offset_amd:
278 case nir_intrinsic_load_task_ring_entry_amd:
279 case nir_intrinsic_load_ring_attr_amd:
280 case nir_intrinsic_load_ring_attr_offset_amd:
281 case nir_intrinsic_load_provoking_vtx_amd:
282 case nir_intrinsic_load_sample_positions_pan:
283 case nir_intrinsic_load_workgroup_num_input_vertices_amd:
284 case nir_intrinsic_load_workgroup_num_input_primitives_amd:
285 case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
286 case nir_intrinsic_load_prim_gen_query_enabled_amd:
287 case nir_intrinsic_load_prim_xfb_query_enabled_amd:
288 case nir_intrinsic_load_merged_wave_info_amd:
289 case nir_intrinsic_load_clamp_vertex_color_amd:
290 case nir_intrinsic_load_cull_front_face_enabled_amd:
291 case nir_intrinsic_load_cull_back_face_enabled_amd:
292 case nir_intrinsic_load_cull_ccw_amd:
293 case nir_intrinsic_load_cull_small_triangles_enabled_amd:
294 case nir_intrinsic_load_cull_small_lines_enabled_amd:
295 case nir_intrinsic_load_cull_any_enabled_amd:
296 case nir_intrinsic_load_cull_small_triangle_precision_amd:
297 case nir_intrinsic_load_cull_small_line_precision_amd:
298 case nir_intrinsic_load_user_data_amd:
299 case nir_intrinsic_load_force_vrs_rates_amd:
300 case nir_intrinsic_load_tess_level_inner_default:
301 case nir_intrinsic_load_tess_level_outer_default:
302 case nir_intrinsic_load_scalar_arg_amd:
303 case nir_intrinsic_load_smem_amd:
304 case nir_intrinsic_load_resume_shader_address_amd:
305 case nir_intrinsic_load_reloc_const_intel:
306 case nir_intrinsic_load_btd_global_arg_addr_intel:
307 case nir_intrinsic_load_btd_local_arg_addr_intel:
308 case nir_intrinsic_load_inline_data_intel:
309 case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
310 case nir_intrinsic_load_lshs_vertex_stride_amd:
311 case nir_intrinsic_load_esgs_vertex_stride_amd:
312 case nir_intrinsic_load_hs_out_patch_data_offset_amd:
313 case nir_intrinsic_load_clip_half_line_width_amd:
314 case nir_intrinsic_load_num_vertices_per_primitive_amd:
315 case nir_intrinsic_load_streamout_buffer_amd:
316 case nir_intrinsic_load_ordered_id_amd:
317 case nir_intrinsic_load_gs_wave_id_amd:
318 case nir_intrinsic_load_provoking_vtx_in_prim_amd:
319 case nir_intrinsic_load_lds_ngg_scratch_base_amd:
320 case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
321 case nir_intrinsic_load_btd_shader_type_intel:
322 case nir_intrinsic_load_base_global_invocation_id:
323 case nir_intrinsic_load_base_workgroup_id:
324 case nir_intrinsic_load_alpha_reference_amd:
325 case nir_intrinsic_load_ubo_uniform_block_intel:
326 case nir_intrinsic_load_ssbo_uniform_block_intel:
327 case nir_intrinsic_load_shared_uniform_block_intel:
328 case nir_intrinsic_load_barycentric_optimize_amd:
329 case nir_intrinsic_load_poly_line_smooth_enabled:
330 case nir_intrinsic_load_rasterization_primitive_amd:
331 case nir_intrinsic_unit_test_uniform_amd:
332 case nir_intrinsic_load_global_constant_uniform_block_intel:
333 case nir_intrinsic_load_debug_log_desc_amd:
334 case nir_intrinsic_load_xfb_state_address_gfx12_amd:
335 case nir_intrinsic_cmat_length:
336 case nir_intrinsic_load_vs_primitive_stride_ir3:
337 case nir_intrinsic_load_vs_vertex_stride_ir3:
338 case nir_intrinsic_load_hs_patch_stride_ir3:
339 case nir_intrinsic_load_tess_factor_base_ir3:
340 case nir_intrinsic_load_tess_param_base_ir3:
341 case nir_intrinsic_load_primitive_location_ir3:
342 case nir_intrinsic_preamble_start_ir3:
343 case nir_intrinsic_optimization_barrier_sgpr_amd:
344 case nir_intrinsic_load_printf_buffer_address:
345 case nir_intrinsic_load_printf_buffer_size:
346 case nir_intrinsic_load_printf_base_identifier:
347 case nir_intrinsic_load_core_id_agx:
348 case nir_intrinsic_load_samples_log2_agx:
349 case nir_intrinsic_load_active_subgroup_count_agx:
350 case nir_intrinsic_load_constant_base_ptr:
351 is_divergent = false;
352 break;
353
354 /* This is divergent because it specifically loads sequential values into
355 * successive SIMD lanes.
356 */
357 case nir_intrinsic_load_global_block_intel:
358 is_divergent = true;
359 break;
360
361 case nir_intrinsic_decl_reg:
362 is_divergent = nir_intrinsic_divergent(instr);
363 break;
364
365 /* Intrinsics with divergence depending on shader stage and hardware */
366 case nir_intrinsic_load_shader_record_ptr:
367 is_divergent = !(options & nir_divergence_shader_record_ptr_uniform);
368 break;
369 case nir_intrinsic_load_frag_shading_rate:
370 is_divergent = !(options & nir_divergence_single_frag_shading_rate_per_subgroup);
371 break;
372 case nir_intrinsic_load_input:
373 case nir_intrinsic_load_per_primitive_input:
374 is_divergent = src_divergent(instr->src[0], state);
375
376 if (stage == MESA_SHADER_FRAGMENT) {
377 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
378 } else if (stage == MESA_SHADER_TESS_EVAL) {
379 /* Patch input loads are uniform between vertices of the same primitive. */
380 if (state->vertex_divergence)
381 is_divergent = false;
382 else
383 is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
384 } else {
385 is_divergent = true;
386 }
387 break;
388 case nir_intrinsic_load_attribute_pan:
389 assert(stage == MESA_SHADER_VERTEX);
390 is_divergent = src_divergent(instr->src[0], state) ||
391 src_divergent(instr->src[1], state) ||
392 src_divergent(instr->src[2], state);
393 break;
   case nir_intrinsic_load_per_vertex_input:
      is_divergent = src_divergent(instr->src[0], state) ||
                     src_divergent(instr->src[1], state);
      if (stage == MESA_SHADER_TESS_CTRL)
         is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
      else if (stage == MESA_SHADER_TESS_EVAL)
         is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
      else
         is_divergent = true;
      break;
404 case nir_intrinsic_load_input_vertex:
405 is_divergent = src_divergent(instr->src[1], state);
406 assert(stage == MESA_SHADER_FRAGMENT);
407 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
408 break;
409 case nir_intrinsic_load_output:
410 is_divergent = src_divergent(instr->src[0], state);
411 switch (stage) {
412 case MESA_SHADER_TESS_CTRL:
413 is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
414 break;
415 case MESA_SHADER_FRAGMENT:
416 is_divergent = true;
417 break;
418 case MESA_SHADER_TASK:
419 case MESA_SHADER_MESH:
420 /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs).
421 * Divergent if src[0] is, so nothing else to do.
422 */
423 break;
424 default:
425 unreachable("Invalid stage for load_output");
426 }
427 break;
428 case nir_intrinsic_load_per_view_output:
429 is_divergent = instr->src[0].ssa->divergent ||
430 instr->src[1].ssa->divergent ||
431 (stage == MESA_SHADER_TESS_CTRL &&
432 !(options & nir_divergence_single_patch_per_tcs_subgroup));
433 break;
434 case nir_intrinsic_load_per_vertex_output:
435 /* TCS and NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
436 assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_MESH);
437 is_divergent = src_divergent(instr->src[0], state) ||
438 src_divergent(instr->src[1], state) ||
439 (stage == MESA_SHADER_TESS_CTRL &&
440 !(options & nir_divergence_single_patch_per_tcs_subgroup));
441 break;
442 case nir_intrinsic_load_per_primitive_output:
443 /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
444 assert(stage == MESA_SHADER_MESH);
445 is_divergent = src_divergent(instr->src[0], state) ||
446 src_divergent(instr->src[1], state);
447 break;
448 case nir_intrinsic_load_layer_id:
449 case nir_intrinsic_load_front_face:
450 case nir_intrinsic_load_front_face_fsign:
451 case nir_intrinsic_load_back_face_agx:
452 assert(stage == MESA_SHADER_FRAGMENT || state->shader->info.internal);
453 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
454 break;
455 case nir_intrinsic_load_view_index:
456 assert(stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL);
457 if (options & nir_divergence_view_index_uniform)
458 is_divergent = false;
459 else if (stage == MESA_SHADER_FRAGMENT)
460 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
461 else
462 is_divergent = true;
463 break;
464 case nir_intrinsic_load_fs_input_interp_deltas:
465 assert(stage == MESA_SHADER_FRAGMENT);
466 is_divergent = src_divergent(instr->src[0], state);
467 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
468 break;
469 case nir_intrinsic_load_instance_id:
470 is_divergent = !state->vertex_divergence;
471 break;
472 case nir_intrinsic_load_primitive_id:
473 if (stage == MESA_SHADER_FRAGMENT)
474 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
475 else if (stage == MESA_SHADER_TESS_CTRL)
476 is_divergent = !state->vertex_divergence &&
477 !(options & nir_divergence_single_patch_per_tcs_subgroup);
478 else if (stage == MESA_SHADER_TESS_EVAL)
479 is_divergent = !state->vertex_divergence &&
480 !(options & nir_divergence_single_patch_per_tes_subgroup);
481 else if (stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_VERTEX)
482 is_divergent = !state->vertex_divergence;
483 else if (stage == MESA_SHADER_ANY_HIT ||
484 stage == MESA_SHADER_CLOSEST_HIT ||
485 stage == MESA_SHADER_INTERSECTION)
486 is_divergent = true;
487 else
488 unreachable("Invalid stage for load_primitive_id");
489 break;
490 case nir_intrinsic_load_tess_level_inner:
491 case nir_intrinsic_load_tess_level_outer:
492 if (stage == MESA_SHADER_TESS_CTRL)
493 is_divergent = !(options & nir_divergence_single_patch_per_tcs_subgroup);
494 else if (stage == MESA_SHADER_TESS_EVAL)
495 is_divergent = !(options & nir_divergence_single_patch_per_tes_subgroup);
      else
         unreachable("Invalid stage for load_tess_level_*");
498 break;
499
500 case nir_intrinsic_load_workgroup_index:
501 case nir_intrinsic_load_workgroup_id:
502 assert(gl_shader_stage_uses_workgroup(stage) || stage == MESA_SHADER_TESS_CTRL);
503 if (stage == MESA_SHADER_COMPUTE)
504 is_divergent |= (options & nir_divergence_multiple_workgroup_per_compute_subgroup);
505 break;
506
507 /* Clustered reductions are uniform if cluster_size == subgroup_size or
508 * the source is uniform and the operation is invariant.
509 * Inclusive scans are uniform if
510 * the source is uniform and the operation is invariant
511 */
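
   /* Illustrative example (assumed IR, and assuming vertex_divergence is not
    * requested): with a uniform source, an invariant operation keeps clustered
    * reductions and inclusive scans uniform:
    *
    *    ssa_2 = reduce ssa_1 (op=iadd, cluster_size=4)  ; divergent: iadd is not
    *                                                    ; an invariant operation
    *    ssa_3 = reduce ssa_1 (op=umax, cluster_size=4)  ; uniform if ssa_1 is
    *    ssa_4 = inclusive_scan ssa_1 (op=imin)          ; uniform if ssa_1 is
    */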
512 case nir_intrinsic_reduce:
513 if (nir_intrinsic_cluster_size(instr) == 0) {
514 /* Cluster size of 0 means the subgroup size.
515 * This is uniform within a subgroup, but divergent between
516 * vertices of the same primitive because they may be in
517 * different subgroups.
518 */
519 is_divergent = state->vertex_divergence;
520 break;
521 }
522 FALLTHROUGH;
523 case nir_intrinsic_inclusive_scan:
524 case nir_intrinsic_inclusive_scan_clusters_ir3: {
525 nir_op op = nir_intrinsic_reduction_op(instr);
526 is_divergent = src_divergent(instr->src[0], state) ||
527 state->vertex_divergence;
528 if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
529 op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
530 op != nir_op_iand && op != nir_op_ior)
531 is_divergent = true;
532 break;
533 }
534
535 case nir_intrinsic_reduce_clusters_ir3:
536 /* This reduces the last invocations in all 8-wide clusters. It should
537 * behave the same as reduce with cluster_size == subgroup_size.
538 */
539 is_divergent = state->vertex_divergence;
540 break;
541
542 case nir_intrinsic_load_ubo:
543 case nir_intrinsic_load_ubo_vec4:
544 case nir_intrinsic_ldc_nv:
545 case nir_intrinsic_ldcx_nv:
546 is_divergent = (src_divergent(instr->src[0], state) &&
547 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
548 src_divergent(instr->src[1], state);
549 break;
550
551 case nir_intrinsic_load_ssbo:
552 case nir_intrinsic_load_ssbo_ir3:
553 is_divergent = (src_divergent(instr->src[0], state) &&
554 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
555 src_divergent(instr->src[1], state) ||
556 load_may_tear(state, instr);
557 break;
558
559 case nir_intrinsic_load_shared:
560 case nir_intrinsic_load_shared_ir3:
561 is_divergent = src_divergent(instr->src[0], state) ||
562 (options & nir_divergence_uniform_load_tears);
563 break;
564
565 case nir_intrinsic_load_global:
566 case nir_intrinsic_load_global_2x32:
567 case nir_intrinsic_load_global_ir3:
568 case nir_intrinsic_load_deref: {
569 if (load_may_tear(state, instr)) {
570 is_divergent = true;
571 break;
572 }
573
574 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
575 for (unsigned i = 0; i < num_srcs; i++) {
576 if (src_divergent(instr->src[i], state)) {
577 is_divergent = true;
578 break;
579 }
580 }
581 break;
582 }
583
584 case nir_intrinsic_get_ssbo_size:
585 case nir_intrinsic_deref_buffer_array_length:
586 is_divergent = src_divergent(instr->src[0], state) &&
587 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
588 break;
589
590 case nir_intrinsic_image_samples_identical:
591 case nir_intrinsic_image_deref_samples_identical:
592 case nir_intrinsic_bindless_image_samples_identical:
593 case nir_intrinsic_image_fragment_mask_load_amd:
594 case nir_intrinsic_image_deref_fragment_mask_load_amd:
595 case nir_intrinsic_bindless_image_fragment_mask_load_amd:
596 is_divergent = (src_divergent(instr->src[0], state) &&
597 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
598 src_divergent(instr->src[1], state) ||
599 load_may_tear(state, instr);
600 break;
601
602 case nir_intrinsic_image_texel_address:
603 case nir_intrinsic_image_deref_texel_address:
604 case nir_intrinsic_bindless_image_texel_address:
605 is_divergent = (src_divergent(instr->src[0], state) &&
606 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
607 src_divergent(instr->src[1], state) ||
608 src_divergent(instr->src[2], state);
609 break;
610
611 case nir_intrinsic_image_load:
612 case nir_intrinsic_image_deref_load:
613 case nir_intrinsic_bindless_image_load:
614 case nir_intrinsic_image_sparse_load:
615 case nir_intrinsic_image_deref_sparse_load:
616 case nir_intrinsic_bindless_image_sparse_load:
617 is_divergent = (src_divergent(instr->src[0], state) &&
618 (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
619 src_divergent(instr->src[1], state) ||
620 src_divergent(instr->src[2], state) ||
621 src_divergent(instr->src[3], state) ||
622 load_may_tear(state, instr);
623 break;
624
625 case nir_intrinsic_optimization_barrier_vgpr_amd:
626 is_divergent = src_divergent(instr->src[0], state);
627 break;
628
629 /* Intrinsics with divergence depending on sources */
630 case nir_intrinsic_convert_alu_types:
631 case nir_intrinsic_ddx:
632 case nir_intrinsic_ddx_fine:
633 case nir_intrinsic_ddx_coarse:
634 case nir_intrinsic_ddy:
635 case nir_intrinsic_ddy_fine:
636 case nir_intrinsic_ddy_coarse:
637 case nir_intrinsic_ballot_bitfield_extract:
638 case nir_intrinsic_ballot_find_lsb:
639 case nir_intrinsic_ballot_find_msb:
640 case nir_intrinsic_ballot_bit_count_reduce:
641 case nir_intrinsic_rotate:
642 case nir_intrinsic_shuffle_xor:
643 case nir_intrinsic_shuffle_up:
644 case nir_intrinsic_shuffle_down:
645 case nir_intrinsic_shuffle_xor_uniform_ir3:
646 case nir_intrinsic_shuffle_up_uniform_ir3:
647 case nir_intrinsic_shuffle_down_uniform_ir3:
648 case nir_intrinsic_quad_broadcast:
649 case nir_intrinsic_quad_swap_horizontal:
650 case nir_intrinsic_quad_swap_vertical:
651 case nir_intrinsic_quad_swap_diagonal:
652 case nir_intrinsic_quad_vote_any:
653 case nir_intrinsic_quad_vote_all:
654 case nir_intrinsic_load_shared2_amd:
655 case nir_intrinsic_load_global_constant:
656 case nir_intrinsic_load_global_amd:
657 case nir_intrinsic_load_uniform:
658 case nir_intrinsic_load_constant:
659 case nir_intrinsic_load_sample_pos_from_id:
660 case nir_intrinsic_load_kernel_input:
661 case nir_intrinsic_load_task_payload:
662 case nir_intrinsic_load_buffer_amd:
663 case nir_intrinsic_load_typed_buffer_amd:
664 case nir_intrinsic_image_levels:
665 case nir_intrinsic_image_deref_levels:
666 case nir_intrinsic_bindless_image_levels:
667 case nir_intrinsic_image_samples:
668 case nir_intrinsic_image_deref_samples:
669 case nir_intrinsic_bindless_image_samples:
670 case nir_intrinsic_image_size:
671 case nir_intrinsic_image_deref_size:
672 case nir_intrinsic_bindless_image_size:
673 case nir_intrinsic_image_descriptor_amd:
674 case nir_intrinsic_image_deref_descriptor_amd:
675 case nir_intrinsic_bindless_image_descriptor_amd:
676 case nir_intrinsic_strict_wqm_coord_amd:
677 case nir_intrinsic_copy_deref:
678 case nir_intrinsic_vulkan_resource_index:
679 case nir_intrinsic_vulkan_resource_reindex:
680 case nir_intrinsic_load_vulkan_descriptor:
681 case nir_intrinsic_atomic_counter_read:
682 case nir_intrinsic_atomic_counter_read_deref:
683 case nir_intrinsic_quad_swizzle_amd:
684 case nir_intrinsic_masked_swizzle_amd:
685 case nir_intrinsic_is_sparse_texels_resident:
686 case nir_intrinsic_is_sparse_resident_zink:
687 case nir_intrinsic_sparse_residency_code_and:
688 case nir_intrinsic_bvh64_intersect_ray_amd:
689 case nir_intrinsic_image_deref_load_param_intel:
690 case nir_intrinsic_image_load_raw_intel:
691 case nir_intrinsic_get_ubo_size:
692 case nir_intrinsic_load_ssbo_address:
693 case nir_intrinsic_load_global_constant_bounded:
694 case nir_intrinsic_load_global_constant_offset:
695 case nir_intrinsic_load_reg:
696 case nir_intrinsic_load_constant_agx:
697 case nir_intrinsic_load_reg_indirect:
698 case nir_intrinsic_load_const_ir3:
699 case nir_intrinsic_load_frag_size_ir3:
700 case nir_intrinsic_load_frag_offset_ir3:
701 case nir_intrinsic_bindless_resource_ir3: {
702 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
703 for (unsigned i = 0; i < num_srcs; i++) {
704 if (src_divergent(instr->src[i], state)) {
705 is_divergent = true;
706 break;
707 }
708 }
709 break;
710 }
711
   case nir_intrinsic_resource_intel:
      /* Not having the non_uniform flag with divergent sources is undefined
       * behavior. The Intel driver defines it to pick the lowest numbered
       * live SIMD lane (via emit_uniformize).
       */
717 if ((nir_intrinsic_resource_access_intel(instr) &
718 nir_resource_intel_non_uniform) != 0) {
719 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
720 for (unsigned i = 0; i < num_srcs; i++) {
721 if (src_divergent(instr->src[i], state)) {
722 is_divergent = true;
723 break;
724 }
725 }
726 }
727 break;
728
729 case nir_intrinsic_shuffle:
730 is_divergent = src_divergent(instr->src[0], state) &&
731 src_divergent(instr->src[1], state);
732 break;
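
   /* Illustrative note (restating the handling above): shuffle is uniform if
    * either source is uniform. With a uniform index, every invocation reads the
    * same lane; with a uniform value, it does not matter which lane is read:
    *
    *    ssa_3 = shuffle ssa_uniform_val, ssa_divergent_idx    ; uniform
    *    ssa_4 = shuffle ssa_divergent_val, ssa_uniform_idx    ; uniform
    *    ssa_5 = shuffle ssa_divergent_val, ssa_divergent_idx  ; divergent
    */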
733
734 /* Intrinsics which are always divergent */
735 case nir_intrinsic_inverse_ballot:
736 case nir_intrinsic_load_color0:
737 case nir_intrinsic_load_color1:
738 case nir_intrinsic_load_param:
739 case nir_intrinsic_load_sample_id:
740 case nir_intrinsic_load_sample_id_no_per_sample:
741 case nir_intrinsic_load_sample_mask_in:
742 case nir_intrinsic_load_interpolated_input:
743 case nir_intrinsic_load_point_coord_maybe_flipped:
744 case nir_intrinsic_load_barycentric_pixel:
745 case nir_intrinsic_load_barycentric_centroid:
746 case nir_intrinsic_load_barycentric_sample:
747 case nir_intrinsic_load_barycentric_model:
748 case nir_intrinsic_load_barycentric_at_sample:
749 case nir_intrinsic_load_barycentric_at_offset:
750 case nir_intrinsic_load_barycentric_at_offset_nv:
751 case nir_intrinsic_load_barycentric_coord_pixel:
752 case nir_intrinsic_load_barycentric_coord_centroid:
753 case nir_intrinsic_load_barycentric_coord_sample:
754 case nir_intrinsic_load_barycentric_coord_at_sample:
755 case nir_intrinsic_load_barycentric_coord_at_offset:
756 case nir_intrinsic_load_persp_center_rhw_ir3:
757 case nir_intrinsic_interp_deref_at_offset:
758 case nir_intrinsic_interp_deref_at_sample:
759 case nir_intrinsic_interp_deref_at_centroid:
760 case nir_intrinsic_interp_deref_at_vertex:
761 case nir_intrinsic_load_tess_coord:
762 case nir_intrinsic_load_tess_coord_xy:
763 case nir_intrinsic_load_point_coord:
764 case nir_intrinsic_load_line_coord:
765 case nir_intrinsic_load_frag_coord:
766 case nir_intrinsic_load_frag_coord_zw:
767 case nir_intrinsic_load_frag_coord_zw_pan:
768 case nir_intrinsic_load_frag_coord_unscaled_ir3:
769 case nir_intrinsic_load_pixel_coord:
770 case nir_intrinsic_load_fully_covered:
771 case nir_intrinsic_load_sample_pos:
772 case nir_intrinsic_load_sample_pos_or_center:
773 case nir_intrinsic_load_vertex_id_zero_base:
774 case nir_intrinsic_load_vertex_id:
775 case nir_intrinsic_load_invocation_id:
776 case nir_intrinsic_load_local_invocation_id:
777 case nir_intrinsic_load_local_invocation_index:
778 case nir_intrinsic_load_global_invocation_id:
779 case nir_intrinsic_load_global_invocation_index:
780 case nir_intrinsic_load_subgroup_invocation:
781 case nir_intrinsic_load_subgroup_eq_mask:
782 case nir_intrinsic_load_subgroup_ge_mask:
783 case nir_intrinsic_load_subgroup_gt_mask:
784 case nir_intrinsic_load_subgroup_le_mask:
785 case nir_intrinsic_load_subgroup_lt_mask:
786 case nir_intrinsic_load_helper_invocation:
787 case nir_intrinsic_is_helper_invocation:
788 case nir_intrinsic_load_scratch:
789 case nir_intrinsic_deref_atomic:
790 case nir_intrinsic_deref_atomic_swap:
791 case nir_intrinsic_ssbo_atomic:
792 case nir_intrinsic_ssbo_atomic_swap:
793 case nir_intrinsic_ssbo_atomic_ir3:
794 case nir_intrinsic_ssbo_atomic_swap_ir3:
795 case nir_intrinsic_image_deref_atomic:
796 case nir_intrinsic_image_deref_atomic_swap:
797 case nir_intrinsic_image_atomic:
798 case nir_intrinsic_image_atomic_swap:
799 case nir_intrinsic_bindless_image_atomic:
800 case nir_intrinsic_bindless_image_atomic_swap:
801 case nir_intrinsic_shared_atomic:
802 case nir_intrinsic_shared_atomic_swap:
803 case nir_intrinsic_task_payload_atomic:
804 case nir_intrinsic_task_payload_atomic_swap:
805 case nir_intrinsic_global_atomic:
806 case nir_intrinsic_global_atomic_swap:
807 case nir_intrinsic_global_atomic_amd:
808 case nir_intrinsic_global_atomic_agx:
809 case nir_intrinsic_global_atomic_swap_amd:
810 case nir_intrinsic_global_atomic_swap_agx:
811 case nir_intrinsic_global_atomic_2x32:
812 case nir_intrinsic_global_atomic_swap_2x32:
813 case nir_intrinsic_global_atomic_ir3:
814 case nir_intrinsic_global_atomic_swap_ir3:
815 case nir_intrinsic_atomic_counter_add:
816 case nir_intrinsic_atomic_counter_min:
817 case nir_intrinsic_atomic_counter_max:
818 case nir_intrinsic_atomic_counter_and:
819 case nir_intrinsic_atomic_counter_or:
820 case nir_intrinsic_atomic_counter_xor:
821 case nir_intrinsic_atomic_counter_inc:
822 case nir_intrinsic_atomic_counter_pre_dec:
823 case nir_intrinsic_atomic_counter_post_dec:
824 case nir_intrinsic_atomic_counter_exchange:
825 case nir_intrinsic_atomic_counter_comp_swap:
826 case nir_intrinsic_atomic_counter_add_deref:
827 case nir_intrinsic_atomic_counter_min_deref:
828 case nir_intrinsic_atomic_counter_max_deref:
829 case nir_intrinsic_atomic_counter_and_deref:
830 case nir_intrinsic_atomic_counter_or_deref:
831 case nir_intrinsic_atomic_counter_xor_deref:
832 case nir_intrinsic_atomic_counter_inc_deref:
833 case nir_intrinsic_atomic_counter_pre_dec_deref:
834 case nir_intrinsic_atomic_counter_post_dec_deref:
835 case nir_intrinsic_atomic_counter_exchange_deref:
836 case nir_intrinsic_atomic_counter_comp_swap_deref:
837 case nir_intrinsic_exclusive_scan:
838 case nir_intrinsic_exclusive_scan_clusters_ir3:
839 case nir_intrinsic_ballot_bit_count_exclusive:
840 case nir_intrinsic_ballot_bit_count_inclusive:
841 case nir_intrinsic_write_invocation_amd:
842 case nir_intrinsic_mbcnt_amd:
843 case nir_intrinsic_lane_permute_16_amd:
844 case nir_intrinsic_dpp16_shift_amd:
845 case nir_intrinsic_elect:
846 case nir_intrinsic_elect_any_ir3:
847 case nir_intrinsic_load_tlb_color_brcm:
848 case nir_intrinsic_load_tess_rel_patch_id_amd:
849 case nir_intrinsic_load_gs_vertex_offset_amd:
850 case nir_intrinsic_is_subgroup_invocation_lt_amd:
851 case nir_intrinsic_load_packed_passthrough_primitive_amd:
852 case nir_intrinsic_load_initial_edgeflags_amd:
853 case nir_intrinsic_gds_atomic_add_amd:
854 case nir_intrinsic_load_rt_arg_scratch_offset_amd:
855 case nir_intrinsic_load_intersection_opaque_amd:
856 case nir_intrinsic_load_vector_arg_amd:
857 case nir_intrinsic_load_btd_stack_id_intel:
858 case nir_intrinsic_load_topology_id_intel:
859 case nir_intrinsic_load_scratch_base_ptr:
860 case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
861 case nir_intrinsic_ordered_add_loop_gfx12_amd:
862 case nir_intrinsic_xfb_counter_sub_gfx11_amd:
863 case nir_intrinsic_unit_test_divergent_amd:
864 case nir_intrinsic_load_stack:
865 case nir_intrinsic_load_ray_launch_id:
866 case nir_intrinsic_load_ray_instance_custom_index:
867 case nir_intrinsic_load_ray_geometry_index:
868 case nir_intrinsic_load_ray_world_direction:
869 case nir_intrinsic_load_ray_world_origin:
870 case nir_intrinsic_load_ray_object_origin:
871 case nir_intrinsic_load_ray_object_direction:
872 case nir_intrinsic_load_ray_t_min:
873 case nir_intrinsic_load_ray_t_max:
874 case nir_intrinsic_load_ray_object_to_world:
875 case nir_intrinsic_load_ray_world_to_object:
876 case nir_intrinsic_load_ray_hit_kind:
877 case nir_intrinsic_load_ray_flags:
878 case nir_intrinsic_load_cull_mask:
879 case nir_intrinsic_load_sysval_nv:
880 case nir_intrinsic_emit_vertex_nv:
881 case nir_intrinsic_end_primitive_nv:
882 case nir_intrinsic_report_ray_intersection:
883 case nir_intrinsic_rq_proceed:
884 case nir_intrinsic_rq_load:
885 case nir_intrinsic_load_ray_triangle_vertex_positions:
886 case nir_intrinsic_cmat_extract:
887 case nir_intrinsic_cmat_muladd_amd:
888 case nir_intrinsic_dpas_intel:
889 case nir_intrinsic_isberd_nv:
890 case nir_intrinsic_al2p_nv:
891 case nir_intrinsic_ald_nv:
892 case nir_intrinsic_ipa_nv:
893 case nir_intrinsic_ldtram_nv:
894 case nir_intrinsic_printf:
895 case nir_intrinsic_load_gs_header_ir3:
896 case nir_intrinsic_load_tcs_header_ir3:
897 case nir_intrinsic_load_rel_patch_id_ir3:
898 case nir_intrinsic_brcst_active_ir3:
899 case nir_intrinsic_load_helper_op_id_agx:
900 case nir_intrinsic_load_helper_arg_lo_agx:
901 case nir_intrinsic_load_helper_arg_hi_agx:
902 case nir_intrinsic_stack_map_agx:
903 case nir_intrinsic_stack_unmap_agx:
904 case nir_intrinsic_load_exported_agx:
905 case nir_intrinsic_load_local_pixel_agx:
906 case nir_intrinsic_load_coefficients_agx:
907 case nir_intrinsic_load_active_subgroup_invocation_agx:
908 case nir_intrinsic_load_sample_mask:
909 case nir_intrinsic_quad_ballot_agx:
910 case nir_intrinsic_load_agx:
911 is_divergent = true;
912 break;
913
914 default:
915 #ifdef NDEBUG
916 is_divergent = true;
917 break;
918 #else
919 nir_print_instr(&instr->instr, stderr);
920 unreachable("\nNIR divergence analysis: Unhandled intrinsic.");
921 #endif
922 }
923
924 instr->def.divergent = is_divergent;
925 return is_divergent;
926 }
927
static bool
visit_tex(nir_tex_instr *instr, struct divergence_state *state)
{
931 if (instr->def.divergent)
932 return false;
933
934 bool is_divergent = false;
935
936 for (unsigned i = 0; i < instr->num_srcs; i++) {
937 switch (instr->src[i].src_type) {
938 case nir_tex_src_sampler_deref:
939 case nir_tex_src_sampler_handle:
940 case nir_tex_src_sampler_offset:
941 is_divergent |= src_divergent(instr->src[i].src, state) &&
942 instr->sampler_non_uniform;
943 break;
944 case nir_tex_src_texture_deref:
945 case nir_tex_src_texture_handle:
946 case nir_tex_src_texture_offset:
947 is_divergent |= src_divergent(instr->src[i].src, state) &&
948 instr->texture_non_uniform;
949 break;
950 default:
951 is_divergent |= src_divergent(instr->src[i].src, state);
952 break;
953 }
954 }
955
956 instr->def.divergent = is_divergent;
957 return is_divergent;
958 }
959
static bool
visit_def(nir_def *def, struct divergence_state *state)
{
963 return false;
964 }
965
static bool
nir_variable_mode_is_uniform(nir_variable_mode mode)
{
969 switch (mode) {
970 case nir_var_uniform:
971 case nir_var_mem_ubo:
972 case nir_var_mem_ssbo:
973 case nir_var_mem_shared:
974 case nir_var_mem_task_payload:
975 case nir_var_mem_global:
976 case nir_var_image:
977 return true;
978 default:
979 return false;
980 }
981 }
982
static bool
nir_variable_is_uniform(nir_shader *shader, nir_variable *var,
                        struct divergence_state *state)
{
987 if (nir_variable_mode_is_uniform(var->data.mode))
988 return true;
989
990 /* Handle system value variables. */
991 if (var->data.mode == nir_var_system_value) {
992 /* Fake the instruction to reuse visit_intrinsic for all sysvals. */
993 nir_intrinsic_instr fake_instr;
994
995 memset(&fake_instr, 0, sizeof(fake_instr));
996 fake_instr.intrinsic =
997 nir_intrinsic_from_system_value(var->data.location);
998
999 visit_intrinsic(&fake_instr, state);
1000 return !fake_instr.def.divergent;
1001 }
1002
1003 nir_divergence_options options = state->options;
1004 gl_shader_stage stage = shader->info.stage;
1005
1006 if (stage == MESA_SHADER_FRAGMENT &&
1007 (options & nir_divergence_single_prim_per_subgroup) &&
1008 var->data.mode == nir_var_shader_in &&
1009 var->data.interpolation == INTERP_MODE_FLAT)
1010 return true;
1011
1012 if (stage == MESA_SHADER_TESS_CTRL &&
1013 (options & nir_divergence_single_patch_per_tcs_subgroup) &&
1014 var->data.mode == nir_var_shader_out && var->data.patch)
1015 return true;
1016
1017 if (stage == MESA_SHADER_TESS_EVAL &&
1018 (options & nir_divergence_single_patch_per_tes_subgroup) &&
1019 var->data.mode == nir_var_shader_in && var->data.patch)
1020 return true;
1021
1022 return false;
1023 }
1024
static bool
visit_deref(nir_shader *shader, nir_deref_instr *deref,
            struct divergence_state *state)
{
1029 if (deref->def.divergent)
1030 return false;
1031
1032 bool is_divergent = false;
1033 switch (deref->deref_type) {
1034 case nir_deref_type_var:
1035 is_divergent = !nir_variable_is_uniform(shader, deref->var, state);
1036 break;
1037 case nir_deref_type_array:
1038 case nir_deref_type_ptr_as_array:
1039 is_divergent = src_divergent(deref->arr.index, state);
1040 FALLTHROUGH;
1041 case nir_deref_type_struct:
1042 case nir_deref_type_array_wildcard:
1043 is_divergent |= src_divergent(deref->parent, state);
1044 break;
1045 case nir_deref_type_cast:
1046 is_divergent = !nir_variable_mode_is_uniform(deref->var->data.mode) ||
1047 src_divergent(deref->parent, state);
1048 break;
1049 }
1050
1051 deref->def.divergent = is_divergent;
1052 return is_divergent;
1053 }
1054
static bool
visit_jump(nir_jump_instr *jump, struct divergence_state *state)
{
1058 switch (jump->type) {
1059 case nir_jump_continue:
1060 if (state->divergent_loop_continue)
1061 return false;
1062 if (state->divergent_loop_cf)
1063 state->divergent_loop_continue = true;
1064 return state->divergent_loop_continue;
1065 case nir_jump_break:
1066 if (state->divergent_loop_break)
1067 return false;
1068 if (state->divergent_loop_cf)
1069 state->divergent_loop_break = true;
1070 return state->divergent_loop_break;
1071 case nir_jump_halt:
1072 /* This totally kills invocations so it doesn't add divergence */
1073 break;
1074 case nir_jump_return:
1075 unreachable("NIR divergence analysis: Unsupported return instruction.");
1076 break;
   case nir_jump_goto:
   case nir_jump_goto_if:
      unreachable("NIR divergence analysis: Unsupported goto/goto_if instruction.");
      break;
1081 }
1082 return false;
1083 }
1084
static bool
set_ssa_def_not_divergent(nir_def *def, void *invariant)
{
1088 def->divergent = false;
1089 def->loop_invariant = *(bool *)invariant;
1090 return true;
1091 }
1092
static bool
instr_is_loop_invariant(nir_instr *instr, struct divergence_state *state)
{
1096 if (!state->loop)
1097 return false;
1098
1099 switch (instr->type) {
1100 case nir_instr_type_load_const:
1101 case nir_instr_type_undef:
1102 case nir_instr_type_debug_info:
1103 case nir_instr_type_jump:
1104 return true;
1105 case nir_instr_type_intrinsic:
1106 if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
1107 return false;
1108 FALLTHROUGH;
1109 case nir_instr_type_alu:
1110 case nir_instr_type_deref:
1111 case nir_instr_type_tex:
1112 return nir_foreach_src(instr, src_invariant, state->loop);
1113 case nir_instr_type_phi:
1114 case nir_instr_type_call:
1115 case nir_instr_type_parallel_copy:
1116 default:
1117 unreachable("NIR divergence analysis: Unsupported instruction type.");
1118 }
1119 }
1120
static bool
update_instr_divergence(nir_instr *instr, struct divergence_state *state)
{
1124 switch (instr->type) {
1125 case nir_instr_type_alu:
1126 return visit_alu(nir_instr_as_alu(instr), state);
1127 case nir_instr_type_intrinsic:
1128 return visit_intrinsic(nir_instr_as_intrinsic(instr), state);
1129 case nir_instr_type_tex:
1130 return visit_tex(nir_instr_as_tex(instr), state);
1131 case nir_instr_type_load_const:
1132 return visit_def(&nir_instr_as_load_const(instr)->def, state);
1133 case nir_instr_type_undef:
1134 return visit_def(&nir_instr_as_undef(instr)->def, state);
1135 case nir_instr_type_deref:
1136 return visit_deref(state->shader, nir_instr_as_deref(instr), state);
1137 case nir_instr_type_debug_info:
1138 return false;
1139 case nir_instr_type_jump:
1140 case nir_instr_type_phi:
1141 case nir_instr_type_call:
1142 case nir_instr_type_parallel_copy:
1143 default:
1144 unreachable("NIR divergence analysis: Unsupported instruction type.");
1145 }
1146 }
1147
static bool
visit_block(nir_block *block, struct divergence_state *state)
{
1151 bool has_changed = false;
1152
1153 nir_foreach_instr(instr, block) {
1154 /* phis are handled when processing the branches */
1155 if (instr->type == nir_instr_type_phi)
1156 continue;
1157
1158 if (state->first_visit) {
1159 bool invariant = instr_is_loop_invariant(instr, state);
1160 nir_foreach_def(instr, set_ssa_def_not_divergent, &invariant);
1161 }
1162
1163 if (instr->type == nir_instr_type_jump) {
1164 has_changed |= visit_jump(nir_instr_as_jump(instr), state);
1165 } else {
1166 has_changed |= update_instr_divergence(instr, state);
1167 }
1168 }
1169
1170 bool divergent = state->divergent_loop_cf ||
1171 state->divergent_loop_continue ||
1172 state->divergent_loop_break;
1173 if (divergent != block->divergent) {
1174 block->divergent = divergent;
1175 has_changed = true;
1176 }
1177
1178 return has_changed;
1179 }
1180
/* There are 3 types of phi instructions:
 * (1) gamma: represents the joining point of different paths
 *     created by an “if-then-else” branch.
 *     The resulting value is divergent if the branch condition
 *     or any of the source values is divergent. */
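
/* Illustrative example (assumed IR): even with uniform sources, a gamma phi is
 * divergent when the branch condition is divergent, because invocations reach
 * the merge point from different predecessors:
 *
 *    if (divergent_cond) {
 *       ssa_3 = load_const 1.0
 *    } else {
 *       ssa_4 = load_const 2.0
 *    }
 *    ssa_5 = phi ssa_3, ssa_4   ; divergent although both sources are uniform
 */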
static bool
visit_if_merge_phi(nir_phi_instr *phi, bool if_cond_divergent, bool ignore_undef)
{
1189 if (phi->def.divergent)
1190 return false;
1191
1192 unsigned defined_srcs = 0;
1193 nir_foreach_phi_src(src, phi) {
1194 /* if any source value is divergent, the resulting value is divergent */
1195 if (nir_src_is_divergent(&src->src)) {
1196 phi->def.divergent = true;
1197 return true;
1198 }
1199 if (src->src.ssa->parent_instr->type != nir_instr_type_undef) {
1200 defined_srcs++;
1201 }
1202 }
1203
1204 if (!(ignore_undef && defined_srcs <= 1) && if_cond_divergent) {
1205 phi->def.divergent = true;
1206 return true;
1207 }
1208
1209 return false;
1210 }
1211
/* There are 3 types of phi instructions:
 * (2) mu: only exists at loop headers and
 *     merges the initial and loop-carried values.
 *     The resulting value is divergent if any source value
 *     is divergent or a divergent loop continue condition
 *     is associated with a different ssa-def. */
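
/* Illustrative example (assumed IR): a mu phi stays uniform while all
 * loop-active invocations iterate together. With a divergent continue, the
 * loop-carried values come from different ssa-defs, so the phi is divergent:
 *
 *    loop {
 *       ssa_2 = phi ssa_0 (preheader), ssa_2 (continue), ssa_3 (loop end)
 *       if (divergent_cond) continue;
 *       ssa_3 = iadd ssa_2, 1
 *    }
 *
 * If the continue were uniform, ssa_2 would remain uniform.
 */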
static bool
visit_loop_header_phi(nir_phi_instr *phi, nir_block *preheader, bool divergent_continue)
{
1221 if (phi->def.divergent)
1222 return false;
1223
1224 nir_def *same = NULL;
1225 nir_foreach_phi_src(src, phi) {
1226 /* if any source value is divergent, the resulting value is divergent */
1227 if (nir_src_is_divergent(&src->src)) {
1228 phi->def.divergent = true;
1229 return true;
1230 }
1231 /* if this loop is uniform, we're done here */
1232 if (!divergent_continue)
1233 continue;
1234 /* skip the loop preheader */
1235 if (src->pred == preheader)
1236 continue;
1237
1238 /* check if all loop-carried values are from the same ssa-def */
1239 if (!same)
1240 same = src->src.ssa;
1241 else if (same != src->src.ssa) {
1242 phi->def.divergent = true;
1243 return true;
1244 }
1245 }
1246
1247 return false;
1248 }
1249
/* There are 3 types of phi instructions:
 * (3) eta: represents values that leave a loop.
 *     The resulting value is divergent if the source value is divergent
 *     or any loop exit condition is divergent for a value which is
 *     not loop-invariant (see nir_src_is_divergent()).
 */
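
/* Illustrative example (assumed IR): with a divergent break, invocations exit
 * the loop in different iterations, so an eta/LCSSA phi of a non-invariant
 * value is divergent even though that value is uniform inside the loop:
 *
 *    loop {
 *       ssa_4 = iadd ...                ; uniform inside the loop
 *       if (divergent_cond) break;
 *    }
 *    ssa_7 = phi ssa_4                  ; divergent; it would stay uniform if
 *                                       ; ssa_4 were loop-invariant
 */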
static bool
visit_loop_exit_phi(nir_phi_instr *phi, nir_loop *loop)
{
1259 if (phi->def.divergent)
1260 return false;
1261
1262 nir_def *same = NULL;
1263 nir_foreach_phi_src(src, phi) {
1264 /* If any loop exit condition is divergent and this value is not loop
1265 * invariant, or if the source value is divergent, then the resulting
1266 * value is divergent.
1267 */
1268 if ((loop->divergent_break && !src_invariant(&src->src, loop)) ||
1269 nir_src_is_divergent(&src->src)) {
1270 phi->def.divergent = true;
1271 return true;
1272 }
1273
1274 /* if this loop is uniform, we're done here */
1275 if (!loop->divergent_break)
1276 continue;
1277
1278 /* check if all loop-exit values are from the same ssa-def */
1279 if (!same)
1280 same = src->src.ssa;
1281 else if (same != src->src.ssa) {
1282 phi->def.divergent = true;
1283 return true;
1284 }
1285 }
1286
1287 return false;
1288 }
1289
static bool
visit_if(nir_if *if_stmt, struct divergence_state *state)
{
1293 bool progress = false;
1294 bool cond_divergent = src_divergent(if_stmt->condition, state);
1295
1296 struct divergence_state then_state = *state;
1297 then_state.divergent_loop_cf |= cond_divergent;
1298 progress |= visit_cf_list(&if_stmt->then_list, &then_state);
1299
1300 struct divergence_state else_state = *state;
1301 else_state.divergent_loop_cf |= cond_divergent;
1302 progress |= visit_cf_list(&if_stmt->else_list, &else_state);
1303
1304 /* handle phis after the IF */
1305 bool invariant = state->loop && src_invariant(&if_stmt->condition, state->loop);
1306 nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&if_stmt->cf_node)) {
1307 if (state->first_visit) {
1308 phi->def.divergent = false;
1309 phi->def.loop_invariant =
1310 invariant && nir_foreach_src(&phi->instr, src_invariant, state->loop);
1311 }
1312 bool ignore_undef = state->options & nir_divergence_ignore_undef_if_phi_srcs;
1313 progress |= visit_if_merge_phi(phi, cond_divergent, ignore_undef);
1314 }
1315
1316 /* join loop divergence information from both branch legs */
1317 state->divergent_loop_continue |= then_state.divergent_loop_continue ||
1318 else_state.divergent_loop_continue;
1319 state->divergent_loop_break |= then_state.divergent_loop_break ||
1320 else_state.divergent_loop_break;
1321
   /* A divergent continue makes succeeding loop CF divergent:
    * not all loop-active invocations participate in the remaining loop body,
    * which means that a following break might be taken only by some
    * invocations. */
   state->divergent_loop_cf |= state->divergent_loop_continue;
1326
1327 state->consider_loop_invariance |= then_state.consider_loop_invariance ||
1328 else_state.consider_loop_invariance;
1329
1330 return progress;
1331 }
1332
static bool
visit_loop(nir_loop *loop, struct divergence_state *state)
{
1336 assert(!nir_loop_has_continue_construct(loop));
1337 bool progress = false;
1338 nir_block *loop_header = nir_loop_first_block(loop);
1339 nir_block *loop_preheader = nir_block_cf_tree_prev(loop_header);
1340
1341 /* handle loop header phis first: we have no knowledge yet about
1342 * the loop's control flow or any loop-carried sources. */
1343 nir_foreach_phi(phi, loop_header) {
1344 if (!state->first_visit && phi->def.divergent)
1345 continue;
1346
1347 phi->def.loop_invariant = false;
1348 nir_foreach_phi_src(src, phi) {
1349 if (src->pred == loop_preheader) {
1350 phi->def.divergent = nir_src_is_divergent(&src->src);
1351 break;
1352 }
1353 }
1354 progress |= phi->def.divergent;
1355 }
1356
1357 /* setup loop state */
1358 struct divergence_state loop_state = *state;
1359 loop_state.loop = loop;
1360 loop_state.divergent_loop_cf = false;
1361 loop_state.divergent_loop_continue = false;
1362 loop_state.divergent_loop_break = false;
1363
1364 /* process loop body until no further changes are made */
1365 bool repeat;
1366 do {
1367 progress |= visit_cf_list(&loop->body, &loop_state);
1368 repeat = false;
1369
1370 /* revisit loop header phis to see if something has changed */
1371 nir_foreach_phi(phi, loop_header) {
1372 repeat |= visit_loop_header_phi(phi, loop_preheader,
1373 loop_state.divergent_loop_continue);
1374 }
1375
1376 loop_state.divergent_loop_cf = false;
1377 loop_state.first_visit = false;
1378 } while (repeat);
1379
1380 loop->divergent_continue = loop_state.divergent_loop_continue;
1381 loop->divergent_break = loop_state.divergent_loop_break;
1382
1383 /* handle phis after the loop */
1384 nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&loop->cf_node)) {
1385 if (state->first_visit) {
1386 phi->def.divergent = false;
1387 phi->def.loop_invariant = false;
1388 }
1389 progress |= visit_loop_exit_phi(phi, loop);
1390 }
1391
1392 state->consider_loop_invariance |= loop_state.consider_loop_invariance ||
1393 loop->divergent_break;
1394 return progress;
1395 }
1396
static bool
visit_cf_list(struct exec_list *list, struct divergence_state *state)
{
1400 bool has_changed = false;
1401
1402 foreach_list_typed(nir_cf_node, node, node, list) {
1403 switch (node->type) {
1404 case nir_cf_node_block:
1405 has_changed |= visit_block(nir_cf_node_as_block(node), state);
1406 break;
1407 case nir_cf_node_if:
1408 has_changed |= visit_if(nir_cf_node_as_if(node), state);
1409 break;
1410 case nir_cf_node_loop:
1411 has_changed |= visit_loop(nir_cf_node_as_loop(node), state);
1412 break;
1413 case nir_cf_node_function:
1414 unreachable("NIR divergence analysis: Unsupported cf_node type.");
1415 }
1416 }
1417
1418 return has_changed;
1419 }
1420
void
nir_divergence_analysis_impl(nir_function_impl *impl, nir_divergence_options options)
{
1424 nir_metadata_require(impl, nir_metadata_block_index);
1425
1426 struct divergence_state state = {
1427 .stage = impl->function->shader->info.stage,
1428 .shader = impl->function->shader,
1429 .options = options,
1430 .loop = NULL,
1431 .divergent_loop_cf = false,
1432 .divergent_loop_continue = false,
1433 .divergent_loop_break = false,
1434 .first_visit = true,
1435 };
1436
1437 visit_cf_list(&impl->body, &state);
1438
1439 nir_metadata_preserve(impl, nir_metadata_all);
1440 }
1441
void
nir_divergence_analysis(nir_shader *shader)
{
1445 shader->info.divergence_analysis_run = true;
1446 nir_divergence_analysis_impl(nir_shader_get_entrypoint(shader),
1447 shader->options->divergence_analysis_options);
1448 }
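
/* Typical usage sketch (illustrative only; the exact pass ordering and
 * arguments are the caller's choice). The analysis expects LCSSA form;
 * afterwards each nir_def carries its divergence in def->divergent:
 *
 *    NIR_PASS(_, shader, nir_convert_to_lcssa, true, true);
 *    nir_divergence_analysis(shader);
 *
 *    nir_foreach_block(block, impl) {
 *       nir_foreach_instr(instr, block) {
 *          nir_def *def = nir_instr_def(instr);
 *          if (def && def->divergent)
 *             ...;
 *       }
 *    }
 */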
1449
1450 /* Compute divergence between vertices of the same primitive. This uses
1451 * the same divergent field in nir_def and nir_loop as the regular divergence
1452 * pass.
1453 */
void
nir_vertex_divergence_analysis(nir_shader *shader)
{
1457 shader->info.divergence_analysis_run = false;
1458
1459 struct divergence_state state = {
1460 .stage = shader->info.stage,
1461 .shader = shader,
1462 .options = shader->options->divergence_analysis_options,
1463 .loop = NULL,
1464 .vertex_divergence = true,
1465 .first_visit = true,
1466 };
1467
1468 nir_metadata_require(nir_shader_get_entrypoint(shader),
1469 nir_metadata_block_index);
1470 visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state);
1471 nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_all);
1472 }
1473
bool
nir_has_divergent_loop(nir_shader *shader)
{
1477 nir_function_impl *func = nir_shader_get_entrypoint(shader);
1478
1479 foreach_list_typed(nir_cf_node, node, node, &func->body) {
1480 if (node->type == nir_cf_node_loop) {
1481 if (nir_cf_node_as_loop(node)->divergent_break)
1482 return true;
1483 }
1484 }
1485
1486 return false;
1487 }
1488
1489 /* Recommended when computing divergence information in shared code such
1490 * as the GLSL linker.
1491 */
void
nir_clear_divergence_info(nir_shader *nir)
{
1495 nir_foreach_function_impl(impl, nir) {
1496 nir_foreach_block(block, impl) {
1497 /* true is the safer value. */
1498 block->divergent = true;
1499
1500 nir_foreach_instr(instr, block) {
1501 nir_def *def = nir_instr_def(instr);
1502 if (def)
1503 def->divergent = true;
1504 }
1505 }
1506 }
1507 }
1508