/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "nir.h"

/* This pass computes, for each SSA definition, whether it is uniform,
 * i.e. whether it has the same value for all invocations of the group.
 *
 * This divergence analysis pass expects the shader to be in LCSSA form.
 *
 * This algorithm implements "The Simple Divergence Analysis" from
 * Diogo Sampaio, Rafael De Souza, Sylvain Collange, Fernando Magno Quintão Pereira.
 * Divergence Analysis. ACM Transactions on Programming Languages and Systems (TOPLAS),
 * ACM, 2013, 35 (4), pp.13:1-13:36. <10.1145/2523815>. <hal-00909072v2>
 */
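
/* Illustrative example (GLSL-level, not the pass's input form): a value
 * computed only from uniform inputs is uniform, while one computed from
 * per-invocation system values is divergent:
 *
 *    uint a = gl_WorkGroupID.x * 2u;        // uniform across the workgroup
 *    uint b = gl_LocalInvocationID.x * 2u;  // divergent
 */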

struct divergence_state {
   const gl_shader_stage stage;
   nir_shader *shader;
   nir_function_impl *impl;
   nir_divergence_options options;
   nir_loop *loop;
   bool loop_all_invariant;

   /* Whether the caller requested vertex divergence (meaning between vertices
    * of the same primitive) instead of subgroup invocation divergence
    * (between invocations of the same subgroup). For example, patch input
    * loads are always convergent, while subgroup intrinsics are divergent
    * because vertices of the same primitive can be processed by different
    * subgroups.
    */
   bool vertex_divergence;

   /** current control flow state */
   /* True if some loop-active invocations might take a different control-flow path.
    * A divergent break does not cause subsequent control-flow to be considered
    * divergent because those invocations are no longer active in the loop.
    * For a divergent if, both sides are considered divergent flow because
    * the other side is still loop-active. */
   bool divergent_loop_cf;
   /* True if a divergent continue happened since the loop header */
   bool divergent_loop_continue;
   /* True if a divergent break happened since the loop header */
   bool divergent_loop_break;
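
   /* Illustrative example (assumed divergent conditions) of why break and
    * continue are tracked differently by the fields above:
    *
    *    loop {
    *       if (divergent_cond) break;     // invocations that leave are no
    *                                      // longer loop-active, so the
    *                                      // following CF stays uniform
    *       if (divergent_cond) continue;  // invocations that skip ahead are
    *                                      // still loop-active, so the
    *                                      // following CF is divergent
    *    }
    */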

   /* True if we visit the block for the first time */
   bool first_visit;
   /* True if we visit a block that is dominated by a loop with a divergent break */
   bool consider_loop_invariance;
};

static bool
visit_cf_list(struct exec_list *list, struct divergence_state *state);

bool
nir_src_is_divergent(nir_src *src)
{
   if (src->ssa->divergent)
      return true;

   nir_cf_node *use_node = nir_src_get_block(src)->cf_node.parent;
   nir_cf_node *def_node = src->ssa->parent_instr->block->cf_node.parent;

   /* Short-cut the common case. */
   if (def_node == use_node)
      return false;

   /* If the source was computed in a divergent loop, and is not
    * loop-invariant, then it must also be considered divergent.
    */
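   /* Illustrative example (assumed divergent break): `a` is uniform inside
    * the loop, but invocations can leave through the break on different
    * iterations, so a use of `a` after the loop is divergent unless `a` is
    * loop-invariant:
    *
    *    loop {
    *       a = ...;
    *       if (divergent_cond) break;
    *    }
    *    use(a);   // divergent
    */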
   bool loop_invariant = src->ssa->loop_invariant;
   while (def_node) {
      if (def_node->type == nir_cf_node_loop) {
         /* Check whether the use is inside this loop. */
         for (nir_cf_node *node = use_node; node != NULL; node = node->parent) {
            if (def_node == node)
               return false;
         }

         /* The use is outside of this loop: if the loop has a divergent
          * break and the value is not loop-invariant, it is divergent. */
         if (nir_cf_node_as_loop(def_node)->divergent_break && !loop_invariant)
            return true;

         /* For outer loops, consider this variable not loop invariant. */
         loop_invariant = false;
      }

      def_node = def_node->parent;
   }

   return false;
}

static inline bool
src_divergent(nir_src src, struct divergence_state *state)
{
   if (!state->consider_loop_invariance)
      return src.ssa->divergent;

   return nir_src_is_divergent(&src);
}

static inline bool
src_invariant(nir_src *src, void *loop)
{
   nir_block *first_block = nir_loop_first_block(loop);

   /* Invariant if the SSA def is defined before the current loop. */
   if (src->ssa->parent_instr->block->index < first_block->index)
      return true;

   if (!src->ssa->loop_invariant)
      return false;

   /* The value might be defined in a nested loop. */
   nir_cf_node *cf_node = src->ssa->parent_instr->block->cf_node.parent;
   while (cf_node->type != nir_cf_node_loop)
      cf_node = cf_node->parent;

   return nir_cf_node_as_loop(cf_node) == loop;
}

static bool
visit_alu(nir_alu_instr *instr, struct divergence_state *state)
{
   if (instr->def.divergent)
      return false;

   unsigned num_src = nir_op_infos[instr->op].num_inputs;

   for (unsigned i = 0; i < num_src; i++) {
      if (src_divergent(instr->src[i].src, state)) {
         instr->def.divergent = true;
         return true;
      }
   }

   return false;
}

/* On some HW, uniform loads can "tear" when there is a pending store/atomic
 * from another wave, so that different invocations see the pre-store value
 * and the post-store value even though they are loading from the same
 * location. This means we have to assume the result is not uniform unless
 * the access is readonly.
 *
 * TODO: The Vulkan memory model is much more strict here and requires an
 * atomic or volatile load for the data race to be valid, which could allow
 * us to do better if it's in use. However, we currently don't have that
 * information plumbed through.
 */
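/* Illustrative example (assumed GLSL): two invocations of one subgroup load
 * the same SSBO word while another wave concurrently stores to it; one
 * invocation may observe the old value and the other the new one:
 *
 *    uint v = ssbo.data[0];   // may "tear": not uniform unless readonly
 */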
static bool
load_may_tear(struct divergence_state *state, nir_intrinsic_instr *instr)
{
   return (state->options & nir_divergence_uniform_load_tears) &&
          !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE);
}

static bool
visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
{
   if (!nir_intrinsic_infos[instr->intrinsic].has_dest)
      return false;

   if (instr->def.divergent)
      return false;

   nir_divergence_options options = state->options;
   gl_shader_stage stage = state->stage;
   bool is_divergent = false;
   switch (instr->intrinsic) {
   case nir_intrinsic_shader_clock:
   case nir_intrinsic_ballot:
   case nir_intrinsic_ballot_relaxed:
   case nir_intrinsic_as_uniform:
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_read_invocation_cond_ir3:
   case nir_intrinsic_read_getlast_ir3:
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
   case nir_intrinsic_first_invocation:
   case nir_intrinsic_last_invocation:
   case nir_intrinsic_load_subgroup_id:
   case nir_intrinsic_shared_append_amd:
   case nir_intrinsic_shared_consume_amd:
      /* VS/TES/GS invocations of the same primitive can be in different
       * subgroups, so subgroup ops are always divergent between vertices of
       * the same primitive.
       */
      is_divergent = state->vertex_divergence;
      break;

   /* Intrinsics which are always uniform */
   case nir_intrinsic_load_preamble:
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_push_constant_zink:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_num_subgroups:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_subgroup_id_shift_ir3:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_cull_triangle_viewport_xy_scale_and_offset_amd:
   case nir_intrinsic_load_cull_line_viewport_xy_scale_and_offset_amd:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_xfb_address:
   case nir_intrinsic_load_num_vertices:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_fep_w_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
   case nir_intrinsic_load_tcs_primitive_mode_amd:
   case nir_intrinsic_load_patch_vertices_in:
   case nir_intrinsic_load_ring_tess_factors_amd:
   case nir_intrinsic_load_ring_tess_offchip_amd:
   case nir_intrinsic_load_ring_tess_factors_offset_amd:
   case nir_intrinsic_load_ring_tess_offchip_offset_amd:
   case nir_intrinsic_load_ring_mesh_scratch_amd:
   case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
   case nir_intrinsic_load_ring_esgs_amd:
   case nir_intrinsic_load_ring_es2gs_offset_amd:
   case nir_intrinsic_load_ring_task_draw_amd:
   case nir_intrinsic_load_ring_task_payload_amd:
   case nir_intrinsic_load_sample_positions_amd:
   case nir_intrinsic_load_rasterization_samples_amd:
   case nir_intrinsic_load_ring_gsvs_amd:
   case nir_intrinsic_load_ring_gs2vs_offset_amd:
   case nir_intrinsic_load_streamout_config_amd:
   case nir_intrinsic_load_streamout_write_index_amd:
   case nir_intrinsic_load_streamout_offset_amd:
   case nir_intrinsic_load_task_ring_entry_amd:
   case nir_intrinsic_load_ring_attr_amd:
   case nir_intrinsic_load_ring_attr_offset_amd:
   case nir_intrinsic_load_provoking_vtx_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
   case nir_intrinsic_load_workgroup_num_input_primitives_amd:
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
   case nir_intrinsic_load_merged_wave_info_amd:
   case nir_intrinsic_load_clamp_vertex_color_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_triangles_enabled_amd:
   case nir_intrinsic_load_cull_small_lines_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_triangle_precision_amd:
   case nir_intrinsic_load_cull_small_line_precision_amd:
   case nir_intrinsic_load_user_data_amd:
   case nir_intrinsic_load_force_vrs_rates_amd:
   case nir_intrinsic_load_tess_level_inner_default:
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_scalar_arg_amd:
   case nir_intrinsic_load_smem_amd:
   case nir_intrinsic_load_resume_shader_address_amd:
   case nir_intrinsic_load_reloc_const_intel:
   case nir_intrinsic_load_btd_global_arg_addr_intel:
   case nir_intrinsic_load_btd_local_arg_addr_intel:
   case nir_intrinsic_load_inline_data_intel:
   case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
   case nir_intrinsic_load_lshs_vertex_stride_amd:
   case nir_intrinsic_load_esgs_vertex_stride_amd:
   case nir_intrinsic_load_hs_out_patch_data_offset_amd:
   case nir_intrinsic_load_clip_half_line_width_amd:
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
   case nir_intrinsic_load_streamout_buffer_amd:
   case nir_intrinsic_load_ordered_id_amd:
   case nir_intrinsic_load_gs_wave_id_amd:
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
   case nir_intrinsic_load_lds_ngg_scratch_base_amd:
   case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
   case nir_intrinsic_load_btd_shader_type_intel:
   case nir_intrinsic_load_base_global_invocation_id:
   case nir_intrinsic_load_base_workgroup_id:
   case nir_intrinsic_load_alpha_reference_amd:
   case nir_intrinsic_load_ubo_uniform_block_intel:
   case nir_intrinsic_load_ssbo_uniform_block_intel:
   case nir_intrinsic_load_shared_uniform_block_intel:
   case nir_intrinsic_load_barycentric_optimize_amd:
   case nir_intrinsic_load_poly_line_smooth_enabled:
   case nir_intrinsic_load_rasterization_primitive_amd:
   case nir_intrinsic_unit_test_uniform_amd:
   case nir_intrinsic_load_global_constant_uniform_block_intel:
   case nir_intrinsic_load_debug_log_desc_amd:
   case nir_intrinsic_load_xfb_state_address_gfx12_amd:
   case nir_intrinsic_cmat_length:
   case nir_intrinsic_load_vs_primitive_stride_ir3:
   case nir_intrinsic_load_vs_vertex_stride_ir3:
   case nir_intrinsic_load_hs_patch_stride_ir3:
   case nir_intrinsic_load_tess_factor_base_ir3:
   case nir_intrinsic_load_tess_param_base_ir3:
   case nir_intrinsic_load_primitive_location_ir3:
   case nir_intrinsic_preamble_start_ir3:
   case nir_intrinsic_optimization_barrier_sgpr_amd:
   case nir_intrinsic_load_fbfetch_image_fmask_desc_amd:
   case nir_intrinsic_load_fbfetch_image_desc_amd:
   case nir_intrinsic_load_polygon_stipple_buffer_amd:
   case nir_intrinsic_load_printf_buffer_address:
   case nir_intrinsic_load_printf_buffer_size:
   case nir_intrinsic_load_printf_base_identifier:
   case nir_intrinsic_load_core_id_agx:
   case nir_intrinsic_load_samples_log2_agx:
   case nir_intrinsic_load_active_subgroup_count_agx:
   case nir_intrinsic_load_constant_base_ptr:
      is_divergent = false;
      break;

   /* This is divergent because it specifically loads sequential values into
    * successive SIMD lanes.
    */
   case nir_intrinsic_load_global_block_intel:
      is_divergent = true;
      break;

   case nir_intrinsic_decl_reg:
      is_divergent = nir_intrinsic_divergent(instr);
      break;

   /* Intrinsics with divergence depending on shader stage and hardware */
   case nir_intrinsic_load_shader_record_ptr:
      is_divergent = !(options & nir_divergence_shader_record_ptr_uniform);
      break;
   case nir_intrinsic_load_frag_shading_rate:
      is_divergent = !(options & nir_divergence_single_frag_shading_rate_per_subgroup);
      break;
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
      is_divergent = src_divergent(instr->src[0], state);

      if (stage == MESA_SHADER_FRAGMENT) {
         is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
      } else if (stage == MESA_SHADER_TESS_EVAL) {
         /* Patch input loads are uniform between vertices of the same primitive. */
         if (state->vertex_divergence)
            is_divergent = false;
         else
            is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
      } else {
         is_divergent = true;
      }
      break;
   case nir_intrinsic_load_attribute_pan:
      assert(stage == MESA_SHADER_VERTEX);
      is_divergent = src_divergent(instr->src[0], state) ||
                     src_divergent(instr->src[1], state) ||
                     src_divergent(instr->src[2], state);
      break;
   case nir_intrinsic_load_per_vertex_input:
      is_divergent = src_divergent(instr->src[0], state) ||
                     src_divergent(instr->src[1], state);
      if (stage == MESA_SHADER_TESS_CTRL)
         is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
      else if (stage == MESA_SHADER_TESS_EVAL)
         is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
      else
         is_divergent = true;
      break;
   case nir_intrinsic_load_input_vertex:
      is_divergent = src_divergent(instr->src[1], state);
      assert(stage == MESA_SHADER_FRAGMENT);
      is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
      break;
   case nir_intrinsic_load_output:
      is_divergent = src_divergent(instr->src[0], state);
      switch (stage) {
      case MESA_SHADER_TESS_CTRL:
         is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
         break;
      case MESA_SHADER_FRAGMENT:
         is_divergent = true;
         break;
      case MESA_SHADER_TASK:
      case MESA_SHADER_MESH:
         /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs).
          * Divergent if src[0] is, so nothing else to do.
          */
         break;
      default:
         unreachable("Invalid stage for load_output");
      }
      break;
   case nir_intrinsic_load_per_view_output:
      is_divergent = instr->src[0].ssa->divergent ||
                     instr->src[1].ssa->divergent ||
                     (stage == MESA_SHADER_TESS_CTRL &&
                      !(options & nir_divergence_single_patch_per_tcs_subgroup));
      break;
   case nir_intrinsic_load_per_vertex_output:
      /* TCS and NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
      assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_MESH);
      is_divergent = src_divergent(instr->src[0], state) ||
                     src_divergent(instr->src[1], state) ||
                     (stage == MESA_SHADER_TESS_CTRL &&
                      !(options & nir_divergence_single_patch_per_tcs_subgroup));
      break;
   case nir_intrinsic_load_per_primitive_output:
      /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
      assert(stage == MESA_SHADER_MESH);
      is_divergent = src_divergent(instr->src[0], state) ||
                     src_divergent(instr->src[1], state);
      break;
   case nir_intrinsic_load_layer_id:
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_front_face_fsign:
   case nir_intrinsic_load_back_face_agx:
      assert(stage == MESA_SHADER_FRAGMENT || state->shader->info.internal);
      is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
      break;
   case nir_intrinsic_load_view_index:
      assert(stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL);
      if (options & nir_divergence_view_index_uniform)
         is_divergent = false;
      else if (stage == MESA_SHADER_FRAGMENT)
         is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
      else
         is_divergent = true;
      break;
   case nir_intrinsic_load_fs_input_interp_deltas:
      assert(stage == MESA_SHADER_FRAGMENT);
      is_divergent = src_divergent(instr->src[0], state);
      is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
      break;
   case nir_intrinsic_load_instance_id:
      is_divergent = !state->vertex_divergence;
      break;
   case nir_intrinsic_load_primitive_id:
      if (stage == MESA_SHADER_FRAGMENT)
         is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
      else if (stage == MESA_SHADER_TESS_CTRL)
         is_divergent = !state->vertex_divergence &&
                        !(options & nir_divergence_single_patch_per_tcs_subgroup);
      else if (stage == MESA_SHADER_TESS_EVAL)
         is_divergent = !state->vertex_divergence &&
                        !(options & nir_divergence_single_patch_per_tes_subgroup);
      else if (stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_VERTEX)
         is_divergent = !state->vertex_divergence;
      else if (stage == MESA_SHADER_ANY_HIT ||
               stage == MESA_SHADER_CLOSEST_HIT ||
               stage == MESA_SHADER_INTERSECTION)
         is_divergent = true;
      else
         unreachable("Invalid stage for load_primitive_id");
      break;
   case nir_intrinsic_load_tess_level_inner:
   case nir_intrinsic_load_tess_level_outer:
      if (stage == MESA_SHADER_TESS_CTRL)
         is_divergent = !(options & nir_divergence_single_patch_per_tcs_subgroup);
      else if (stage == MESA_SHADER_TESS_EVAL)
         is_divergent = !(options & nir_divergence_single_patch_per_tes_subgroup);
      else
         unreachable("Invalid stage for load_tess_level_*");
      break;

   case nir_intrinsic_load_workgroup_index:
   case nir_intrinsic_load_workgroup_id:
      assert(gl_shader_stage_uses_workgroup(stage) || stage == MESA_SHADER_TESS_CTRL);
      if (stage == MESA_SHADER_COMPUTE)
         is_divergent |= (options & nir_divergence_multiple_workgroup_per_compute_subgroup);
      break;

   /* Clustered reductions are uniform if cluster_size == subgroup_size or
    * the source is uniform and the operation is invariant.
    * Inclusive scans are uniform if the source is uniform and the operation
    * is invariant.
    */
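   /* Illustrative example (GLSL-level): with a uniform source v,
    * subgroupInclusiveMax(v) yields v in every invocation, because a prefix
    * max over identical values is that value; a prefix add (iadd) would give
    * each invocation a different partial sum, which is why it is not in the
    * invariant-op list below.
    */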
   case nir_intrinsic_reduce:
      if (nir_intrinsic_cluster_size(instr) == 0) {
         /* Cluster size of 0 means the subgroup size.
          * This is uniform within a subgroup, but divergent between
          * vertices of the same primitive because they may be in
          * different subgroups.
          */
         is_divergent = state->vertex_divergence;
         break;
      }
      FALLTHROUGH;
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_inclusive_scan_clusters_ir3: {
      nir_op op = nir_intrinsic_reduction_op(instr);
      is_divergent = src_divergent(instr->src[0], state) ||
                     state->vertex_divergence;
      if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
          op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
          op != nir_op_iand && op != nir_op_ior)
         is_divergent = true;
      break;
   }

   case nir_intrinsic_reduce_clusters_ir3:
      /* This reduces the last invocations in all 8-wide clusters. It should
       * behave the same as reduce with cluster_size == subgroup_size.
       */
      is_divergent = state->vertex_divergence;
      break;

   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_ldc_nv:
   case nir_intrinsic_ldcx_nv:
      is_divergent = (src_divergent(instr->src[0], state) &&
                      (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
                     src_divergent(instr->src[1], state);
      break;

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
   case nir_intrinsic_load_uav_ir3:
      is_divergent = (src_divergent(instr->src[0], state) &&
                      (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
                     src_divergent(instr->src[1], state) ||
                     load_may_tear(state, instr);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared_ir3:
      is_divergent = src_divergent(instr->src[0], state) ||
                     (options & nir_divergence_uniform_load_tears);
      break;

   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_2x32:
   case nir_intrinsic_load_global_ir3:
   case nir_intrinsic_load_deref: {
      if (load_may_tear(state, instr)) {
         is_divergent = true;
         break;
      }

      unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
      for (unsigned i = 0; i < num_srcs; i++) {
         if (src_divergent(instr->src[i], state)) {
            is_divergent = true;
            break;
         }
      }
      break;
   }

   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_deref_buffer_array_length:
      is_divergent = src_divergent(instr->src[0], state) &&
                     (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
      break;

   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_image_deref_samples_identical:
   case nir_intrinsic_bindless_image_samples_identical:
   case nir_intrinsic_image_fragment_mask_load_amd:
   case nir_intrinsic_image_deref_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      is_divergent = (src_divergent(instr->src[0], state) &&
                      (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
                     src_divergent(instr->src[1], state) ||
                     load_may_tear(state, instr);
      break;

   case nir_intrinsic_image_texel_address:
   case nir_intrinsic_image_deref_texel_address:
   case nir_intrinsic_bindless_image_texel_address:
      is_divergent = (src_divergent(instr->src[0], state) &&
                      (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
                     src_divergent(instr->src[1], state) ||
                     src_divergent(instr->src[2], state);
      break;

   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   case nir_intrinsic_bindless_image_sparse_load:
      is_divergent = (src_divergent(instr->src[0], state) &&
                      (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
                     src_divergent(instr->src[1], state) ||
                     src_divergent(instr->src[2], state) ||
                     src_divergent(instr->src[3], state) ||
                     load_may_tear(state, instr);
      break;

   case nir_intrinsic_optimization_barrier_vgpr_amd:
      is_divergent = src_divergent(instr->src[0], state);
      break;

   /* Intrinsics with divergence depending on sources */
   case nir_intrinsic_convert_alu_types:
   case nir_intrinsic_ddx:
   case nir_intrinsic_ddx_fine:
   case nir_intrinsic_ddx_coarse:
   case nir_intrinsic_ddy:
   case nir_intrinsic_ddy_fine:
   case nir_intrinsic_ddy_coarse:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_rotate:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_shuffle_xor_uniform_ir3:
   case nir_intrinsic_shuffle_up_uniform_ir3:
   case nir_intrinsic_shuffle_down_uniform_ir3:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
   case nir_intrinsic_quad_vote_any:
   case nir_intrinsic_quad_vote_all:
   case nir_intrinsic_load_shared2_amd:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global_amd:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_task_payload:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_load_typed_buffer_amd:
   case nir_intrinsic_image_levels:
   case nir_intrinsic_image_deref_levels:
   case nir_intrinsic_bindless_image_levels:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_image_descriptor_amd:
   case nir_intrinsic_image_deref_descriptor_amd:
   case nir_intrinsic_bindless_image_descriptor_amd:
   case nir_intrinsic_strict_wqm_coord_amd:
   case nir_intrinsic_copy_deref:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_atomic_counter_read_deref:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_is_sparse_texels_resident:
   case nir_intrinsic_is_sparse_resident_zink:
   case nir_intrinsic_sparse_residency_code_and:
   case nir_intrinsic_bvh64_intersect_ray_amd:
   case nir_intrinsic_image_deref_load_param_intel:
   case nir_intrinsic_image_load_raw_intel:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_load_global_constant_bounded:
   case nir_intrinsic_load_global_constant_offset:
   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_constant_agx:
   case nir_intrinsic_load_reg_indirect:
   case nir_intrinsic_load_const_ir3:
   case nir_intrinsic_load_frag_size_ir3:
   case nir_intrinsic_load_frag_offset_ir3:
   case nir_intrinsic_bindless_resource_ir3:
   case nir_intrinsic_ray_intersection_ir3: {
      unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
      for (unsigned i = 0; i < num_srcs; i++) {
         if (src_divergent(instr->src[i], state)) {
            is_divergent = true;
            break;
         }
      }
      break;
   }

   case nir_intrinsic_resource_intel:
      /* Not having the non_uniform flag with divergent sources is undefined
       * behavior. The Intel driver defines it to pick the lowest numbered
       * live SIMD lane (via emit_uniformize).
       */
      if ((nir_intrinsic_resource_access_intel(instr) &
           nir_resource_intel_non_uniform) != 0) {
         unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
         for (unsigned i = 0; i < num_srcs; i++) {
            if (src_divergent(instr->src[i], state)) {
               is_divergent = true;
               break;
            }
         }
      }
      break;

   case nir_intrinsic_shuffle:
      is_divergent = src_divergent(instr->src[0], state) &&
                     src_divergent(instr->src[1], state);
      break;

   case nir_intrinsic_load_param:
      is_divergent =
         !state->impl->function->params[nir_intrinsic_param_idx(instr)].is_uniform;
      break;

   /* Intrinsics which are always divergent */
   case nir_intrinsic_inverse_ballot:
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1:
   case nir_intrinsic_load_sample_id:
   case nir_intrinsic_load_sample_id_no_per_sample:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_point_coord_maybe_flipped:
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_model:
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
   case nir_intrinsic_load_barycentric_at_offset_nv:
   case nir_intrinsic_load_barycentric_coord_pixel:
   case nir_intrinsic_load_barycentric_coord_centroid:
   case nir_intrinsic_load_barycentric_coord_sample:
   case nir_intrinsic_load_barycentric_coord_at_sample:
   case nir_intrinsic_load_barycentric_coord_at_offset:
   case nir_intrinsic_load_persp_center_rhw_ir3:
   case nir_intrinsic_interp_deref_at_offset:
   case nir_intrinsic_interp_deref_at_sample:
   case nir_intrinsic_interp_deref_at_centroid:
   case nir_intrinsic_interp_deref_at_vertex:
   case nir_intrinsic_load_tess_coord:
   case nir_intrinsic_load_tess_coord_xy:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_line_coord:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_frag_coord_zw:
   case nir_intrinsic_load_frag_coord_zw_pan:
   case nir_intrinsic_load_frag_coord_unscaled_ir3:
   case nir_intrinsic_load_pixel_coord:
   case nir_intrinsic_load_fully_covered:
   case nir_intrinsic_load_sample_pos:
   case nir_intrinsic_load_sample_pos_or_center:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_local_invocation_index:
   case nir_intrinsic_load_global_invocation_id:
   case nir_intrinsic_load_global_invocation_index:
   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_is_helper_invocation:
   case nir_intrinsic_load_scratch:
   case nir_intrinsic_deref_atomic:
   case nir_intrinsic_deref_atomic_swap:
   case nir_intrinsic_ssbo_atomic:
   case nir_intrinsic_ssbo_atomic_swap:
   case nir_intrinsic_ssbo_atomic_ir3:
   case nir_intrinsic_ssbo_atomic_swap_ir3:
   case nir_intrinsic_image_deref_atomic:
   case nir_intrinsic_image_deref_atomic_swap:
   case nir_intrinsic_image_atomic:
   case nir_intrinsic_image_atomic_swap:
   case nir_intrinsic_bindless_image_atomic:
   case nir_intrinsic_bindless_image_atomic_swap:
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
   case nir_intrinsic_task_payload_atomic:
   case nir_intrinsic_task_payload_atomic_swap:
   case nir_intrinsic_global_atomic:
   case nir_intrinsic_global_atomic_swap:
   case nir_intrinsic_global_atomic_amd:
   case nir_intrinsic_global_atomic_agx:
   case nir_intrinsic_global_atomic_swap_amd:
   case nir_intrinsic_global_atomic_swap_agx:
   case nir_intrinsic_global_atomic_2x32:
   case nir_intrinsic_global_atomic_swap_2x32:
   case nir_intrinsic_global_atomic_ir3:
   case nir_intrinsic_global_atomic_swap_ir3:
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_pre_dec:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_atomic_counter_comp_swap:
   case nir_intrinsic_atomic_counter_add_deref:
   case nir_intrinsic_atomic_counter_min_deref:
   case nir_intrinsic_atomic_counter_max_deref:
   case nir_intrinsic_atomic_counter_and_deref:
   case nir_intrinsic_atomic_counter_or_deref:
   case nir_intrinsic_atomic_counter_xor_deref:
   case nir_intrinsic_atomic_counter_inc_deref:
   case nir_intrinsic_atomic_counter_pre_dec_deref:
   case nir_intrinsic_atomic_counter_post_dec_deref:
   case nir_intrinsic_atomic_counter_exchange_deref:
   case nir_intrinsic_atomic_counter_comp_swap_deref:
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_exclusive_scan_clusters_ir3:
   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive:
   case nir_intrinsic_write_invocation_amd:
   case nir_intrinsic_mbcnt_amd:
   case nir_intrinsic_lane_permute_16_amd:
   case nir_intrinsic_dpp16_shift_amd:
   case nir_intrinsic_elect:
   case nir_intrinsic_elect_any_ir3:
   case nir_intrinsic_load_tlb_color_brcm:
   case nir_intrinsic_load_tess_rel_patch_id_amd:
   case nir_intrinsic_load_gs_vertex_offset_amd:
   case nir_intrinsic_is_subgroup_invocation_lt_amd:
   case nir_intrinsic_load_packed_passthrough_primitive_amd:
   case nir_intrinsic_load_initial_edgeflags_amd:
   case nir_intrinsic_gds_atomic_add_amd:
   case nir_intrinsic_load_rt_arg_scratch_offset_amd:
   case nir_intrinsic_load_intersection_opaque_amd:
   case nir_intrinsic_load_vector_arg_amd:
   case nir_intrinsic_load_btd_stack_id_intel:
   case nir_intrinsic_load_topology_id_intel:
   case nir_intrinsic_load_scratch_base_ptr:
   case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
   case nir_intrinsic_ordered_add_loop_gfx12_amd:
   case nir_intrinsic_xfb_counter_sub_gfx11_amd:
   case nir_intrinsic_unit_test_divergent_amd:
   case nir_intrinsic_load_stack:
   case nir_intrinsic_load_ray_launch_id:
   case nir_intrinsic_load_ray_instance_custom_index:
   case nir_intrinsic_load_ray_geometry_index:
   case nir_intrinsic_load_ray_world_direction:
   case nir_intrinsic_load_ray_world_origin:
   case nir_intrinsic_load_ray_object_origin:
   case nir_intrinsic_load_ray_object_direction:
   case nir_intrinsic_load_ray_t_min:
   case nir_intrinsic_load_ray_t_max:
   case nir_intrinsic_load_ray_object_to_world:
   case nir_intrinsic_load_ray_world_to_object:
   case nir_intrinsic_load_ray_hit_kind:
   case nir_intrinsic_load_ray_flags:
   case nir_intrinsic_load_cull_mask:
   case nir_intrinsic_load_sysval_nv:
   case nir_intrinsic_emit_vertex_nv:
   case nir_intrinsic_end_primitive_nv:
   case nir_intrinsic_report_ray_intersection:
   case nir_intrinsic_rq_proceed:
   case nir_intrinsic_rq_load:
   case nir_intrinsic_load_ray_triangle_vertex_positions:
   case nir_intrinsic_cmat_extract:
   case nir_intrinsic_cmat_muladd_amd:
   case nir_intrinsic_dpas_intel:
   case nir_intrinsic_isberd_nv:
   case nir_intrinsic_al2p_nv:
   case nir_intrinsic_ald_nv:
   case nir_intrinsic_ipa_nv:
   case nir_intrinsic_ldtram_nv:
   case nir_intrinsic_printf:
   case nir_intrinsic_load_gs_header_ir3:
   case nir_intrinsic_load_tcs_header_ir3:
   case nir_intrinsic_load_rel_patch_id_ir3:
   case nir_intrinsic_brcst_active_ir3:
   case nir_intrinsic_load_helper_op_id_agx:
   case nir_intrinsic_load_helper_arg_lo_agx:
   case nir_intrinsic_load_helper_arg_hi_agx:
   case nir_intrinsic_stack_map_agx:
   case nir_intrinsic_stack_unmap_agx:
   case nir_intrinsic_load_exported_agx:
   case nir_intrinsic_load_local_pixel_agx:
   case nir_intrinsic_load_coefficients_agx:
   case nir_intrinsic_load_active_subgroup_invocation_agx:
   case nir_intrinsic_load_sample_mask:
   case nir_intrinsic_quad_ballot_agx:
   case nir_intrinsic_load_agx:
      is_divergent = true;
      break;

   default:
#ifdef NDEBUG
      is_divergent = true;
      break;
#else
      nir_print_instr(&instr->instr, stderr);
      unreachable("\nNIR divergence analysis: Unhandled intrinsic.");
#endif
   }

   instr->def.divergent = is_divergent;
   return is_divergent;
}

static bool
visit_tex(nir_tex_instr *instr, struct divergence_state *state)
{
   if (instr->def.divergent)
      return false;

   bool is_divergent = false;

   for (unsigned i = 0; i < instr->num_srcs; i++) {
      switch (instr->src[i].src_type) {
      case nir_tex_src_sampler_deref:
      case nir_tex_src_sampler_handle:
      case nir_tex_src_sampler_offset:
         is_divergent |= src_divergent(instr->src[i].src, state) &&
                         instr->sampler_non_uniform;
         break;
      case nir_tex_src_texture_deref:
      case nir_tex_src_texture_handle:
      case nir_tex_src_texture_offset:
         is_divergent |= src_divergent(instr->src[i].src, state) &&
                         instr->texture_non_uniform;
         break;
      default:
         is_divergent |= src_divergent(instr->src[i].src, state);
         break;
      }
   }

   instr->def.divergent = is_divergent;
   return is_divergent;
}

static bool
visit_def(nir_def *def, struct divergence_state *state)
{
   return false;
}

static bool
nir_variable_mode_is_uniform(nir_variable_mode mode)
{
   switch (mode) {
   case nir_var_uniform:
   case nir_var_mem_ubo:
   case nir_var_mem_ssbo:
   case nir_var_mem_shared:
   case nir_var_mem_task_payload:
   case nir_var_mem_global:
   case nir_var_image:
      return true;
   default:
      return false;
   }
}

static bool
nir_variable_is_uniform(nir_shader *shader, nir_variable *var,
                        struct divergence_state *state)
{
   if (nir_variable_mode_is_uniform(var->data.mode))
      return true;

   /* Handle system value variables. */
   if (var->data.mode == nir_var_system_value) {
      /* Fake the instruction to reuse visit_intrinsic for all sysvals. */
      nir_intrinsic_instr fake_instr;

      memset(&fake_instr, 0, sizeof(fake_instr));
      fake_instr.intrinsic =
         nir_intrinsic_from_system_value(var->data.location);

      visit_intrinsic(&fake_instr, state);
      return !fake_instr.def.divergent;
   }

   nir_divergence_options options = state->options;
   gl_shader_stage stage = shader->info.stage;

   if (stage == MESA_SHADER_FRAGMENT &&
       (options & nir_divergence_single_prim_per_subgroup) &&
       var->data.mode == nir_var_shader_in &&
       var->data.interpolation == INTERP_MODE_FLAT)
      return true;

   if (stage == MESA_SHADER_TESS_CTRL &&
       (options & nir_divergence_single_patch_per_tcs_subgroup) &&
       var->data.mode == nir_var_shader_out && var->data.patch)
      return true;

   if (stage == MESA_SHADER_TESS_EVAL &&
       (options & nir_divergence_single_patch_per_tes_subgroup) &&
       var->data.mode == nir_var_shader_in && var->data.patch)
      return true;

   return false;
}

static bool
visit_deref(nir_shader *shader, nir_deref_instr *deref,
            struct divergence_state *state)
{
   if (deref->def.divergent)
      return false;

   bool is_divergent = false;
   switch (deref->deref_type) {
   case nir_deref_type_var:
      is_divergent = !nir_variable_is_uniform(shader, deref->var, state);
      break;
   case nir_deref_type_array:
   case nir_deref_type_ptr_as_array:
      is_divergent = src_divergent(deref->arr.index, state);
      FALLTHROUGH;
   case nir_deref_type_struct:
   case nir_deref_type_array_wildcard:
      is_divergent |= src_divergent(deref->parent, state);
      break;
   case nir_deref_type_cast:
      /* Cast derefs have no variable, so check the deref's modes instead. */
      is_divergent = !nir_variable_mode_is_uniform(deref->modes) ||
                     src_divergent(deref->parent, state);
      break;
   }

   deref->def.divergent = is_divergent;
   return is_divergent;
}

static bool
visit_jump(nir_jump_instr *jump, struct divergence_state *state)
{
   switch (jump->type) {
   case nir_jump_continue:
      if (state->divergent_loop_continue)
         return false;
      if (state->divergent_loop_cf)
         state->divergent_loop_continue = true;
      return state->divergent_loop_continue;
   case nir_jump_break:
      if (state->divergent_loop_break)
         return false;
      if (state->divergent_loop_cf)
         state->divergent_loop_break = true;
      return state->divergent_loop_break;
   case nir_jump_halt:
      /* This totally kills invocations so it doesn't add divergence */
      break;
   case nir_jump_return:
      unreachable("NIR divergence analysis: Unsupported return instruction.");
      break;
   case nir_jump_goto:
   case nir_jump_goto_if:
      unreachable("NIR divergence analysis: Unsupported goto_if instruction.");
      break;
   }
   return false;
}

static bool
set_ssa_def_not_divergent(nir_def *def, void *invariant)
{
   def->divergent = false;
   def->loop_invariant = *(bool *)invariant;
   return true;
}

static bool
instr_is_loop_invariant(nir_instr *instr, struct divergence_state *state)
{
   if (!state->loop)
      return false;

   switch (instr->type) {
   case nir_instr_type_load_const:
   case nir_instr_type_undef:
   case nir_instr_type_debug_info:
   case nir_instr_type_jump:
      return true;
   case nir_instr_type_intrinsic:
      if (!nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
         return false;
      FALLTHROUGH;
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_tex:
      return nir_foreach_src(instr, src_invariant, state->loop);
   case nir_instr_type_call:
      return false;
   case nir_instr_type_phi:
   case nir_instr_type_parallel_copy:
   default:
      unreachable("NIR divergence analysis: Unsupported instruction type.");
   }
}

static bool
update_instr_divergence(nir_instr *instr, struct divergence_state *state)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return visit_alu(nir_instr_as_alu(instr), state);
   case nir_instr_type_intrinsic:
      return visit_intrinsic(nir_instr_as_intrinsic(instr), state);
   case nir_instr_type_tex:
      return visit_tex(nir_instr_as_tex(instr), state);
   case nir_instr_type_load_const:
      return visit_def(&nir_instr_as_load_const(instr)->def, state);
   case nir_instr_type_undef:
      return visit_def(&nir_instr_as_undef(instr)->def, state);
   case nir_instr_type_deref:
      return visit_deref(state->shader, nir_instr_as_deref(instr), state);
   case nir_instr_type_debug_info:
      return false;
   case nir_instr_type_call:
      return false;
   case nir_instr_type_jump:
   case nir_instr_type_phi:
   case nir_instr_type_parallel_copy:
   default:
      unreachable("NIR divergence analysis: Unsupported instruction type.");
   }
}

static bool
visit_block(nir_block *block, struct divergence_state *state)
{
   bool has_changed = false;

   nir_foreach_instr(instr, block) {
      /* phis are handled when processing the branches */
      if (instr->type == nir_instr_type_phi)
         continue;

      if (state->first_visit) {
         bool invariant = state->loop_all_invariant || instr_is_loop_invariant(instr, state);
         nir_foreach_def(instr, set_ssa_def_not_divergent, &invariant);
      }

      if (instr->type == nir_instr_type_jump) {
         has_changed |= visit_jump(nir_instr_as_jump(instr), state);
      } else {
         has_changed |= update_instr_divergence(instr, state);
      }
   }

   bool divergent = state->divergent_loop_cf ||
                    state->divergent_loop_continue ||
                    state->divergent_loop_break;
   if (divergent != block->divergent) {
      block->divergent = divergent;
      has_changed = true;
   }

   return has_changed;
}

/* There are 3 types of phi instructions:
 * (1) gamma: represent the joining point of different paths
 *     created by an “if-then-else” branch.
 *     The resulting value is divergent if the branch condition
 *     or any of the source values is divergent. */
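/* Illustrative example (NIR-like pseudocode):
 *
 *    if (cond) { a = ...; } else { b = ...; }
 *    x = phi(a, b);   // divergent if cond, a, or b is divergent: invocations
 *                     // that took different paths merge different values
 */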
static bool
visit_if_merge_phi(nir_phi_instr *phi, bool if_cond_divergent, bool ignore_undef)
{
   if (phi->def.divergent)
      return false;

   unsigned defined_srcs = 0;
   nir_foreach_phi_src(src, phi) {
      /* if any source value is divergent, the resulting value is divergent */
      if (nir_src_is_divergent(&src->src)) {
         phi->def.divergent = true;
         return true;
      }
      if (src->src.ssa->parent_instr->type != nir_instr_type_undef) {
         defined_srcs++;
      }
   }

   if (!(ignore_undef && defined_srcs <= 1) && if_cond_divergent) {
      phi->def.divergent = true;
      return true;
   }

   return false;
}

/* There are 3 types of phi instructions:
 * (2) mu: which only exist at loop headers,
 *     merge initial and loop-carried values.
 *     The resulting value is divergent if any source value
 *     is divergent or a divergent loop continue condition
 *     is associated with a different ssa-def. */
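/* Illustrative example (NIR-like pseudocode):
 *
 *    loop {
 *       x = phi(init, a, b);   // a and b arrive via two continue paths:
 *       ...                    // if one of those continues is divergent
 *    }                         // and a != b, x is divergent
 */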
static bool
visit_loop_header_phi(nir_phi_instr *phi, nir_block *preheader, bool divergent_continue)
{
   if (phi->def.divergent)
      return false;

   nir_def *same = NULL;
   nir_foreach_phi_src(src, phi) {
      /* if any source value is divergent, the resulting value is divergent */
      if (nir_src_is_divergent(&src->src)) {
         phi->def.divergent = true;
         return true;
      }
      /* if this loop is uniform, we're done here */
      if (!divergent_continue)
         continue;
      /* skip the loop preheader */
      if (src->pred == preheader)
         continue;

      /* check if all loop-carried values are from the same ssa-def */
      if (!same)
         same = src->src.ssa;
      else if (same != src->src.ssa) {
         phi->def.divergent = true;
         return true;
      }
   }

   return false;
}

/* There are 3 types of phi instructions:
 * (3) eta: represent values that leave a loop.
 *     The resulting value is divergent if the source value is divergent
 *     or any loop exit condition is divergent for a value which is
 *     not loop-invariant (see nir_src_is_divergent()).
 */
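/* Illustrative example (NIR-like pseudocode):
 *
 *    loop {
 *       a = ...;
 *       if (divergent_cond) break;
 *    }
 *    x = phi(a);   // divergent unless `a` is loop-invariant: invocations
 *                  // exit on different iterations and may carry values of
 *                  // `a` from different iterations
 */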
static bool
visit_loop_exit_phi(nir_phi_instr *phi, nir_loop *loop)
{
   if (phi->def.divergent)
      return false;

   nir_def *same = NULL;
   nir_foreach_phi_src(src, phi) {
      /* If any loop exit condition is divergent and this value is not loop
       * invariant, or if the source value is divergent, then the resulting
       * value is divergent.
       */
      if ((loop->divergent_break && !src_invariant(&src->src, loop)) ||
          nir_src_is_divergent(&src->src)) {
         phi->def.divergent = true;
         return true;
      }

      /* if this loop is uniform, we're done here */
      if (!loop->divergent_break)
         continue;

      /* check if all loop-exit values are from the same ssa-def */
      if (!same)
         same = src->src.ssa;
      else if (same != src->src.ssa) {
         phi->def.divergent = true;
         return true;
      }
   }

   return false;
}

static bool
visit_if(nir_if *if_stmt, struct divergence_state *state)
{
   bool progress = false;
   bool cond_divergent = src_divergent(if_stmt->condition, state);

   struct divergence_state then_state = *state;
   then_state.divergent_loop_cf |= cond_divergent;
   progress |= visit_cf_list(&if_stmt->then_list, &then_state);

   struct divergence_state else_state = *state;
   else_state.divergent_loop_cf |= cond_divergent;
   progress |= visit_cf_list(&if_stmt->else_list, &else_state);

   /* handle phis after the IF */
   bool invariant = state->loop && src_invariant(&if_stmt->condition, state->loop);
   nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&if_stmt->cf_node)) {
      if (state->first_visit) {
         phi->def.divergent = false;
         phi->def.loop_invariant =
            invariant && nir_foreach_src(&phi->instr, src_invariant, state->loop);
      }
      bool ignore_undef = state->options & nir_divergence_ignore_undef_if_phi_srcs;
      progress |= visit_if_merge_phi(phi, cond_divergent, ignore_undef);
   }

   /* join loop divergence information from both branch legs */
   state->divergent_loop_continue |= then_state.divergent_loop_continue ||
                                     else_state.divergent_loop_continue;
   state->divergent_loop_break |= then_state.divergent_loop_break ||
                                  else_state.divergent_loop_break;

   /* A divergent continue makes succeeding loop CF divergent:
    * not all loop-active invocations participate in the remaining loop body,
    * which means that a following break might be taken only by some
    * invocations. */
   state->divergent_loop_cf |= state->divergent_loop_continue;

   state->consider_loop_invariance |= then_state.consider_loop_invariance ||
                                      else_state.consider_loop_invariance;

   return progress;
}

static bool
visit_loop(nir_loop *loop, struct divergence_state *state)
{
   assert(!nir_loop_has_continue_construct(loop));
   bool progress = false;
   nir_block *loop_header = nir_loop_first_block(loop);
   nir_block *loop_preheader = nir_block_cf_tree_prev(loop_header);

   /* handle loop header phis first: we have no knowledge yet about
    * the loop's control flow or any loop-carried sources. */
   nir_foreach_phi(phi, loop_header) {
      if (!state->first_visit && phi->def.divergent)
         continue;

      phi->def.loop_invariant = false;
      nir_foreach_phi_src(src, phi) {
         if (src->pred == loop_preheader) {
            phi->def.divergent = nir_src_is_divergent(&src->src);
            break;
         }
      }
      progress |= phi->def.divergent;
   }

   /* setup loop state */
   struct divergence_state loop_state = *state;
   loop_state.loop = loop;
   loop_state.loop_all_invariant = loop_header->predecessors->entries == 1;
   loop_state.divergent_loop_cf = false;
   loop_state.divergent_loop_continue = false;
   loop_state.divergent_loop_break = false;

   /* process loop body until no further changes are made */
   bool repeat;
   do {
      progress |= visit_cf_list(&loop->body, &loop_state);
      repeat = false;

      /* revisit loop header phis to see if something has changed */
      nir_foreach_phi(phi, loop_header) {
         repeat |= visit_loop_header_phi(phi, loop_preheader,
                                         loop_state.divergent_loop_continue);
      }

      loop_state.divergent_loop_cf = false;
      loop_state.first_visit = false;
   } while (repeat);

   loop->divergent_continue = loop_state.divergent_loop_continue;
   loop->divergent_break = loop_state.divergent_loop_break;

   /* handle phis after the loop */
   nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&loop->cf_node)) {
      if (state->first_visit) {
         phi->def.divergent = false;
         phi->def.loop_invariant = false;
      }
      progress |= visit_loop_exit_phi(phi, loop);
   }

   state->consider_loop_invariance |= loop_state.consider_loop_invariance ||
                                      loop->divergent_break;
   return progress;
}

static bool
visit_cf_list(struct exec_list *list, struct divergence_state *state)
{
   bool has_changed = false;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         has_changed |= visit_block(nir_cf_node_as_block(node), state);
         break;
      case nir_cf_node_if:
         has_changed |= visit_if(nir_cf_node_as_if(node), state);
         break;
      case nir_cf_node_loop:
         has_changed |= visit_loop(nir_cf_node_as_loop(node), state);
         break;
      case nir_cf_node_function:
         unreachable("NIR divergence analysis: Unsupported cf_node type.");
      }
   }

   return has_changed;
}

void
nir_divergence_analysis_impl(nir_function_impl *impl, nir_divergence_options options)
{
   nir_metadata_require(impl, nir_metadata_block_index);

   struct divergence_state state = {
      .stage = impl->function->shader->info.stage,
      .shader = impl->function->shader,
      .impl = impl,
      .options = options,
      .loop = NULL,
      .loop_all_invariant = false,
      .divergent_loop_cf = false,
      .divergent_loop_continue = false,
      .divergent_loop_break = false,
      .first_visit = true,
   };

   visit_cf_list(&impl->body, &state);

   nir_metadata_preserve(impl, nir_metadata_all);
}

void
nir_divergence_analysis(nir_shader *shader)
{
   shader->info.divergence_analysis_run = true;
   nir_foreach_function_impl(impl, shader) {
      nir_divergence_analysis_impl(impl,
                                   shader->options->divergence_analysis_options);
   }
}
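
/* Typical usage (illustrative; `some_pass` is a hypothetical placeholder):
 *
 *    NIR_PASS(_, nir, some_pass);    // may invalidate divergence info
 *    nir_divergence_analysis(nir);   // recompute it
 *    ...                             // then consult def->divergent
 */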

/* Compute divergence between vertices of the same primitive. This uses
 * the same divergent field in nir_def and nir_loop as the regular divergence
 * pass.
 */
void
nir_vertex_divergence_analysis(nir_shader *shader)
{
   shader->info.divergence_analysis_run = false;

   struct divergence_state state = {
      .stage = shader->info.stage,
      .shader = shader,
      .options = shader->options->divergence_analysis_options,
      .loop = NULL,
      .loop_all_invariant = false,
      .vertex_divergence = true,
      .first_visit = true,
   };

   nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_block_index);
      state.impl = impl;
      visit_cf_list(&impl->body, &state);
      nir_metadata_preserve(impl, nir_metadata_all);
   }
}

bool
nir_has_divergent_loop(nir_shader *shader)
{
   nir_function_impl *func = nir_shader_get_entrypoint(shader);

   foreach_list_typed(nir_cf_node, node, node, &func->body) {
      if (node->type == nir_cf_node_loop) {
         if (nir_cf_node_as_loop(node)->divergent_break)
            return true;
      }
   }

   return false;
}