1 /*
2 * Copyright © 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* Introduction
8 * ============
9 *
10 * This pass optimizes varyings between 2 shaders, which means dead input/
11 * output removal, constant and uniform load propagation, deduplication,
12 * compaction, and inter-shader code motion. This is used during the shader
13 * linking process.
14 *
15 *
16 * Notes on behavior
17 * =================
18 *
19 * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector
20 * varyings are not allowed.
21 *
22 * Indirectly-indexed varying slots (not vertices) are not optimized or
23 * compacted, but unused slots of indirectly-indexed varyings are still filled
24 * with directly-indexed varyings during compaction. Indirectly-indexed
25 * varyings are still removed if they are unused by the other shader.
26 *
27 * Indirectly-indexed vertices don't disallow optimizations, but compromises
28 * are made depending on how they are accessed. They are common in TCS, TES,
29 * and GS, so there is a desire to optimize them as much as possible. More on
30 * that in various sections below.
31 *
32 * Transform feedback doesn't prevent most optimizations such as constant
33 * propagation and compaction. Shaders can be left with output stores that set
34 * the no_varying flag, meaning the output is not consumed by the next shader,
35 * which means that optimizations did their job and now the output is only
36 * consumed by transform feedback.
37 *
38 * All legacy varying slots are optimized when it's allowed.
39 *
40 *
41 * Convergence property of shader outputs
42 * ======================================
43 *
44 * When an output stores an SSA that is convergent and all stores of that
45 * output appear in unconditional blocks or conditional blocks with
46 * a convergent entry condition and the shader is not GS, it implies that all
47 * vertices of that output have the same value, therefore the output can be
48 * promoted to flat because all interpolation modes lead to the same result
49 * as flat. Such outputs are opportunistically compacted with both flat and
50 * non-flat varyings based on whichever has unused slots in their vec4s. This
51 * pass refers to such inputs, outputs, and varyings as "convergent" (meaning
52 * all vertices are always equal).
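*
* Example (an illustrative sketch of the rule above): in the VS below, out0 is
* convergent because its value only sources a uniform and a constant and is
* stored in an unconditional block, while out1 varies per vertex:
* ```
* out0 = uniform0 * 2.0;   // convergent: equal for all vertices of a primitive
* out1 = input0 + 1.0;     // not convergent: varies per vertex
* ```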
53 *
54 * By default, flat varyings are the only ones that are not considered convergent
55 * because we want the flexibility to pack convergent varyings with both flat
56 * and non-flat varyings, and since flat varyings can contain integers and
57 * doubles, we can never interpolate them as FP32 or FP16. Optimizations start
58 * with separate interpolated, flat, and convergent groups of varyings, and
59 * they choose whether they want to promote convergent to interpolated or
60 * flat, or whether to leave that decision to the end when the compaction
61 * happens.
62 *
63 * The above default behavior doesn't apply when the hw supports loading
64 * convergent inputs as flat from interpolated vec4 slots (there is a NIR option).
65 *
66 * TES patch inputs are always convergent because they are uniform within
67 * a primitive.
68 *
69 *
70 * Optimization steps
71 * ==================
72 *
73 * 1. Determine which varying slots can be optimized and how.
74 *
75 * * When a varying is said to be "optimized" in the following text, it
76 * means all optimizations are performed, such as removal, constant
77 * propagation, and deduplication.
78 * * All VARn, PATCHn, and FOGC varyings are always optimized and
79 * compacted.
80 * * PRIMITIVE_ID is treated as VARn in (GS, FS).
81 * TEXn are removed if they are dead (except TEXn inputs, which can't be
82 * removed because they are affected by the coord replace state). TEXn
83 * also can't be optimized or compacted because of the coord replace
84 * state. TEXn not consumed by FS are treated as VARn.
85 * * COLn and BFCn only propagate constants if they are between 0 and 1
86 * because of the clamp vertex color state, and they are only
87 * deduplicated and compacted among themselves because they are affected
88 * by the flat shade, provoking vertex, two-side color selection, and
89 * clamp vertex color states. COLn and BFCn not consumed by FS are
90 * treated as VARn.
91 * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
92 * removed, but they are demoted to sysval-only outputs by setting
93 * the "no_varying" flag (i.e. they can be removed as varyings), so
94 * drivers should look at the "no_varying" flag. If an output is not
95 * a sysval output in a specific stage, it's treated as VARn. (such as
96 * POS in TCS)
97 * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
98 *
99 * 2. Remove unused inputs and outputs
100 *
101 * * Outputs not used in the next shader are removed.
102 * * Inputs not initialized by the previous shader are replaced with undef
103 * except:
104 * * LAYER and VIEWPORT are replaced with 0 in FS.
105 * * TEXn.xy is untouched because the coord replace state can set it, and
106 * TEXn.zw is replaced by (0, 1), which is equal to the coord replace
107 * value.
108 * * Output loads that have no output stores anywhere in the shader are
109 * replaced with undef. (for TCS, though it works with any shader)
110 * * Output stores with transform feedback are preserved, but get
111 * the "no_varying" flag, meaning they are not consumed by the next
112 * shader stage. Later, transform-feedback-only varyings are compacted
113 * (relocated) such that they are always last.
114 * * TCS outputs that are read by TCS, but not used by TES get
115 * the "no_varying" flag to indicate that they are only read by TCS and
116 * not consumed by TES. Later, such TCS outputs are compacted (relocated)
117 * such that they are always last to keep all outputs consumed by TES
118 * consecutive without holes.
119 *
120 * 3. Constant, uniform, UBO load, and uniform expression propagation
121 *
122 * Define "uniform expressions" as ALU expressions only sourcing
123 * constants, uniforms, and UBO loads (see the example after this list).
124 * * Constants, uniforms, UBO loads, and uniform expressions stored
125 * in outputs are moved into the next shader, and the outputs are removed.
126 * * The same propagation is done from output stores to output loads.
127 * (for TCS, though it works with any shader)
128 * * If there are multiple stores to the same output, all such stores
129 * should store the same constant, uniform, UBO load, or uniform
130 * expression for the expression to be propagated. If an output has
131 * multiple vertices, all vertices should store the same expression.
132 * * nir->options has callbacks that are used to estimate the cost of
133 * uniform expressions that drivers can set to control the complexity of
134 * uniform expressions that are propagated. This is to ensure that
135 * we don't increase the GPU overhead measurably by moving code across
136 * pipeline stages that amplify GPU work.
137 * * Special cases:
138 * * Constant COLn and BFCn are propagated only if the constants are
139 * in the [0, 1] range because of the clamp vertex color state.
140 * If both COLn and BFCn are written, they must write the same
141 * constant. If BFCn is written but not COLn, the constant is
142 * propagated from BFCn to COLn.
143 * TEXn.xy is untouched because of the coord replace state.
144 * If TEXn.zw is (0, 1), only those constants are propagated because
145 * they match the coord replace values.
146 * * CLIP_DISTn, LAYER and VIEWPORT are always propagated.
147 * Eliminated output stores get the "no_varying" flag if they are also
148 * xfb stores or write sysval outputs.
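*
* Example of what does and doesn't count as a uniform expression per
* the definition above (illustrative names):
* ```
* out0 = uniform0 * 0.5 + ubo0_value;  // only uniforms/UBO loads/constants:
*                                      // can be moved into the next shader
* out1 = input0 * 0.5;                 // sources an input: not propagated
* ```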
149 *
150 * 4. Remove duplicated output components
151 *
152 * * By comparing SSA defs.
153 * * If there are multiple stores to the same output, all such stores
154 * should store the same SSA as all stores of another output for
155 * the output to be considered duplicated. If an output has multiple
156 * vertices, all vertices should store the same SSA.
157 * * Deduplication can only be done between outputs of the same category.
158 * Those are: interpolated, patch, flat, interpolated color, flat color,
159 * and conditionally interpolated color based on the flat
160 * shade state.
161 * * Everything is deduplicated except TEXn due to the coord replace state.
162 * * Eliminated output stores get the "no_varying" flag if they are also
163 * xfb stores or write sysval outputs.
164 *
165 * 5. Backward inter-shader code motion
166 *
167 * "Backward" refers to moving code in the opposite direction that shaders
168 * are executed, i.e. moving code from the consumer to the producer.
169 *
170 * Fragment shader example:
171 * ```
172 * result = input0 * uniform + input1 * constant + UBO.variable;
173 * ```
174 *
175 * The computation of "result" in the above example can be moved into
176 * the previous shader and both inputs can be replaced with a new input
177 * holding the value of "result", thus making the shader smaller and
178 * possibly reducing the number of inputs, uniforms, and UBOs by 1.
179 *
180 * Such code motion can be performed for any expression sourcing only
181 * inputs, constants, and uniforms except for fragment shaders, which can
182 * also do it but with the following limitations:
183 * Only these transformations can be performed with interpolated inputs
184 * and any composition of these transformations (such as lerp), which can
185 * all be proven mathematically (see the sketch after this list):
186 * * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
187 * * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
188 * * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
189 * * all of these transformations are considered "inexact" in NIR
190 * * interp interpolates an input according to the barycentric
191 * coordinates (i, j), which are different for perspective,
192 * noperspective, center, centroid, sample, at_offset, and at_sample
193 * modes.
194 * * convergent_expr is any expression sourcing only constants,
195 * uniforms, and convergent inputs. The only requirement on
196 * convergent_expr is that it doesn't vary between vertices of
197 * the same primitive, but it can vary between primitives.
198 * * If inputs are flat or convergent, there are no limitations on
199 * expressions that can be moved.
200 * * Interpolated and flat inputs can't mix in the same expression, but
201 * convergent inputs can mix with both.
202 * * The interpolation qualifier of the new input is inherited from
203 * the removed non-convergent inputs that should all have the same (i, j).
204 * If there are no non-convergent inputs, then the new input is declared
205 * as flat (for simplicity; we can't choose the barycentric coordinates
206 * at random because AMD doesn't like when there are multiple sets of
207 * barycentric coordinates in the same shader unnecessarily).
208 * * Inf values break code motion across interpolation. See the section
209 * discussing how we handle it near the end.
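*
* Why the interp transformations above hold (a sketch, assuming the usual
* barycentric form with per-vertex values x0, x1, x2; the same reasoning
* applies whether (i, j) are perspective-correct or not):
* ```
* interp(x, i, j) = x0*i + x1*j + x2*(1 - i - j)
*
* interp(x, i, j) + interp(y, i, j)
*    = (x0 + y0)*i + (x1 + y1)*j + (x2 + y2)*(1 - i - j)
*    = interp(x + y, i, j)
*
* interp(x, i, j) + c = interp(x + c, i, j)   // since i + j + (1-i-j) = 1
* interp(x, i, j) * c = interp(x * c, i, j)   // c convergent in the primitive
* ```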
210 *
211 * The above rules also apply to open-coded TES input interpolation, which
212 * is handled the same as FS input interpolation. The only differences are:
213 * * Open-coded TES input interpolation must match one of the allowed
214 * equations. Different interpolation equations are treated the same as
215 * different interpolation qualifiers in FS.
216 * * Patch varyings are always treated as convergent.
217 *
218 * Prerequisites:
219 * * We need a post-dominator tree that is constructed from a graph where
220 * vertices are instructions and directed edges going into them are
221 * the values of their source operands. This is different from how NIR
222 * dominance works, which represents all instructions within a basic
223 * block as a linear chain of vertices in the graph.
224 * In our graph, all loads without source operands and all constants are
225 * entry nodes in the graph, and all stores and discards are exit nodes
226 * in the graph. Each shader can have multiple disjoint graphs where
227 * the Lowest Common Ancestor of 2 instructions doesn't exist.
228 * * Given the above definition, the instruction whose result is the best
229 * candidate for a new input is the farthest instruction that
230 * post-dominates one or more inputs and is movable between shaders.
231 *
232 * Algorithm Idea Part 1: Search
233 * * Pick any input load that is hypothetically movable and call it
234 * the iterator.
235 * * Get the immediate post-dominator of the iterator, and if it's movable,
236 * replace the iterator with it.
237 * * Repeat the previous step until the obtained immediate post-dominator
238 * is not movable.
239 * * The iterator now contains the farthest post-dominator that is movable.
240 * * Gather all input loads that the post-dominator consumes.
241 * * For each of those input loads, all matching output stores must be
242 * in the same block (because they will be replaced by a single store).
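*
* A minimal sketch of the Part 1 search loop (hypothetical helper names, not
* functions of this pass):
* ```
* nir_instr *iter = movable_input_load;
* for (;;) {
*    nir_instr *ipdom = get_immediate_post_dominator(iter);  // assumed helper
*    if (!ipdom || !is_movable_between_shaders(ipdom))       // assumed helper
*       break;
*    iter = ipdom;
* }
* // "iter" is now the farthest movable post-dominator of the input load
* ```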
243 *
244 * Algorithm Idea Part 2: Code Motion
245 * * Clone the post-dominator in the producer except input loads, which
246 * should be replaced by stored output values. Uniform and UBO loads,
247 * if any, should be cloned too.
248 * * Remove the original output stores.
249 * * Replace the post-dominator from the consumer with a new input load.
250 * * The step above makes the post-dominated input load that we picked
251 * at the beginning dead, but other input loads used by the post-
252 * dominator might still have other uses (shown in the example below).
253 *
254 * Example SSA-use graph - initial shader and the result:
255 * ```
256 * input0 input1 input0 input1
257 * \ / \ | \
258 * constant alu ... ======> | ...
259 * \ /
260 * alu
261 * (post-dominator)
262 * ```
263 *
264 * Description:
265 * On the right, the algorithm moved the constant and both ALU opcodes
266 * into the previous shader and input0 now contains the value of
267 * the post-dominator. input1 stays the same because it still has one
268 * use left. If input1 hadn't had the other use, it would have been
269 * removed.
270 *
271 * If the algorithm moves any code, the algorithm is repeated until there
272 * is no code that it can move.
273 *
274 * Which shader pairs are supported:
275 * * (VS, FS), (TES, FS): yes, fully
276 * * Limitation: If Infs must be preserved, no code is moved across
277 * interpolation, so only flat varyings are optimized.
278 * * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO
279 * * Current behavior:
280 * * Per-vertex inputs are rejected.
281 * * Possible solution:
282 * * All input loads used by an accepted post-dominator must use
283 * the same vertex index. The post-dominator must use all loads with
284 * that vertex index.
285 * * If a post-dominator is found for an input load from a specific
286 * slot, all other input loads from that slot must also have
287 * an accepted post-dominator, and all such post-dominators should
288 * be identical expressions.
289 * * (TCS, TES), (VS, TES): yes, with limitations
290 * * Limitations:
291 * * Only 1 store and 1 load per slot allowed.
292 * * No output loads allowed.
293 * * All stores used by an accepted post-dominator must be in
294 * the same block.
295 * * TCS barriers don't matter because there are no output loads.
296 * * Patch varyings are handled trivially with the above constraints.
297 * * Per-vertex outputs should only be indexed by gl_InvocationID.
298 * * An interpolated TES load is any ALU instruction that computes
299 * the result of linear interpolation of per-vertex inputs from
300 * the same slot using gl_TessCoord. If such an ALU instruction is
301 * found, it must be the only one, and all per-vertex input loads
302 * from that slot must feed into it. The interpolation equation must
303 * be equal to one of the allowed equations. Then the same rules as
304 * for interpolated FS inputs are used, treating different
305 * interpolation equations just like different interpolation
306 * qualifiers. (An example equation is shown after this list.)
307 * * Patch inputs are treated as convergent, which means they are
308 * allowed to be in the same movable expression as interpolated TES
309 * inputs, and the same rules as for convergent FS inputs apply.
310 * * (GS, FS), (MS, FS): no
311 * * Workaround: Add a passthrough VS between GS/MS and FS, run
312 * the pass on the (VS, FS) pair to move code out of FS,
313 * and inline that VS at the end of your hw-specific
314 * GS/MS if it's possible.
315 * * (TS, MS): no
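*
* Example of an open-coded TES interpolation equation that the (TCS, TES)
* rules above accept (triangle domain, illustrative):
* ```
* value = input[0] * gl_TessCoord.x + input[1] * gl_TessCoord.y +
*         input[2] * gl_TessCoord.z;
* ```
* All per-vertex loads from the slot feed this single expression, so it's
* treated like an interpolated FS input.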
316 *
317 * The disadvantage of using the post-dominator tree is that it's a tree,
318 * which means there is only 1 post-dominator of each input. This example
319 * shows a case that could be optimized by replacing 3 inputs with 2 inputs,
320 * reducing the number of inputs by 1, but the immediate post-dominator of
321 * all input loads is NULL:
322 * ```
323 * temp0 = input0 + input1 + input2;
324 * temp1 = input0 + input1 * const1 + input2 * const2;
325 * ```
326 *
327 * If there is a graph algorithm that returns the best solution to
328 * the above case (which is temp0 and temp1 to replace all 3 inputs), let
329 * us know.
330 *
331 * 6. Forward inter-shader code motion
332 *
333 * TODO: Not implemented. The text below is a draft of the description.
334 *
335 * "Forward" refers to moving code in the direction that shaders are
336 * executed, i.e. moving code from the producer to the consumer.
337 *
338 * Vertex shader example:
339 * ```
340 * output0 = value + 1;
341 * output1 = value * 2;
342 * ```
343 *
344 * Both outputs can be replaced by 1 output storing "value", and both ALU
345 * operations can be moved into the next shader.
346 *
347 * The same dominance algorithm as in the previous optimization is used,
348 * except that:
349 * * Instead of inputs, we use outputs.
350 * * Instead of a post-dominator tree, we use a dominator tree of the exact
351 * same graph.
352 *
353 * The algorithm idea is: For each pair of output stores, find their
354 * Lowest Common Ancestor in the dominator tree, and that's a candidate
355 * for a new output. All movable loads like load_const should be removed
356 * from the graph, otherwise the LCA wouldn't exist.
357 *
358 * The limitations on instructions that can be moved between shaders across
359 * interpolated loads are exactly the same as the previous optimization.
360 *
361 * nir->options has callbacks that are used to estimate the cost of
362 * expressions that drivers can set to control the complexity of
363 * expressions that can be moved to later shaders. This is to ensure that
364 * we don't increase the GPU overhead measurably by moving code across
365 * pipeline stages that amplify GPU work.
366 *
367 * 7. Compaction to vec4 slots (AKA packing)
368 *
369 * First, varyings are divided into these groups, and components from each
370 * group are assigned locations in this order (effectively forcing
371 * components from the same group to be in the same vec4 slot or adjacent
372 * vec4 slots) with some exceptions listed below:
373 *
374 * Non-FS groups (patch and non-patch are packed separately):
375 * * 32-bit cross-invocation (TCS inputs using cross-invocation access)
376 * * 16-bit cross-invocation (TCS inputs using cross-invocation access)
377 * * 32-bit flat
378 * * 16-bit flat
379 * * 32-bit no-varying (TCS outputs read by TCS but not TES)
380 * * 16-bit no-varying (TCS outputs read by TCS but not TES)
381 *
382 * FS groups:
383 * * 32-bit interpolated (always FP32)
384 * * 32-bit flat
385 * * 32-bit convergent (always FP32)
386 * * 16-bit interpolated (always FP16)
387 * * 16-bit flat
388 * * 16-bit convergent (always FP16)
389 * * 32-bit transform feedback only
390 * * 16-bit transform feedback only
391 *
392 * When the driver/hw can't mix different interpolation qualifiers
393 * in the same vec4, the interpolated groups are further split into 6
394 * groups, one for each qualifier.
395 *
396 * Then, all scalar varyings are relocated into new slots, starting from
397 * VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit
398 * increments. Rules:
399 * * Both 32-bit and 16-bit flat varyings are packed in the same vec4.
400 * * Convergent varyings can be packed with interpolated varyings of
401 * the same type or flat. The group to pack with is chosen based on
402 * whichever has unused scalar slots because we want to reduce the total
403 * number of vec4s. After filling all unused scalar slots, the remaining
404 * convergent varyings are packed as flat.
405 * * Transform-feedback-only slots and no-varying slots are packed last,
406 * so that they are consecutive and not intermixed with varyings consumed
407 * by the next shader stage, and 32-bit and 16-bit slots are packed in
408 * the same vec4. This allows reducing memory for outputs by ignoring
409 * the trailing outputs that the next shader stage doesn't read.
410 *
411 * In the end, we should end up with these groups for FS:
412 * * 32-bit interpolated (always FP32) on separate vec4s
413 * * 16-bit interpolated (always FP16) on separate vec4s
414 * * 32-bit flat and 16-bit flat, mixed in the same vec4
415 * * 32-bit and 16-bit transform feedback only, sharing vec4s with flat
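*
* Illustrative example of a possible final FS layout (made-up varying names;
* "a | b" means two 16-bit varyings sharing one 32-bit channel):
* ```
* VAR0 = {interp32_a, interp32_b, interp32_c, convergent32_d}
* VAR1 = {flat32_e, flat16_f | flat16_g, flat32_h, xfb32_only_i}
* ```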
416 *
417 * Colors are compacted the same but separately because they can't be mixed
418 * with VARn. Colors are divided into 3 FS groups. They are:
419 * * 32-bit maybe-interpolated (affected by the flat-shade state)
420 * * 32-bit interpolated (not affected by the flat-shade state)
421 * * 32-bit flat (not affected by the flat-shade state)
422 *
423 * To facilitate driver-specific output merging, color channels are
424 * assigned in a rotated order depending on which one the first unused VARn
425 * channel is. For example, if the first unused VARn channel is VAR0.z,
426 * color channels are allocated in this order:
427 * COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
428 * The reason is that some drivers merge outputs if each output sets
429 * different components, for example 2 outputs defining VAR0.xy and COL0.z.
430 * If drivers do interpolation in the fragment shader and color
431 * interpolation can differ for each component, VAR0.xy and COL0.z can be
432 * stored in the same output storage slot, and the consumer can load VAR0
433 * and COL0 from the same slot.
434 *
435 * If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to
436 * VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to
437 * VARn for better packing.
438 *
439 *
440 * Issue: Interpolation converts Infs to NaNs
441 * ==========================================
442 *
443 * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which
444 * impacts and limits backward inter-shader code motion, uniform expression
445 * propagation, and compaction.
446 *
447 * When we decide not to interpolate a varying, we need to convert Infs to
448 * NaNs manually. Infs can be converted to NaNs like this: x*0 + x
449 * (suggested by Ian Romanick, the multiplication must be "exact")
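*
* Why "x*0 + x" works (IEEE arithmetic):
* ```
* finite x:  x*0 + x = 0 + x   = x     // the value is preserved
* x = +-Inf: x*0 + x = NaN + x = NaN   // Inf becomes NaN, like interpolation
* x = NaN:   x*0 + x = NaN             // NaN stays NaN
* ```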
450 *
451 * Changes to optimizations:
452 * - When we propagate a uniform expression and NaNs must be preserved,
453 * convert Infs in the result to NaNs using "x*0 + x" in the consumer.
454 * - When we change interpolation to flat for convergent varyings and NaNs
455 * must be preserved, apply "x*0 + x" to the stored output value
456 * in the producer.
457 * - There is no solution for backward inter-shader code motion with
458 * interpolation if Infs must be preserved. As an alternative, we can allow
459 * code motion across interpolation only for specific shader hashes in
460 * can_move_alu_across_interp. We can use shader-db to automatically produce
461 * a list of shader hashes that benefit from this optimization.
462 *
463 *
464 * Usage
465 * =====
466 *
467 * Requirements:
468 * - ALUs should be scalarized
469 * - Dot products and other vector opcodes should be lowered (recommended)
470 * - Input loads and output stores should be scalarized
471 * - 64-bit varyings should be lowered to 32 bits
472 * - nir_vertex_divergence_analysis must be called on the producer if
473 * the consumer is a fragment shader
474 *
475 * It's recommended to first run this for all shader pairs from the first
476 * shader to the last shader (to propagate constants etc.). If the optimization
477 * of a (S1, S2) pair leads to changes in S1, remember the highest such S1. Then
478 * re-run this for all shader pairs in descending order from S1 down to VS.
479 *
480 * NIR optimizations should be performed after every run that changes the IR.
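*
* A sketch of the recommended loop (pseudo-code; "run_opt_varyings" stands in
* for however the driver invokes this pass on one producer/consumer pair):
* ```
* // forward: first stage to last, propagates constants etc. downstream;
* // remember the highest producer stage S1 that changed
* for (s = first; s < last; s++)
*    run_opt_varyings(shader[s], shader[s + 1]);
* // backward: re-run pairs in descending order, ending at the VS
* for (s = S1; s > first; s--)
*    run_opt_varyings(shader[s - 1], shader[s]);
* // run regular NIR optimizations on every shader whose IR changed
* ```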
481 *
482 *
483 * Analyzing the optimization potential of linking separate shaders
484 * ================================================================
485 *
486 * We can use this pass in an analysis pass that decides whether a separate
487 * shader has the potential to benefit from full draw-time linking. The way
488 * it would work is that we would create a passthrough shader adjacent to
489 * the separate shader, run this pass on both shaders, and check if the number
490 * of varyings decreased. This way we can decide to perform the draw-time
491 * linking only if we are confident that it would help performance.
492 *
493 * TODO: not implemented, mention the pass that implements it
494 */
495
496 #include "nir.h"
497 #include "nir_builder.h"
498 #include "util/hash_table.h"
499 #include "util/u_math.h"
500 #include "util/u_memory.h"
501
502 /* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
503 *
504 * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
505 * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels
506 * don't set these slots as used in bitmasks.
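*
* Example (matching get_scalar_16bit_slot() below): the high 16-bit half of
* component 2 (".z") of VARYING_SLOT_VAR0 maps to scalar slot
* VARYING_SLOT_VAR0 * 8 + 2 * 2 + 1.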
507 */
508 #define NUM_SCALAR_SLOTS (NUM_TOTAL_VARYING_SLOTS * 8)
509
510 /* Fragment shader input slots can be packed with indirectly-indexed vec4
511 * slots if there are unused components, but only if the vec4 slot has
512 * the same interpolation type. There are only 3 types: FLAT, FP32, FP16.
513 */
514 enum fs_vec4_type {
515 FS_VEC4_TYPE_NONE = 0,
516 FS_VEC4_TYPE_FLAT,
517 FS_VEC4_TYPE_INTERP_EXPLICIT,
518 FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
519 FS_VEC4_TYPE_PER_PRIMITIVE,
520 /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
521 FS_VEC4_TYPE_INTERP_FP32,
522 FS_VEC4_TYPE_INTERP_FP16,
523 FS_VEC4_TYPE_INTERP_COLOR, /* only for glShadeModel, i.e. INTERP_MODE_NONE */
524 /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */
525 FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL,
526 FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID,
527 FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE,
528 FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL,
529 FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID,
530 FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE,
531 FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL,
532 FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID,
533 FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE,
534 FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL,
535 FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID,
536 FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE,
537 FS_VEC4_TYPE_INTERP_COLOR_PIXEL, /* only for glShadeModel, i.e. INTERP_MODE_NONE */
538 FS_VEC4_TYPE_INTERP_COLOR_CENTROID, /* same */
539 FS_VEC4_TYPE_INTERP_COLOR_SAMPLE, /* same */
540 };
541
542 enum {
543 PERSP_PIXEL,
544 PERSP_CENTROID,
545 PERSP_SAMPLE,
546 LINEAR_PIXEL,
547 LINEAR_CENTROID,
548 LINEAR_SAMPLE,
549 NUM_INTERP_QUALIFIERS,
550 };
551
552 enum {
553 COLOR_PIXEL,
554 COLOR_CENTROID,
555 COLOR_SAMPLE,
556 NUM_COLOR_QUALIFIERS,
557 };
558
559 #if PRINT_RELOCATE_SLOT
560 static const char *fs_vec4_type_strings[] = {
561 "NONE",
562 "FLAT",
563 "INTERP_EXPLICIT",
564 "INTERP_EXPLICIT_STRICT",
565 "PER_PRIMITIVE",
566 "INTERP_FP32",
567 "INTERP_FP16",
568 "INTERP_COLOR",
569 "INTERP_FP32_PERSP_PIXEL",
570 "INTERP_FP32_PERSP_CENTROID",
571 "INTERP_FP32_PERSP_SAMPLE",
572 "INTERP_FP32_LINEAR_PIXEL",
573 "INTERP_FP32_LINEAR_CENTROID",
574 "INTERP_FP32_LINEAR_SAMPLE",
575 "INTERP_FP16_PERSP_PIXEL",
576 "INTERP_FP16_PERSP_CENTROID",
577 "INTERP_FP16_PERSP_SAMPLE",
578 "INTERP_FP16_LINEAR_PIXEL",
579 "INTERP_FP16_LINEAR_CENTROID",
580 "INTERP_FP16_LINEAR_SAMPLE",
581 "INTERP_COLOR_PIXEL",
582 "INTERP_COLOR_CENTROID",
583 "INTERP_COLOR_SAMPLE",
584 };
585 #endif // PRINT_RELOCATE_SLOT
586
587 typedef BITSET_WORD INTERP_QUAL_BITSET[NUM_INTERP_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)];
588 typedef BITSET_WORD COLOR_QUAL_BITSET[NUM_COLOR_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)];
589
590 static unsigned
591 get_scalar_16bit_slot(nir_io_semantics sem, unsigned component)
592 {
593 return sem.location * 8 + component * 2 + sem.high_16bits;
594 }
595
596 static unsigned
597 intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr)
598 {
599 return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr),
600 nir_intrinsic_component(intr));
601 }
602
603 static unsigned
604 vec4_slot(unsigned scalar_slot)
605 {
606 return scalar_slot / 8;
607 }
608
609 struct list_node {
610 struct list_head head;
611 nir_intrinsic_instr *instr;
612 };
613
614 /* Information about 1 scalar varying slot for both shader stages. */
615 struct scalar_slot {
616 struct {
617 /* Linked list of all store instructions writing into the scalar slot
618 * in the producer.
619 */
620 struct list_head stores;
621
622 /* Only for TCS: Linked list of all load instructions reading the scalar
623 * slot in the producer.
624 */
625 struct list_head loads;
626
627 /* If there is only one store instruction or if all store instructions
628 * store the same value in the producer, this is the instruction
629 * computing the stored value. Used by constant and uniform propagation
630 * to the next shader.
631 */
632 nir_instr *value;
633 } producer;
634
635 struct {
636 /* Linked list of all load instructions loading from the scalar slot
637 * in the consumer.
638 */
639 struct list_head loads;
640
641 /* The result of TES input interpolation. */
642 nir_alu_instr *tes_interp_load;
643 unsigned tes_interp_mode; /* FLAG_INTERP_TES_* */
644 nir_def *tes_load_tess_coord;
645 } consumer;
646
647 /* The number of accessed slots if this slot has indirect indexing. */
648 unsigned num_slots;
649 };
650
651 struct linkage_info {
652 struct scalar_slot slot[NUM_SCALAR_SLOTS];
653
654 bool spirv;
655 bool can_move_uniforms;
656 bool can_move_ubos;
657 bool can_mix_convergent_flat_with_interpolated;
658 bool has_flexible_interp;
659 bool always_interpolate_convergent_fs_inputs;
660
661 gl_shader_stage producer_stage;
662 gl_shader_stage consumer_stage;
663 nir_builder producer_builder;
664 nir_builder consumer_builder;
665 unsigned max_varying_expression_cost;
666 unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr);
667
668 /* Memory context for linear_alloc_child (fast allocation). */
669 void *linear_mem_ctx;
670
671 /* Hash table for efficiently cloning instructions between shaders. */
672 struct hash_table *clones_ht;
673
674 /* If any component of a vec4 slot is accessed indirectly, this is its
675 * FS vec4 qualifier type, which is either FLAT, FP32, or FP16.
676 * Components with different qualifier types can't be compacted
677 * in the same vec4.
678 */
679 uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS];
680
681 /* Mask of all varyings that can be removed. Only a few non-VARn non-PATCHn
682 * varyings can't be removed.
683 */
684 BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS);
685
686 /* Mask of all slots that have transform feedback info. */
687 BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS);
688
689 /* Mask of all slots that have transform feedback info, but are not used
690 * by the next shader. Separate masks for 32-bit and 16-bit outputs.
691 */
692 BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
693 BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
694
695 /* Mask of all TCS inputs using cross-invocation access. */
696 BITSET_DECLARE(tcs_cross_invoc32_mask, NUM_SCALAR_SLOTS);
697 BITSET_DECLARE(tcs_cross_invoc16_mask, NUM_SCALAR_SLOTS);
698
699 /* Mask of all TCS->TES slots that are read by TCS, but not TES. */
700 BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
701 BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS);
702
703 /* Mask of all slots accessed with indirect indexing. */
704 BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS);
705
706 /* The following masks only contain slots that can be compacted and
707 * describe the groups in which they should be compacted. Non-fragment
708 * shaders only use the flat bitmasks.
709 *
710 * Some legacy varyings are excluded when they can't be compacted due to
711 * being affected by pipeline states (like coord replace). That only
712 * applies to xx->FS shader pairs. Other shader pairs get all legacy
713 * varyings compacted and relocated to VARn.
714 *
715 * Indirectly-indexed varyings are also excluded because they are not
716 * compacted.
717 */
718 BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS);
719 BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS);
720 BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS);
721 BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS);
722 BITSET_DECLARE(interp_explicit32_mask, NUM_SCALAR_SLOTS);
723 BITSET_DECLARE(interp_explicit16_mask, NUM_SCALAR_SLOTS);
724 BITSET_DECLARE(interp_explicit_strict32_mask, NUM_SCALAR_SLOTS);
725 BITSET_DECLARE(interp_explicit_strict16_mask, NUM_SCALAR_SLOTS);
726 BITSET_DECLARE(per_primitive32_mask, NUM_SCALAR_SLOTS);
727 BITSET_DECLARE(per_primitive16_mask, NUM_SCALAR_SLOTS);
728
729 /* Color interpolation unqualified (follows the flat-shade state). */
730 BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS);
731
732 /* A separate bitmask for each qualifier when
733 * nir_io_has_flexible_input_interpolation_except_flat is not set.
734 */
735 INTERP_QUAL_BITSET interp_fp32_qual_masks;
736 INTERP_QUAL_BITSET interp_fp16_qual_masks;
737 COLOR_QUAL_BITSET color32_qual_masks;
738
739 /* Mask of output components that have only one store instruction, or if
740 * they have multiple store instructions, all those instructions store
741 * the same value. If the output has multiple vertices, all vertices store
742 * the same value. This is a useful property for:
743 * - constant and uniform propagation to the next shader
744 * - deduplicating outputs
745 */
746 BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
747
748 /* Mask of output components that store values that are convergent,
749 * i.e. all values stored into the outputs are equal within a primitive.
750 *
751 * This is different from output_equal_mask, which says that all stores
752 * to the same slot in the same thread are equal, while this says that
753 * each store to the same slot can be different, but it always stores
754 * a convergent value, which means the stored value is equal among all
755 * threads within a primitive.
756 *
757 * The advantage is that these varyings can always be promoted to flat
758 * regardless of the original interpolation mode, and they can always be
759 * compacted with both interpolated and flat varyings.
760 */
761 BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
762 BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
763 };
764
765 /******************************************************************
766 * HELPERS
767 ******************************************************************/
768
769 /* Return whether the low or high 16-bit half of the 32-bit slot is set. */
770 #define BITSET_TEST32(m, b) \
771 (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1))
772
773 #define BITSET3_TEST_ANY(bitsets, b) (BITSET_TEST((bitsets)[0], (b)) || \
774 BITSET_TEST((bitsets)[1], (b)) || \
775 BITSET_TEST((bitsets)[2], (b)))
776 #define BITSET6_TEST_ANY(bitsets, b) (BITSET3_TEST_ANY((bitsets), (b)) || \
777 BITSET3_TEST_ANY(&(bitsets)[3], (b)))
778
779 static void
780 print_linkage(struct linkage_info *linkage)
781 {
782 printf("Linkage: %s -> %s\n",
783 _mesa_shader_stage_to_abbrev(linkage->producer_stage),
784 _mesa_shader_stage_to_abbrev(linkage->consumer_stage));
785
786 for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) {
787 struct scalar_slot *slot = &linkage->slot[i];
788
789 if (!slot->num_slots &&
790 list_is_empty(&slot->producer.stores) &&
791 list_is_empty(&slot->producer.loads) &&
792 list_is_empty(&slot->consumer.loads) &&
793 !BITSET_TEST(linkage->removable_mask, i) &&
794 !BITSET_TEST(linkage->indirect_mask, i) &&
795 !BITSET_TEST(linkage->xfb32_only_mask, i) &&
796 !BITSET_TEST(linkage->xfb16_only_mask, i) &&
797 !BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) &&
798 !BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) &&
799 !BITSET_TEST(linkage->no_varying32_mask, i) &&
800 !BITSET_TEST(linkage->no_varying16_mask, i) &&
801 !BITSET_TEST(linkage->interp_fp32_mask, i) &&
802 !BITSET_TEST(linkage->interp_fp16_mask, i) &&
803 !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) &&
804 !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i) &&
805 !BITSET_TEST(linkage->color32_mask, i) &&
806 !BITSET3_TEST_ANY(linkage->color32_qual_masks, i) &&
807 !BITSET_TEST(linkage->flat32_mask, i) &&
808 !BITSET_TEST(linkage->flat16_mask, i) &&
809 !BITSET_TEST(linkage->interp_explicit32_mask, i) &&
810 !BITSET_TEST(linkage->interp_explicit16_mask, i) &&
811 !BITSET_TEST(linkage->interp_explicit_strict32_mask, i) &&
812 !BITSET_TEST(linkage->interp_explicit_strict16_mask, i) &&
813 !BITSET_TEST(linkage->per_primitive32_mask, i) &&
814 !BITSET_TEST(linkage->per_primitive16_mask, i) &&
815 !BITSET_TEST(linkage->convergent32_mask, i) &&
816 !BITSET_TEST(linkage->convergent16_mask, i) &&
817 !BITSET_TEST(linkage->output_equal_mask, i))
818 continue;
819
820 printf(" %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
821 gl_varying_slot_name_for_stage(vec4_slot(i),
822 linkage->producer_stage) + 13,
823 "xyzw"[(i / 2) % 4],
824 i % 2 ? "hi" : "lo",
825 slot->num_slots,
826 BITSET_TEST(linkage->removable_mask, i) ? " removable" : "",
827 BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
828 BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
829 BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
830 BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) ? " tcs_cross_invoc32" : "",
831 BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) ? " tcs_cross_invoc16" : "",
832 BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
833 BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
834 BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
835 BITSET_TEST(linkage->interp_fp32_qual_masks[0], i) ? " interp_fp32_persp_pixel" : "",
836 BITSET_TEST(linkage->interp_fp32_qual_masks[1], i) ? " interp_fp32_persp_centroid" : "",
837 BITSET_TEST(linkage->interp_fp32_qual_masks[2], i) ? " interp_fp32_persp_sample" : "",
838 BITSET_TEST(linkage->interp_fp32_qual_masks[3], i) ? " interp_fp32_linear_pixel" : "",
839 BITSET_TEST(linkage->interp_fp32_qual_masks[4], i) ? " interp_fp32_linear_centroid" : "",
840 BITSET_TEST(linkage->interp_fp32_qual_masks[5], i) ? " interp_fp32_linear_sample" : "",
841 BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "",
842 BITSET_TEST(linkage->interp_fp16_qual_masks[0], i) ? " interp_fp16_persp_pixel" : "",
843 BITSET_TEST(linkage->interp_fp16_qual_masks[1], i) ? " interp_fp16_persp_centroid" : "",
844 BITSET_TEST(linkage->interp_fp16_qual_masks[2], i) ? " interp_fp16_persp_sample" : "",
845 BITSET_TEST(linkage->interp_fp16_qual_masks[3], i) ? " interp_fp16_linear_pixel" : "",
846 BITSET_TEST(linkage->interp_fp16_qual_masks[4], i) ? " interp_fp16_linear_centroid" : "",
847 BITSET_TEST(linkage->interp_fp16_qual_masks[5], i) ? " interp_fp16_linear_sample" : "",
848 BITSET_TEST(linkage->color32_mask, i) ? " color32" : "",
849 BITSET_TEST(linkage->color32_qual_masks[0], i) ? " color32_pixel" : "",
850 BITSET_TEST(linkage->color32_qual_masks[1], i) ? " color32_centroid" : "",
851 BITSET_TEST(linkage->color32_qual_masks[2], i) ? " color32_sample" : "",
852 BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "",
853 BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "",
854 BITSET_TEST(linkage->interp_explicit32_mask, i) ? " interp_explicit32" : "",
855 BITSET_TEST(linkage->interp_explicit16_mask, i) ? " interp_explicit16" : "",
856 BITSET_TEST(linkage->interp_explicit_strict32_mask, i) ? " interp_explicit_strict32" : "",
857 BITSET_TEST(linkage->interp_explicit_strict16_mask, i) ? " interp_explicit_strict16" : "",
858 BITSET_TEST(linkage->per_primitive32_mask, i) ? " per_primitive32" : "",
859 BITSET_TEST(linkage->per_primitive16_mask, i) ? " per_primitive16" : "",
860 BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "",
861 BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "",
862 BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "",
863 !list_is_empty(&slot->producer.stores) ? " producer_stores" : "",
864 !list_is_empty(&slot->producer.loads) ? " producer_loads" : "",
865 !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : "");
866 }
867 }
868
869 static void
870 slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
871 unsigned i)
872 {
873 BITSET_CLEAR(linkage->output_equal_mask, i);
874 BITSET_CLEAR(linkage->convergent32_mask, i);
875 BITSET_CLEAR(linkage->convergent16_mask, i);
876 BITSET_CLEAR(linkage->interp_fp32_mask, i);
877 BITSET_CLEAR(linkage->interp_fp16_mask, i);
878 for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) {
879 BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i);
880 BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i);
881 }
882 BITSET_CLEAR(linkage->flat32_mask, i);
883 BITSET_CLEAR(linkage->flat16_mask, i);
884 BITSET_CLEAR(linkage->interp_explicit32_mask, i);
885 BITSET_CLEAR(linkage->interp_explicit16_mask, i);
886 BITSET_CLEAR(linkage->interp_explicit_strict32_mask, i);
887 BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i);
888 BITSET_CLEAR(linkage->per_primitive32_mask, i);
889 BITSET_CLEAR(linkage->per_primitive16_mask, i);
890 BITSET_CLEAR(linkage->tcs_cross_invoc32_mask, i);
891 BITSET_CLEAR(linkage->tcs_cross_invoc16_mask, i);
892 BITSET_CLEAR(linkage->no_varying32_mask, i);
893 BITSET_CLEAR(linkage->no_varying16_mask, i);
894 BITSET_CLEAR(linkage->color32_mask, i);
895 for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++)
896 BITSET_CLEAR(linkage->color32_qual_masks[b], i);
897 }
898
899 static void
900 clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i, bool uses_xfb)
901 {
902 slot_disable_optimizations_and_compaction(linkage, i);
903
904 if (uses_xfb)
905 return;
906
907 linkage->slot[i].num_slots = 0;
908
909 BITSET_CLEAR(linkage->indirect_mask, i);
910 BITSET_CLEAR(linkage->removable_mask, i);
911
912 /* Transform feedback stores can't be removed. */
913 assert(!BITSET_TEST(linkage->xfb32_only_mask, i));
914 assert(!BITSET_TEST(linkage->xfb16_only_mask, i));
915 }
916
917 static bool
918 has_xfb(nir_intrinsic_instr *intr)
919 {
920 /* This only checks whether the intrinsic is ABLE to have xfb info. */
921 if (!nir_intrinsic_has_io_xfb(intr))
922 return false;
923
924 unsigned comp = nir_intrinsic_component(intr);
925
926 if (comp >= 2)
927 return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0;
928 else
929 return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0;
930 }
931
932 static bool
933 is_interpolated_color(struct linkage_info *linkage, unsigned i)
934 {
935 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
936 return false;
937
938 /* BFCn stores are bunched in the COLn slots with COLn, so we should never
939 * get BFCn here.
940 */
941 assert(vec4_slot(i) != VARYING_SLOT_BFC0 &&
942 vec4_slot(i) != VARYING_SLOT_BFC1);
943
944 return vec4_slot(i) == VARYING_SLOT_COL0 ||
945 vec4_slot(i) == VARYING_SLOT_COL1;
946 }
947
948 static bool
949 is_interpolated_texcoord(struct linkage_info *linkage, unsigned i)
950 {
951 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
952 return false;
953
954 return vec4_slot(i) >= VARYING_SLOT_TEX0 &&
955 vec4_slot(i) <= VARYING_SLOT_TEX7;
956 }
957
958 static bool
959 color_uses_shade_model(struct linkage_info *linkage, unsigned i)
960 {
961 if (!is_interpolated_color(linkage, i))
962 return false;
963
964 list_for_each_entry(struct list_node, iter,
965 &linkage->slot[i].consumer.loads, head) {
966 assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input);
967
968 nir_intrinsic_instr *baryc =
969 nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
970 if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE)
971 return true;
972 }
973
974 return false;
975 }
976
977 static enum fs_vec4_type
978 get_interp_vec4_type(struct linkage_info *linkage, unsigned slot,
979 nir_intrinsic_instr *load)
980 {
981 assert(!linkage->has_flexible_interp);
982 assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
983
984 nir_intrinsic_instr *baryc =
985 nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
986 enum fs_vec4_type base;
987
988 if (color_uses_shade_model(linkage, slot))
989 base = FS_VEC4_TYPE_INTERP_COLOR_PIXEL;
990 else if (load->def.bit_size == 32)
991 base = FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL;
992 else if (load->def.bit_size == 16)
993 base = FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL;
994 else
995 unreachable("invalid load_interpolated_input type");
996
997 bool linear = nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NOPERSPECTIVE;
998
999 if (linear)
1000 base += 3;
1001
1002 switch (baryc->intrinsic) {
1003 case nir_intrinsic_load_barycentric_pixel:
1004 case nir_intrinsic_load_barycentric_at_offset:
1005 case nir_intrinsic_load_barycentric_at_sample:
1006 return base;
1007 case nir_intrinsic_load_barycentric_centroid:
1008 return base + 1;
1009 case nir_intrinsic_load_barycentric_sample:
1010 return base + 2;
1011 default:
1012 unreachable("unexpected barycentric intrinsic");
1013 }
1014 }
1015
1016 static bool
1017 preserve_infs_nans(nir_shader *nir, unsigned bit_size)
1018 {
1019 unsigned mode = nir->info.float_controls_execution_mode;
1020
1021 return nir_is_float_control_inf_preserve(mode, bit_size) ||
1022 nir_is_float_control_nan_preserve(mode, bit_size);
1023 }
1024
1025 static bool
1026 preserve_nans(nir_shader *nir, unsigned bit_size)
1027 {
1028 unsigned mode = nir->info.float_controls_execution_mode;
1029
1030 return nir_is_float_control_nan_preserve(mode, bit_size);
1031 }
1032
1033 static nir_def *
1034 build_convert_inf_to_nan(nir_builder *b, nir_def *x)
1035 {
1036 /* Do x*0 + x. The multiplication by 0 must not be optimized out ("exact"). */
1037 nir_def *fma = nir_ffma_imm1(b, x, 0, x);
1038 nir_instr_as_alu(fma->parent_instr)->exact = true;
1039 return fma;
1040 }
1041
1042 static bool
1043 is_sysval(nir_instr *instr, gl_system_value sysval)
1044 {
1045 if (instr->type == nir_instr_type_intrinsic) {
1046 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1047
1048 if (intr->intrinsic == nir_intrinsic_from_system_value(sysval))
1049 return true;
1050
1051 if (intr->intrinsic == nir_intrinsic_load_deref) {
1052 nir_deref_instr *deref =
1053 nir_instr_as_deref(intr->src[0].ssa->parent_instr);
1054
1055 return nir_deref_mode_is_one_of(deref, nir_var_system_value) &&
1056 nir_deref_instr_get_variable(deref)->data.location == sysval;
1057 }
1058 }
1059
1060 return false;
1061 }
1062
1063 /******************************************************************
1064 * GATHERING INPUTS & OUTPUTS
1065 ******************************************************************/
1066
1067 static bool
1068 is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
1069 nir_intrinsic_instr *intr)
1070 {
1071 return nir_slot_is_sysval_output(vec4_slot(slot),
1072 linkage->consumer_stage) &&
1073 !nir_intrinsic_io_semantics(intr).no_sysval_output;
1074 }
1075
1076 /**
1077 * This function acts like a filter. The pass won't touch varyings that
1078 * return false here, and the return value is saved in the linkage bitmasks,
1079 * so that all subpasses will *automatically* skip such varyings.
1080 */
1081 static bool
1082 can_remove_varying(struct linkage_info *linkage, gl_varying_slot location)
1083 {
1084 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1085 /* User-defined varyings and fog coordinates can always be removed. */
1086 if (location >= VARYING_SLOT_VAR0 ||
1087 location == VARYING_SLOT_FOGC)
1088 return true;
1089
1090 /* These can be removed as varyings, which means they will be demoted to
1091 * sysval-only outputs keeping their culling/rasterization functions
1092 * while not passing the values to FS. Drivers should handle
1093 * the "no_varying" semantic to benefit from this.
1094 *
1095 * Note: When removing unset LAYER and VIEWPORT FS inputs, they will
1096 * be replaced by 0 instead of undef.
1097 */
1098 if (location == VARYING_SLOT_CLIP_DIST0 ||
1099 location == VARYING_SLOT_CLIP_DIST1 ||
1100 location == VARYING_SLOT_CULL_DIST0 ||
1101 location == VARYING_SLOT_CULL_DIST1 ||
1102 location == VARYING_SLOT_LAYER ||
1103 location == VARYING_SLOT_VIEWPORT)
1104 return true;
1105
1106 /* COLn inputs can be removed only if both COLn and BFCn are not
1107 * written. Both COLn and BFCn outputs can be removed if COLn inputs
1108 * aren't read.
1109 *
1110 * TEXn inputs can never be removed in FS because of the coord replace
1111 * state, but TEXn outputs can be removed if they are not read by FS.
1112 */
1113 if (location == VARYING_SLOT_COL0 ||
1114 location == VARYING_SLOT_COL1 ||
1115 location == VARYING_SLOT_BFC0 ||
1116 location == VARYING_SLOT_BFC1 ||
1117 (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7))
1118 return true;
1119
1120 /* "GS -> FS" can remove the primitive ID if not written or not read. */
1121 if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1122 linkage->producer_stage == MESA_SHADER_MESH) &&
1123 location == VARYING_SLOT_PRIMITIVE_ID)
1124 return true;
1125
1126 /* No other varyings can be removed. */
1127 return false;
1128 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
1129 /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values
1130 * come from glPatchParameterfv.
1131 *
1132 * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which
1133 * means they will be demoted to sysval-only outputs, so that drivers
1134 * know that TES doesn't read them.
1135 */
1136 if (linkage->producer_stage == MESA_SHADER_VERTEX &&
1137 (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1138 location == VARYING_SLOT_TESS_LEVEL_OUTER))
1139 return false;
1140
1141 return true;
1142 }
1143
1144 /* All other varyings can be removed. */
1145 return true;
1146 }
1147
1148 struct opt_options {
1149 bool propagate_uniform_expr:1;
1150 bool deduplicate:1;
1151 bool inter_shader_code_motion:1;
1152 bool compact:1;
1153 bool disable_all:1;
1154 };
1155
1156 /**
1157 * Return which optimizations are allowed.
1158 */
1159 static struct opt_options
1160 can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location)
1161 {
1162 struct opt_options options_var = {
1163 .propagate_uniform_expr = true,
1164 .deduplicate = true,
1165 .inter_shader_code_motion = true,
1166 .compact = true,
1167 };
1168 struct opt_options options_color = {
1169 .propagate_uniform_expr = true, /* only constants in [0, 1] */
1170 .deduplicate = true,
1171 .compact = true,
1172 };
1173 struct opt_options options_tex = {
1174 .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */
1175 };
1176 struct opt_options options_sysval_output = {
1177 .propagate_uniform_expr = true,
1178 .deduplicate = true,
1179 };
1180 struct opt_options options_tess_levels = {
1181 .propagate_uniform_expr = true,
1182 .deduplicate = true,
1183 };
1184 struct opt_options options_disable_all = {
1185 .disable_all = true,
1186 };
1187
1188 assert(can_remove_varying(linkage, location));
1189
1190 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1191 /* xx -> FS */
1192 /* User-defined varyings and fog coordinates can always be optimized. */
1193 if (location >= VARYING_SLOT_VAR0 ||
1194 location == VARYING_SLOT_FOGC)
1195 return options_var;
1196
1197 /* The primitive ID can always be optimized in GS -> FS and MS -> FS. */
1198 if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1199 linkage->producer_stage == MESA_SHADER_MESH) &&
1200 location == VARYING_SLOT_PRIMITIVE_ID)
1201 return options_var;
1202
1203 /* Colors can only do constant propagation if COLn and BFCn store the
1204 * same constant and the constant is between 0 and 1 (because clamp
1205 * vertex color state is unknown). Uniform propagation isn't possible
1206 * because of the clamping.
1207 *
1208 * Color components can only be deduplicated and compacted among
1209 * themselves if they have the same interpolation qualifier, and can't
1210 * be mixed with other varyings.
1211 */
1212 if (location == VARYING_SLOT_COL0 ||
1213 location == VARYING_SLOT_COL1 ||
1214 location == VARYING_SLOT_BFC0 ||
1215 location == VARYING_SLOT_BFC1)
1216 return options_color;
1217
1218 /* TEXn.zw can only be constant-propagated if the value is (0, 1)
1219 * because it matches the coord replace values.
1220 */
1221 if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
1222 return options_tex;
1223
1224 /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate
1225 * uniform expressions and be compacted (moved to VARn while keeping
1226 * the sysval outputs where they are).
1227 */
1228 if (location == VARYING_SLOT_LAYER ||
1229 location == VARYING_SLOT_VIEWPORT ||
1230 location == VARYING_SLOT_CLIP_DIST0 ||
1231 location == VARYING_SLOT_CLIP_DIST1 ||
1232 location == VARYING_SLOT_CULL_DIST0 ||
1233 location == VARYING_SLOT_CULL_DIST1)
1234 return options_sysval_output;
1235
1236 /* Everything else can't be read by the consumer, such as POS, PSIZ,
1237 * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc.
1238 */
1239 return options_disable_all;
1240 }
1241
1242 if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) {
1243 /* TESS_LEVEL_* can only propagate uniform expressions.
1244 * Compaction is disabled because AMD doesn't want the varying to be
1245 * moved to PATCHn while keeping the sysval output where it is.
1246 */
1247 if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1248 location == VARYING_SLOT_TESS_LEVEL_OUTER)
1249 return options_tess_levels;
1250 }
1251
1252 /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES),
1253 * (TES, GS), and (VS, GS) can compact and optimize all varyings.
1254 */
1255 return options_var;
1256 }
1257
1258 static bool
1259 gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1260 {
1261 struct linkage_info *linkage = (struct linkage_info *)cb_data;
1262
1263 if (intr->intrinsic != nir_intrinsic_load_input &&
1264 intr->intrinsic != nir_intrinsic_load_per_vertex_input &&
1265 intr->intrinsic != nir_intrinsic_load_per_primitive_input &&
1266 intr->intrinsic != nir_intrinsic_load_interpolated_input &&
1267 intr->intrinsic != nir_intrinsic_load_input_vertex)
1268 return false;
1269
1270 /* nir_lower_io_to_scalar is required before this */
1271 assert(intr->def.num_components == 1);
1272 /* Non-zero constant offsets should have been folded by
1273 * nir_io_add_const_offset_to_base.
1274 */
1275 nir_src offset = *nir_get_io_offset_src(intr);
1276 assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1277
1278 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1279
1280 if (!can_remove_varying(linkage, sem.location))
1281 return false;
1282
1283 /* Insert the load into the list of loads for this scalar slot. */
1284 unsigned slot = intr_get_scalar_16bit_slot(intr);
1285 struct scalar_slot *in = &linkage->slot[slot];
1286 struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1287 sizeof(struct list_node));
1288 node->instr = intr;
1289 list_addtail(&node->head, &in->consumer.loads);
1290 in->num_slots = MAX2(in->num_slots, sem.num_slots);
1291
1292 BITSET_SET(linkage->removable_mask, slot);
1293
1294 enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
1295
1296    /* Determine the type of the input for compaction. Directly-indexed
1297     * inputs can be compacted into indirectly-indexed vec4 slots if those
1298     * have unused components, but only if they are of the same type.
1299     */
1300 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1301 switch (intr->intrinsic) {
1302 case nir_intrinsic_load_input:
1303 fs_vec4_type = FS_VEC4_TYPE_FLAT;
1304 break;
1305 case nir_intrinsic_load_per_primitive_input:
1306 fs_vec4_type = FS_VEC4_TYPE_PER_PRIMITIVE;
1307 break;
1308 case nir_intrinsic_load_input_vertex:
1309 if (sem.interp_explicit_strict)
1310 fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT;
1311 else
1312 fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT;
1313 break;
1314 case nir_intrinsic_load_interpolated_input:
1315 if (linkage->has_flexible_interp) {
1316 if (color_uses_shade_model(linkage, slot))
1317 fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR;
1318 else if (intr->def.bit_size == 32)
1319 fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32;
1320 else if (intr->def.bit_size == 16)
1321 fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16;
1322 else
1323 unreachable("invalid load_interpolated_input type");
1324 } else {
1325 fs_vec4_type = get_interp_vec4_type(linkage, slot, intr);
1326 }
1327 break;
1328 default:
1329 unreachable("unexpected input load intrinsic");
1330 }
1331
1332 linkage->fs_vec4_type[sem.location] = fs_vec4_type;
1333 }
1334
1335 /* Indirect indexing. */
1336 if (!nir_src_is_const(offset)) {
1337 /* Only the indirectly-indexed component is marked as indirect. */
1338 for (unsigned i = 0; i < sem.num_slots; i++)
1339 BITSET_SET(linkage->indirect_mask, slot + i * 8);
1340
1341 /* Set the same vec4 type as the first element in all slots. */
1342 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1343 for (unsigned i = 1; i < sem.num_slots; i++)
1344 linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1345 }
1346 return false;
1347 }
1348
1349 if (!can_optimize_varying(linkage, sem.location).compact)
1350 return false;
1351
1352 /* Record inputs that can be compacted. */
1353 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1354 unsigned i;
1355 assert(intr->def.bit_size == 32 || intr->def.bit_size == 16);
1356
1357 switch (fs_vec4_type) {
1358 case FS_VEC4_TYPE_FLAT:
1359 if (intr->def.bit_size == 32)
1360 BITSET_SET(linkage->flat32_mask, slot);
1361 else
1362 BITSET_SET(linkage->flat16_mask, slot);
1363 break;
1364 case FS_VEC4_TYPE_INTERP_EXPLICIT:
1365 if (intr->def.bit_size == 32)
1366 BITSET_SET(linkage->interp_explicit32_mask, slot);
1367 else
1368 BITSET_SET(linkage->interp_explicit16_mask, slot);
1369 break;
1370 case FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT:
1371 if (intr->def.bit_size == 32)
1372 BITSET_SET(linkage->interp_explicit_strict32_mask, slot);
1373 else
1374 BITSET_SET(linkage->interp_explicit_strict16_mask, slot);
1375 break;
1376 case FS_VEC4_TYPE_PER_PRIMITIVE:
1377 if (intr->def.bit_size == 32)
1378 BITSET_SET(linkage->per_primitive32_mask, slot);
1379 else
1380 BITSET_SET(linkage->per_primitive16_mask, slot);
1381 break;
1382
1383 case FS_VEC4_TYPE_INTERP_FP32:
1384 BITSET_SET(linkage->interp_fp32_mask, slot);
1385 break;
1386 case FS_VEC4_TYPE_INTERP_FP16:
1387 BITSET_SET(linkage->interp_fp16_mask, slot);
1388 break;
1389 case FS_VEC4_TYPE_INTERP_COLOR:
1390 BITSET_SET(linkage->color32_mask, slot);
1391 break;
1392
1393 case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL:
1394 case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID:
1395 case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE:
1396 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL:
1397 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID:
1398 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE:
1399 i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL;
1400 BITSET_SET(linkage->interp_fp32_qual_masks[i], slot);
1401 break;
1402
1403 case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL:
1404 case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID:
1405 case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE:
1406 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL:
1407 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID:
1408 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE:
1409 i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL;
1410 BITSET_SET(linkage->interp_fp16_qual_masks[i], slot);
1411 break;
1412
1413 case FS_VEC4_TYPE_INTERP_COLOR_PIXEL:
1414 case FS_VEC4_TYPE_INTERP_COLOR_CENTROID:
1415 case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE:
1416 i = fs_vec4_type - FS_VEC4_TYPE_INTERP_COLOR_PIXEL;
1417 BITSET_SET(linkage->color32_qual_masks[i], slot);
1418 break;
1419
1420 case FS_VEC4_TYPE_NONE:
1421 unreachable("unexpected fs_vec4_type");
1422 }
1423
1424 if (!linkage->has_flexible_interp &&
1425 intr->intrinsic == nir_intrinsic_load_interpolated_input) {
1426          /* interpolateAtCentroid can occur simultaneously with any other
1427           * qualifier. If a slot has the centroid flag together with any
1428           * other qualifier, unflag centroid. Even though we then track such
1429           * inputs under the other qualifier, the load_barycentric_centroid
1430           * intrinsic must be preserved by all optimizations. The only case
1431           * when it's not preserved is when the input is convergent, in
1432           * which case all qualifiers behave the same and we
1433           * opportunistically change it during compaction.
1434           */
1435 if (color_uses_shade_model(linkage, slot)) {
1436 if (BITSET_TEST(linkage->color32_qual_masks[COLOR_CENTROID], slot) &&
1437 (BITSET_TEST(linkage->color32_qual_masks[COLOR_PIXEL], slot) ||
1438 BITSET_TEST(linkage->color32_qual_masks[COLOR_SAMPLE], slot)))
1439 BITSET_CLEAR(linkage->color32_qual_masks[COLOR_CENTROID], slot);
1440 } else {
1441 INTERP_QUAL_BITSET *bitsets =
1442 intr->def.bit_size == 32 ? &linkage->interp_fp32_qual_masks :
1443 &linkage->interp_fp16_qual_masks;
1444
1445 if (BITSET_TEST((*bitsets)[PERSP_CENTROID], slot) &&
1446 (BITSET_TEST((*bitsets)[PERSP_PIXEL], slot) ||
1447 BITSET_TEST((*bitsets)[PERSP_SAMPLE], slot)))
1448 BITSET_CLEAR((*bitsets)[PERSP_CENTROID], slot);
1449
1450 if (BITSET_TEST((*bitsets)[LINEAR_CENTROID], slot) &&
1451 (BITSET_TEST((*bitsets)[LINEAR_PIXEL], slot) ||
1452 BITSET_TEST((*bitsets)[LINEAR_SAMPLE], slot)))
1453 BITSET_CLEAR((*bitsets)[LINEAR_CENTROID], slot);
1454 }
1455 }
1456 } else {
1457 if (intr->def.bit_size == 32)
1458 BITSET_SET(linkage->flat32_mask, slot);
1459 else if (intr->def.bit_size == 16)
1460 BITSET_SET(linkage->flat16_mask, slot);
1461 else
1462 unreachable("invalid load_input type");
1463
1464 if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL &&
1465 intr->intrinsic == nir_intrinsic_load_per_vertex_input) {
1466 nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intr);
1467 nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
1468
1469 if (!is_sysval(vertex_index_instr, SYSTEM_VALUE_INVOCATION_ID)) {
1470 if (intr->def.bit_size == 32)
1471 BITSET_SET(linkage->tcs_cross_invoc32_mask, slot);
1472 else if (intr->def.bit_size == 16)
1473 BITSET_SET(linkage->tcs_cross_invoc16_mask, slot);
1474 else
1475 unreachable("invalid load_input type");
1476 }
1477 }
1478 }
1479 return false;
1480 }
1481
1482 static bool
1483 gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1484 {
1485 struct linkage_info *linkage = (struct linkage_info *)cb_data;
1486
1487 if (intr->intrinsic != nir_intrinsic_store_output &&
1488 intr->intrinsic != nir_intrinsic_load_output &&
1489 intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
1490 intr->intrinsic != nir_intrinsic_store_per_view_output &&
1491 intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
1492 intr->intrinsic != nir_intrinsic_load_per_vertex_output &&
1493 intr->intrinsic != nir_intrinsic_load_per_view_output &&
1494 intr->intrinsic != nir_intrinsic_load_per_primitive_output)
1495 return false;
1496
1497 bool is_store =
1498 intr->intrinsic == nir_intrinsic_store_output ||
1499 intr->intrinsic == nir_intrinsic_store_per_vertex_output ||
1500 intr->intrinsic == nir_intrinsic_store_per_view_output ||
1501 intr->intrinsic == nir_intrinsic_store_per_primitive_output;
1502
1503 if (is_store) {
1504 /* nir_lower_io_to_scalar is required before this */
1505 assert(intr->src[0].ssa->num_components == 1);
1506          /* nir_opt_undef is required before this. */
1507 assert(intr->src[0].ssa->parent_instr->type !=
1508 nir_instr_type_undef);
1509 } else {
1510 /* nir_lower_io_to_scalar is required before this */
1511 assert(intr->def.num_components == 1);
1512          /* Output loads are only allowed in TCS. */
1513 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1514 }
1515
1516 /* Non-zero constant offsets should have been folded by
1517 * nir_io_add_const_offset_to_base.
1518 */
1519 nir_src offset = *nir_get_io_offset_src(intr);
1520 assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1521
1522 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1523
1524 if (!can_remove_varying(linkage, sem.location))
1525 return false;
1526
1527 /* For "xx -> FS", treat BFCn stores as COLn to make dead varying
1528 * elimination do the right thing automatically. The rules are:
1529 * - COLn inputs can be removed only if both COLn and BFCn are not
1530 * written.
1531 * - Both COLn and BFCn outputs can be removed if COLn inputs
1532 * aren't read.
1533 */
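   /* Example (sketch): if the VS writes both gl_FrontColor (COL0) and
    * gl_BackColor (BFC0) but the FS never reads gl_Color, folding BFC0 into
    * COL0 lets the regular dead-varying logic drop both stores at once;
    * if the FS does read COL0, both stores are kept because either one may
    * be selected depending on the primitive's facing.
    */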
1534 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1535 if (sem.location == VARYING_SLOT_BFC0)
1536 sem.location = VARYING_SLOT_COL0;
1537 else if (sem.location == VARYING_SLOT_BFC1)
1538 sem.location = VARYING_SLOT_COL1;
1539 }
1540
1541 /* Insert the instruction into the list of stores or loads for this
1542 * scalar slot.
1543 */
1544 unsigned slot =
1545 get_scalar_16bit_slot(sem, nir_intrinsic_component(intr));
1546
1547 struct scalar_slot *out = &linkage->slot[slot];
1548 struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1549 sizeof(struct list_node));
1550 node->instr = intr;
1551 out->num_slots = MAX2(out->num_slots, sem.num_slots);
1552
1553 if (is_store) {
1554 list_addtail(&node->head, &out->producer.stores);
1555
1556 if (has_xfb(intr)) {
1557 BITSET_SET(linkage->xfb_mask, slot);
1558
1559 if (sem.no_varying &&
1560 !is_active_sysval_output(linkage, slot, intr)) {
1561 if (intr->src[0].ssa->bit_size == 32)
1562 BITSET_SET(linkage->xfb32_only_mask, slot);
1563 else if (intr->src[0].ssa->bit_size == 16)
1564 BITSET_SET(linkage->xfb16_only_mask, slot);
1565 else
1566                   unreachable("invalid store_output type");
1567 }
1568 }
1569 } else {
1570 list_addtail(&node->head, &out->producer.loads);
1571 }
1572
1573 BITSET_SET(linkage->removable_mask, slot);
1574
1575 /* Indirect indexing. */
1576 if (!nir_src_is_const(offset)) {
1577 /* Only the indirectly-indexed component is marked as indirect. */
1578 for (unsigned i = 0; i < sem.num_slots; i++)
1579 BITSET_SET(linkage->indirect_mask, slot + i * 8);
1580
1581 /* Set the same vec4 type as the first element in all slots. */
1582 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1583 enum fs_vec4_type fs_vec4_type =
1584 linkage->fs_vec4_type[sem.location];
1585
1586 for (unsigned i = 1; i < sem.num_slots; i++)
1587 linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1588 }
1589 return false;
1590 }
1591
1592 if (can_optimize_varying(linkage, sem.location).disable_all)
1593 return false;
1594
1595 if (is_store) {
1596 nir_def *value = intr->src[0].ssa;
1597
1598 const bool constant = value->parent_instr->type == nir_instr_type_load_const;
1599
1600 /* If the store instruction is executed in a divergent block, the value
1601 * that's stored in the output becomes divergent.
1602 *
1603 * Mesh shaders get special treatment because we can't follow their topology,
1604 * so we only propagate constants.
1605 * TODO: revisit this when workgroup divergence analysis is merged.
1606 */
1607 const bool divergent = (!constant && linkage->producer_stage == MESA_SHADER_MESH) ||
1608 intr->instr.block->divergent ||
1609 nir_src_is_divergent(&intr->src[0]);
1610
1611 if (!out->producer.value) {
1612 /* This is the first store to this output. */
1613 BITSET_SET(linkage->output_equal_mask, slot);
1614 out->producer.value = value->parent_instr;
1615
1616 /* Set whether the value is convergent. Such varyings can be
1617 * promoted to flat regardless of their original interpolation
1618 * mode.
1619 */
1620 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) {
1621 if (value->bit_size == 32)
1622 BITSET_SET(linkage->convergent32_mask, slot);
1623 else if (value->bit_size == 16)
1624 BITSET_SET(linkage->convergent16_mask, slot);
1625 else
1626 unreachable("invalid store_output type");
1627 }
1628 } else {
1629 /* There are multiple stores to the same output. If they store
1630 * different values, clear the mask.
1631 */
1632 if (out->producer.value != value->parent_instr)
1633 BITSET_CLEAR(linkage->output_equal_mask, slot);
1634
1635 /* Update divergence information. */
1636 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) {
1637 if (value->bit_size == 32)
1638 BITSET_CLEAR(linkage->convergent32_mask, slot);
1639 else if (value->bit_size == 16)
1640 BITSET_CLEAR(linkage->convergent16_mask, slot);
1641 else
1642 unreachable("invalid store_output type");
1643 }
1644 }
1645 } else {
1646 /* Only TCS output loads can get here.
1647 *
1648 * We need to record output loads as flat32 or flat16, otherwise
1649 * compaction will think that the slot is free and will put some
1650 * other output in its place.
1651 */
1652 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1653
1654 if (!can_optimize_varying(linkage, sem.location).compact)
1655 return false;
1656
1657 if (intr->def.bit_size == 32)
1658 BITSET_SET(linkage->flat32_mask, slot);
1659 else if (intr->def.bit_size == 16)
1660 BITSET_SET(linkage->flat16_mask, slot);
1661 else
1662          unreachable("invalid load_output type");
1663 }
1664 return false;
1665 }
1666
1667 /******************************************************************
1668 * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL)
1669 ******************************************************************/
1670
1671 static void
1672 tidy_up_indirect_varyings(struct linkage_info *linkage)
1673 {
1674 unsigned i;
1675
1676    /* Indirectly-indexed slots can have direct access too and thus set
1677     * various bitmasks, so clear those bitmasks to make sure such slots
1678     * are neither optimized nor compacted.
1679 */
1680 BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1681 slot_disable_optimizations_and_compaction(linkage, i);
1682 }
1683
1684 /* If some slots have both direct and indirect accesses, move instructions
1685 * of such slots to the slot representing the first array element, so that
1686 * we can remove all loads/stores of dead indirectly-indexed varyings
1687 * by only looking at the first element.
1688 */
1689 BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1690 struct scalar_slot *first = &linkage->slot[i];
1691
1692 /* Skip if this is not the first array element. The first element
1693 * always sets num_slots to at least 2.
1694 */
1695 if (first->num_slots <= 1)
1696 continue;
1697
1698 /* Move instructions from other elements of the indirectly-accessed
1699 * array to the first element (by merging the linked lists).
1700 */
1701 for (unsigned elem = 1; elem < first->num_slots; elem++) {
1702 /* The component slots are at 16-bit granularity, so we need to
1703 * increment by 8 to get the same component in the next vec4 slot.
1704 */
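         /* For illustration: a vec4 slot spans 8 scalar 16-bit slots
          * (4 components x 2 halves), i.e. a component lives roughly at
          * vec4_slot * 8 + component * 2 (+1 for the high half), so adding
          * 8 selects the same component in the next array element.
          */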
1705 struct scalar_slot *other = &linkage->slot[i + elem * 8];
1706
1707 list_splicetail(&other->producer.stores, &first->producer.stores);
1708 list_splicetail(&other->producer.loads, &first->producer.loads);
1709 list_splicetail(&other->consumer.loads, &first->consumer.loads);
1710 list_inithead(&other->producer.stores);
1711 list_inithead(&other->producer.loads);
1712 list_inithead(&other->consumer.loads);
1713 }
1714 }
1715 }
1716
1717 /******************************************************************
1718 * TIDYING UP CONVERGENT VARYINGS
1719 ******************************************************************/
1720
1721 /**
1722  * Reorganize the FS bitmasks: they are initialized such that they can
1723  * intersect with the convergent bitmasks, but we want the interpolated,
1724  * flat, and convergent masks to be mutually disjoint.
1725 */
1726 static void
1727 tidy_up_convergent_varyings(struct linkage_info *linkage)
1728 {
1729 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
1730 return;
1731
1732 unsigned i;
1733 /* Whether to promote convergent interpolated slots to flat if it
1734 * doesn't lead to worse compaction.
1735 */
1736 bool optimize_convergent_slots = true; /* only turn off for debugging */
1737
1738 if (optimize_convergent_slots) {
1739 /* If a slot is flat and convergent and the driver can't load as flat
1740 * from interpolated vec4 slots, keep the flat bit and remove
1741 * the convergent bit. If the driver can load as flat from interpolated
1742 * vec4 slots, keep the convergent bit.
1743 *
1744 * If a slot is interpolated and convergent, remove the interpolated
1745 * bit and keep the convergent bit, which means that it's interpolated,
1746 * but can be promoted to flat.
1747 *
1748 * Since the geometry shader is the only shader that can store values
1749 * in multiple vertices before FS, it's required that all stores are
1750 * equal to be considered convergent (output_equal_mask), otherwise
1751 * the promotion to flat would be incorrect.
1752 */
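      /* For example: a slot that is both interp_fp32 and convergent keeps
       * only the convergent bit (it may still be promoted to flat later);
       * a slot that is both flat and convergent keeps only the flat bit
       * when flat can't be mixed with interpolated vec4 slots; a convergent
       * bit with no FS bit set at all is simply dropped.
       */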
1753 BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) {
1754 if (!BITSET_TEST(linkage->interp_fp32_mask, i) &&
1755 !BITSET_TEST(linkage->color32_mask, i) &&
1756 !BITSET_TEST(linkage->flat32_mask, i) &&
1757 !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) &&
1758 !BITSET3_TEST_ANY(linkage->color32_qual_masks, i)) {
1759 /* Clear the flag - not used by FS. */
1760 BITSET_CLEAR(linkage->convergent32_mask, i);
1761 } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1762 BITSET_TEST(linkage->flat32_mask, i)) ||
1763 (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1764 !BITSET_TEST(linkage->output_equal_mask, i))) {
1765 /* Keep the original qualifier. */
1766 BITSET_CLEAR(linkage->convergent32_mask, i);
1767 } else {
1768 /* Keep it convergent. */
1769 BITSET_CLEAR(linkage->interp_fp32_mask, i);
1770 for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++)
1771 BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i);
1772 BITSET_CLEAR(linkage->color32_mask, i);
1773 for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++)
1774 BITSET_CLEAR(linkage->color32_qual_masks[b], i);
1775 BITSET_CLEAR(linkage->flat32_mask, i);
1776 }
1777 }
1778
1779 BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) {
1780 if (!BITSET_TEST(linkage->interp_fp16_mask, i) &&
1781 !BITSET_TEST(linkage->flat16_mask, i) &&
1782 !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i)) {
1783 /* Clear the flag - not used by FS. */
1784 BITSET_CLEAR(linkage->convergent16_mask, i);
1785 } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1786 BITSET_TEST(linkage->flat16_mask, i)) ||
1787 (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1788 !BITSET_TEST(linkage->output_equal_mask, i))) {
1789 /* Keep the original qualifier. */
1790 BITSET_CLEAR(linkage->convergent16_mask, i);
1791 } else {
1792 /* Keep it convergent. */
1793 BITSET_CLEAR(linkage->interp_fp16_mask, i);
1794 for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++)
1795 BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i);
1796 BITSET_CLEAR(linkage->flat16_mask, i);
1797 }
1798 }
1799 } else {
1800 /* Don't do anything with convergent slots. */
1801 BITSET_ZERO(linkage->convergent32_mask);
1802 BITSET_ZERO(linkage->convergent16_mask);
1803 }
1804 }
1805
1806 /******************************************************************
1807 * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS
1808 ******************************************************************/
1809
1810 static bool
1811 is_variable_present(nir_shader *nir, nir_variable *var,
1812 nir_variable_mode mode, bool spirv)
1813 {
1814 nir_foreach_variable_with_modes(it, nir, mode) {
1815 if ((spirv && it->data.binding == var->data.binding) ||
1816 (!spirv && !strcmp(it->name, var->name)))
1817 return true;
1818 }
1819 return false;
1820 }
1821
1822 /* TODO: this should be a helper in common code */
1823 static unsigned
1824 get_uniform_components(const struct glsl_type *type)
1825 {
1826 unsigned size = glsl_get_aoa_size(type);
1827 size = MAX2(size, 1);
1828 size *= glsl_get_matrix_columns(glsl_without_array(type));
1829
1830 if (glsl_type_is_dual_slot(glsl_without_array(type)))
1831 size *= 2;
1832
1833 /* Convert from vec4 to scalar. */
1834 return size * 4;
1835 }
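/* Illustrative example: for "uniform mat3 m[2]", the AoA size is 2 and the
 * matrix has 3 columns, so this counts 2 * 3 = 6 vec4 slots, i.e. 24 scalar
 * components, even though a mat3 column only occupies 3 of them. The count
 * is a coarse estimate that is only compared against the driver limit.
 */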
1836
1837 static unsigned
1838 get_ubo_slots(const nir_variable *var)
1839 {
1840 if (glsl_type_is_interface(glsl_without_array(var->type))) {
1841 unsigned slots = glsl_get_aoa_size(var->type);
1842 return MAX2(slots, 1);
1843 }
1844
1845 return 1;
1846 }
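/* E.g., an interface block array "uniform Foo { ... } foo[4];" counts as 4
 * UBO binding points, while a non-array block or any other variable counts
 * as 1.
 */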
1847
1848 /**
1849  * Count uniforms and check whether the combined uniform component count
1850  * exceeds the limit. If it does, don't move any uniforms; drivers can
1851  * simply declare a very high limit.
1852 */
1853 static void
1854 determine_uniform_movability(struct linkage_info *linkage,
1855 unsigned max_uniform_components)
1856 {
1857 nir_shader *producer = linkage->producer_builder.shader;
1858 nir_shader *consumer = linkage->consumer_builder.shader;
1859 unsigned num_producer_uniforms = 0;
1860 unsigned num_consumer_uniforms = 0;
1861 unsigned num_shared_uniforms = 0;
1862
1863 nir_foreach_variable_with_modes(var, producer, nir_var_uniform) {
1864 if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv))
1865 num_shared_uniforms += get_uniform_components(var->type);
1866 else
1867 num_producer_uniforms += get_uniform_components(var->type);
1868 }
1869
1870 nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) {
1871 if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv))
1872 num_consumer_uniforms += get_uniform_components(var->type);
1873 }
1874
1875 linkage->can_move_uniforms =
1876 num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <=
1877 max_uniform_components;
1878 }
1879
1880 /**
1881  * Count UBOs and check whether the combined UBO count exceeds the limit. If
1882  * it does, don't move any UBOs; drivers can simply declare a very high limit.
1883 */
1884 static void
1885 determine_ubo_movability(struct linkage_info *linkage,
1886 unsigned max_ubos_per_stage)
1887 {
1888 nir_shader *producer = linkage->producer_builder.shader;
1889 nir_shader *consumer = linkage->consumer_builder.shader;
1890 unsigned num_producer_ubos = 0;
1891 unsigned num_consumer_ubos = 0;
1892 unsigned num_shared_ubos = 0;
1893
1894 nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) {
1895 if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv))
1896 num_shared_ubos += get_ubo_slots(var);
1897 else
1898 num_producer_ubos += get_ubo_slots(var);
1899 }
1900
1901 nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) {
1902 if (!is_variable_present(producer, var, nir_var_mem_ubo,
1903 linkage->spirv))
1904 num_consumer_ubos += get_ubo_slots(var);
1905 }
1906
1907 linkage->can_move_ubos =
1908 num_producer_ubos + num_consumer_ubos + num_shared_ubos <=
1909 max_ubos_per_stage;
1910 }
1911
1912 /******************************************************************
1913 * DEAD VARYINGS REMOVAL
1914 ******************************************************************/
1915
1916 static void
1917 remove_all_stores(struct linkage_info *linkage, unsigned i,
1918 bool *uses_xfb, nir_opt_varyings_progress *progress)
1919 {
1920 struct scalar_slot *slot = &linkage->slot[i];
1921
1922 assert(!list_is_empty(&slot->producer.stores) &&
1923 list_is_empty(&slot->producer.loads) &&
1924 list_is_empty(&slot->consumer.loads));
1925
1926 /* Remove all stores. */
1927 list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) {
1928 if (nir_remove_varying(iter->instr, linkage->consumer_stage)) {
1929 list_del(&iter->head);
1930 *progress |= nir_progress_producer;
1931 } else {
1932 if (has_xfb(iter->instr)) {
1933 *uses_xfb = true;
1934
1935 if (!is_active_sysval_output(linkage, i, iter->instr)) {
1936 if (iter->instr->src[0].ssa->bit_size == 32)
1937 BITSET_SET(linkage->xfb32_only_mask, i);
1938 else if (iter->instr->src[0].ssa->bit_size == 16)
1939 BITSET_SET(linkage->xfb16_only_mask, i);
1940 else
1941                   unreachable("invalid store_output type");
1942 }
1943 }
1944 }
1945 }
1946 }
1947
1948 static void
1949 remove_dead_varyings(struct linkage_info *linkage,
1950 nir_opt_varyings_progress *progress)
1951 {
1952 unsigned i;
1953
1954 /* Remove dead inputs and outputs. */
1955 BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
1956 struct scalar_slot *slot = &linkage->slot[i];
1957
1958       /* Only indirectly-accessed slots can have no loads and stores because
1959        * tidy_up_indirect_varyings() moved them to the first array element.
1960 */
1961 assert(!list_is_empty(&slot->producer.stores) ||
1962 !list_is_empty(&slot->producer.loads) ||
1963 !list_is_empty(&slot->consumer.loads) ||
1964 BITSET_TEST(linkage->indirect_mask, i));
1965
1966 /* Nothing to do if there are no loads and stores. */
1967 if (list_is_empty(&slot->producer.stores) &&
1968 list_is_empty(&slot->producer.loads) &&
1969 list_is_empty(&slot->consumer.loads))
1970 continue;
1971
1972 /* If there are producer loads (e.g. TCS) but no consumer loads
1973 * (e.g. TES), set the "no_varying" flag to indicate that the outputs
1974 * are not consumed by the next shader stage (e.g. TES).
1975 */
1976 if (!list_is_empty(&slot->producer.stores) &&
1977 !list_is_empty(&slot->producer.loads) &&
1978 list_is_empty(&slot->consumer.loads)) {
1979 for (unsigned list_index = 0; list_index < 2; list_index++) {
1980 struct list_head *list = list_index ? &slot->producer.stores :
1981 &slot->producer.loads;
1982
1983 list_for_each_entry(struct list_node, iter, list, head) {
1984 nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr);
1985 sem.no_varying = 1;
1986 nir_intrinsic_set_io_semantics(iter->instr, sem);
1987 }
1988 }
1989
1990 /* This tells the compaction to move these varyings to the end. */
1991 if (BITSET_TEST(linkage->flat32_mask, i)) {
1992 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1993 BITSET_CLEAR(linkage->flat32_mask, i);
1994 BITSET_SET(linkage->no_varying32_mask, i);
1995 }
1996 if (BITSET_TEST(linkage->flat16_mask, i)) {
1997 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1998 BITSET_CLEAR(linkage->flat16_mask, i);
1999 BITSET_SET(linkage->no_varying16_mask, i);
2000 }
2001 continue;
2002 }
2003
2004 /* The varyings aren't dead if both loads and stores are present. */
2005 if (!list_is_empty(&slot->producer.stores) &&
2006 (!list_is_empty(&slot->producer.loads) ||
2007 !list_is_empty(&slot->consumer.loads)))
2008 continue;
2009
2010 bool uses_xfb = false;
2011
2012 if (list_is_empty(&slot->producer.stores)) {
2013 /* There are no stores. */
2014 assert(!list_is_empty(&slot->producer.loads) ||
2015 !list_is_empty(&slot->consumer.loads));
2016
2017 /* TEXn.xy loads can't be removed in FS because of the coord
2018 * replace state, but TEXn outputs can be removed if they are
2019 * not read by FS.
2020 *
2021 * TEXn.zw loads can be eliminated and replaced by (0, 1), which
2022 * is equal to the coord replace value.
2023 */
2024 if (is_interpolated_texcoord(linkage, i)) {
2025 assert(i % 2 == 0); /* high 16-bit slots disallowed */
2026 /* Keep TEXn.xy. */
2027 if (i % 8 < 4)
2028 continue;
2029 }
2030
2031 /* Replace all loads with undef. Do that for both input loads
2032 * in the consumer stage and output loads in the producer stage
2033 * because we also want to eliminate TCS loads that have no
2034 * corresponding TCS stores.
2035 */
2036 for (unsigned list_index = 0; list_index < 2; list_index++) {
2037 struct list_head *list = list_index ? &slot->producer.loads :
2038 &slot->consumer.loads;
2039 nir_builder *b = list_index ? &linkage->producer_builder :
2040 &linkage->consumer_builder;
2041
2042 list_for_each_entry(struct list_node, iter, list, head) {
2043 nir_intrinsic_instr *loadi = iter->instr;
2044 nir_def *replacement = NULL;
2045
2046 b->cursor = nir_before_instr(&loadi->instr);
2047
2048 /* LAYER and VIEWPORT FS inputs should be replaced by 0
2049 * instead of undef.
2050 */
2051 gl_varying_slot location = (gl_varying_slot)(vec4_slot(i));
2052
2053 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
2054 (location == VARYING_SLOT_LAYER ||
2055 location == VARYING_SLOT_VIEWPORT ||
2056 /* TEXn.z is replaced by 0 (matching coord replace) */
2057 (is_interpolated_texcoord(linkage, i) && i % 8 == 4)))
2058 replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size);
2059 else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
2060 /* TEXn.w is replaced by 1 (matching coord replace) */
2061 is_interpolated_texcoord(linkage, i) && i % 8 == 6)
2062 replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size);
2063 else
2064 replacement = nir_undef(b, 1, loadi->def.bit_size);
2065
2066 nir_def_replace(&loadi->def, replacement);
2067
2068 *progress |= list_index ? nir_progress_producer :
2069 nir_progress_consumer;
2070 }
2071 }
2072
2073 /* Clear the lists. */
2074 list_inithead(&slot->producer.loads);
2075 list_inithead(&slot->consumer.loads);
2076 } else {
2077 /* There are no loads. */
2078 remove_all_stores(linkage, i, &uses_xfb, progress);
2079 }
2080
2081 /* Clear bitmasks associated with this varying slot or array. */
2082 for (unsigned elem = 0; elem < slot->num_slots; elem++)
2083 clear_slot_info_after_removal(linkage, i + elem, uses_xfb);
2084 }
2085 }
2086
2087 /******************************************************************
2088 * SSA CLONING HELPERS
2089 ******************************************************************/
2090
2091 /* Pass flags for inter-shader code motion. Also used by helpers. */
2092 #define FLAG_ALU_IS_TES_INTERP_LOAD BITFIELD_BIT(0)
2093 #define FLAG_MOVABLE BITFIELD_BIT(1)
2094 #define FLAG_UNMOVABLE BITFIELD_BIT(2)
2095 #define FLAG_POST_DOMINATOR_PROCESSED BITFIELD_BIT(3)
2096 #define FLAG_GATHER_LOADS_VISITED BITFIELD_BIT(4)
2097
2098 #define FLAG_INTERP_MASK BITFIELD_RANGE(5, 3)
2099 #define FLAG_INTERP_CONVERGENT (0 << 5)
2100 #define FLAG_INTERP_FLAT (1 << 5)
2101 /* FS-only interpolation modes. */
2102 #define FLAG_INTERP_PERSP_PIXEL (2 << 5)
2103 #define FLAG_INTERP_PERSP_CENTROID (3 << 5)
2104 #define FLAG_INTERP_PERSP_SAMPLE (4 << 5)
2105 #define FLAG_INTERP_LINEAR_PIXEL (5 << 5)
2106 #define FLAG_INTERP_LINEAR_CENTROID (6 << 5)
2107 #define FLAG_INTERP_LINEAR_SAMPLE (7 << 5)
2108 /* TES-only interpolation modes. (these were found in shaders) */
2109 #define FLAG_INTERP_TES_TRIANGLE_UVW (2 << 5) /* v0*u + v1*v + v2*w */
2110 #define FLAG_INTERP_TES_TRIANGLE_WUV (3 << 5) /* v0*w + v1*u + v2*v */
2111 /* TODO: Feel free to insert more TES interpolation equations here. */
2112
2113 static bool
2114 can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr)
2115 {
2116 nir_deref_instr *deref = nir_instr_as_deref(instr);
2117 unsigned allowed_modes =
2118 (linkage->can_move_uniforms ? nir_var_uniform : 0) |
2119 (linkage->can_move_ubos ? nir_var_mem_ubo : 0);
2120
2121 if (!nir_deref_mode_is_one_of(deref, allowed_modes))
2122 return false;
2123
2124 switch (deref->deref_type) {
2125 case nir_deref_type_var:
2126 case nir_deref_type_struct:
2127 case nir_deref_type_array:
2128 break;
2129 default:
2130 return false;
2131 }
2132
2133 nir_variable *var = nir_deref_instr_get_variable(deref);
2134
2135    /* Subroutine uniforms are not moved. Moving them would work (subroutines
2136     * have been inlined at this point and the uniforms themselves are cloned
2137     * correctly), but subroutine functions aren't moved along with them, and
2138     * the linker doesn't like a shader that contains a subroutine uniform
2139     * but no subroutine functions. This could be fixed in the linker, but
2140     * for now, don't move subroutine uniforms.
2141 */
2142 if (var->name && strstr(var->name, "__subu_") == var->name)
2143 return false;
2144
2145 return true;
2146 }
2147
2148 static nir_intrinsic_instr *
2149 find_per_vertex_load_for_tes_interp(nir_instr *instr)
2150 {
2151 switch (instr->type) {
2152 case nir_instr_type_alu: {
2153 nir_alu_instr *alu = nir_instr_as_alu(instr);
2154 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2155
2156 for (unsigned i = 0; i < num_srcs; i++) {
2157 nir_instr *src = alu->src[i].src.ssa->parent_instr;
2158 nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src);
2159
2160 if (intr)
2161 return intr;
2162 }
2163 return NULL;
2164 }
2165
2166 case nir_instr_type_intrinsic: {
2167 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2168
2169 return intr->intrinsic == nir_intrinsic_load_per_vertex_input ?
2170 intr : NULL;
2171 }
2172
2173 default:
2174 unreachable("unexpected instruction type");
2175 }
2176 }
2177
2178 static nir_def *
2179 get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr)
2180 {
2181 nir_intrinsic_instr *intr;
2182
2183 if (instr->type == nir_instr_type_intrinsic) {
2184 intr = nir_instr_as_intrinsic(instr);
2185 } else {
2186 assert(instr->type == nir_instr_type_alu &&
2187 instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD);
2188 intr = find_per_vertex_load_for_tes_interp(instr);
2189 }
2190
2191 unsigned slot_index = intr_get_scalar_16bit_slot(intr);
2192 assert(list_is_singular(&linkage->slot[slot_index].producer.stores));
2193
2194 nir_def *stored_value =
2195 list_first_entry(&linkage->slot[slot_index].producer.stores,
2196 struct list_node, head)->instr->src[0].ssa;
2197 assert(stored_value->num_components == 1);
2198 return stored_value;
2199 }
2200
2201 /* Clone the SSA, which can be in a different shader. */
2202 static nir_def *
2203 clone_ssa_impl(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
2204 {
2205 struct hash_entry *entry = _mesa_hash_table_search(linkage->clones_ht,
2206 ssa->parent_instr);
2207 if (entry)
2208 return entry->data;
2209
2210 nir_def *clone = NULL;
2211
2212 switch (ssa->parent_instr->type) {
2213 case nir_instr_type_load_const:
2214 clone = nir_build_imm(b, ssa->num_components, ssa->bit_size,
2215 nir_instr_as_load_const(ssa->parent_instr)->value);
2216 break;
2217
2218 case nir_instr_type_undef:
2219 clone = nir_undef(b, ssa->num_components, ssa->bit_size);
2220 break;
2221
2222 case nir_instr_type_alu: {
2223 nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr);
2224
2225 if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) {
2226 /* We are cloning an interpolated TES load in the producer for
2227 * backward inter-shader code motion.
2228 */
2229 assert(&linkage->producer_builder == b);
2230 return get_stored_value_for_load(linkage, &alu->instr);
2231 }
2232
2233 nir_def *src[4] = {0};
2234 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2235 assert(num_srcs <= ARRAY_SIZE(src));
2236
2237 for (unsigned i = 0; i < num_srcs; i++)
2238 src[i] = clone_ssa_impl(linkage, b, alu->src[i].src.ssa);
2239
2240 clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]);
2241 nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr);
2242
2243 alu_clone->exact = alu->exact;
2244 alu_clone->no_signed_wrap = alu->no_signed_wrap;
2245 alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap;
2246 alu_clone->def.num_components = alu->def.num_components;
2247 alu_clone->def.bit_size = alu->def.bit_size;
2248
2249 for (unsigned i = 0; i < num_srcs; i++) {
2250 memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle,
2251 NIR_MAX_VEC_COMPONENTS);
2252 }
2253 break;
2254 }
2255
2256 case nir_instr_type_intrinsic: {
2257 /* Clone load_deref of uniform or ubo. It's the only thing that can
2258 * occur here.
2259 */
2260 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr);
2261
2262 switch (intr->intrinsic) {
2263 case nir_intrinsic_load_deref: {
2264 nir_def *ssa = clone_ssa_impl(linkage, b, intr->src[0].ssa);
2265 clone = nir_load_deref(b, nir_instr_as_deref(ssa->parent_instr));
2266 break;
2267 }
2268
2269 case nir_intrinsic_load_input:
2270 case nir_intrinsic_load_per_primitive_input:
2271 case nir_intrinsic_load_interpolated_input: {
2272 /* We are cloning load_input in the producer for backward
2273 * inter-shader code motion. Replace the input load with the stored
2274 * output value. That way we can clone any expression using inputs
2275 * from the consumer in the producer.
2276 */
2277 assert(&linkage->producer_builder == b);
2278 clone = get_stored_value_for_load(linkage, &intr->instr);
2279 break;
2280 }
2281
2282 default:
2283 unreachable("unexpected intrinsic");
2284 }
2285 break;
2286 }
2287
2288 case nir_instr_type_deref: {
2289 nir_deref_instr *deref = nir_instr_as_deref(ssa->parent_instr);
2290 assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo));
2291
2292 /* Get the uniform from the original shader. */
2293 nir_variable *var = nir_deref_instr_get_variable(deref);
2294 assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos);
2295
2296 /* Declare the uniform in the target shader. If it's the same shader
2297 * (in the case of replacing output loads with a uniform), this has
2298 * no effect. If the variable already exists in the target shader, this
2299 * just returns the existing one.
2300 */
2301 var = nir_clone_uniform_variable(b->shader, var, linkage->spirv);
2302
2303 if (deref->deref_type == nir_deref_type_var) {
2304 clone = &nir_build_deref_var(b, var)->def;
2305 } else {
2306 nir_deref_instr *parent_orig = nir_deref_instr_parent(deref);
2307 nir_deref_instr *parent_clone =
2308 nir_instr_as_deref(clone_ssa_impl(linkage, b, &parent_orig->def)
2309 ->parent_instr);
2310
2311 switch (deref->deref_type) {
2312 case nir_deref_type_array: {
2313 nir_def *index = clone_ssa_impl(linkage, b, deref->arr.index.ssa);
2314 clone = &nir_build_deref_array(b, parent_clone, index)->def;
2315 break;
2316 }
2317 case nir_deref_type_struct:
2318 clone = &nir_build_deref_struct(b, parent_clone,
2319 deref->strct.index)->def;
2320 break;
2321 default:
2322 unreachable("invalid deref type");
2323 }
2324 }
2325 break;
2326 }
2327
2328 default:
2329 unreachable("unexpected instruction type");
2330 }
2331
2332 _mesa_hash_table_insert(linkage->clones_ht, ssa->parent_instr, clone);
2333 return clone;
2334 }
2335
2336 static nir_def *
2337 clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
2338 {
2339 assert(!linkage->clones_ht);
2340 linkage->clones_ht = _mesa_pointer_hash_table_create(NULL);
2341
2342 nir_def *clone = clone_ssa_impl(linkage, b, ssa);
2343
2344 _mesa_hash_table_destroy(linkage->clones_ht, NULL);
2345 linkage->clones_ht = NULL;
2346 return clone;
2347 }
2348
2349 /******************************************************************
2350 * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS)
2351 ******************************************************************/
2352
2353 static void
2354 remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot,
2355 nir_opt_varyings_progress *progress)
2356 {
2357 bool uses_xfb = false;
2358 remove_all_stores(linkage, slot, &uses_xfb, progress);
2359 clear_slot_info_after_removal(linkage, slot, uses_xfb);
2360 }
2361
2362 struct is_uniform_expr_state {
2363 struct linkage_info *linkage;
2364 unsigned cost;
2365 };
2366
2367 static bool
2368 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state);
2369
2370 static bool
2371 src_is_uniform_expression(nir_src *src, void *data)
2372 {
2373 return is_uniform_expression(src->ssa->parent_instr,
2374 (struct is_uniform_expr_state*)data);
2375 }
2376
2377 /**
2378 * Return whether instr is a uniform expression that can be moved into
2379 * the next shader.
2380 */
2381 static bool
2382 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state)
2383 {
2384 switch (instr->type) {
2385 case nir_instr_type_load_const:
2386 case nir_instr_type_undef:
2387 return true;
2388
2389 case nir_instr_type_alu:
2390 break;
2391
2392 case nir_instr_type_intrinsic:
2393 if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_deref)
2394 break;
2395 return false;
2396
2397 case nir_instr_type_deref:
2398 if (!can_move_deref_between_shaders(state->linkage, instr))
2399 return false;
2400 /* We need to iterate over the deref chain recursively. */
2401 break;
2402
2403 default:
2404 return false;
2405 }
2406
2407 if (!instr->pass_flags) {
2408 state->cost += state->linkage->varying_estimate_instr_cost ?
2409 state->linkage->varying_estimate_instr_cost(instr) : 1;
2410 instr->pass_flags = 1;
2411 return nir_foreach_src(instr, src_is_uniform_expression, state);
2412 }
2413 return true;
2414 }
2415
2416 /**
2417 * Propagate constants, uniforms, UBO loads, and uniform expressions
2418 * in output components to inputs loads in the next shader and output
2419 * loads in the current stage, and remove the output components.
2420 *
2421 * Uniform expressions are ALU expressions only sourcing constants, uniforms,
2422 * and UBO loads.
2423 */
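/* Example (sketch, GLSL-like, hypothetical uniform "u"):
 *
 *    VS: out float v; ...; v = u.scale * 2.0;   // uniform expression
 *    FS: ... = v;
 *
 * becomes
 *
 *    VS: (the store of v is removed)
 *    FS: ... = u.scale * 2.0;                   // rebuilt at each load
 *
 * provided the expression's cost stays within max_varying_expression_cost
 * and the uniform can be declared in the consumer.
 */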
2424 static void
2425 propagate_uniform_expressions(struct linkage_info *linkage,
2426 nir_opt_varyings_progress *progress)
2427 {
2428 unsigned i;
2429
2430 /* Find uniform expressions. If there are multiple stores, they should all
2431 * store the same value. That's guaranteed by output_equal_mask.
2432 */
2433 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2434 if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr)
2435 continue;
2436
2437 struct scalar_slot *slot = &linkage->slot[i];
2438 assert(!list_is_empty(&slot->producer.loads) ||
2439 !list_is_empty(&slot->consumer.loads));
2440
2441 struct is_uniform_expr_state state = {
2442 .linkage = linkage,
2443 .cost = 0,
2444 };
2445
2446 /* Clear pass_flags, which is used to prevent adding the cost of
2447 * the same instruction multiple times.
2448 */
2449 nir_shader_clear_pass_flags(linkage->producer_builder.shader);
2450
2451 if (!is_uniform_expression(slot->producer.value, &state))
2452 continue;
2453
2454 if (state.cost > linkage->max_varying_expression_cost)
2455 continue;
2456
2457 /* Colors can be propagated only if they are constant between [0, 1]
2458 * because that's the only case when the clamp vertex color state has
2459 * no effect.
2460 */
2461 if (is_interpolated_color(linkage, i) &&
2462 (slot->producer.value->type != nir_instr_type_load_const ||
2463 nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 ||
2464 nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1))
2465 continue;
2466
2467 /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's
2468 * the coord replace value.
2469 */
2470 if (is_interpolated_texcoord(linkage, i)) {
2471 assert(i % 2 == 0); /* high 16-bit slots disallowed */
2472
2473 if (i % 8 == 0 || /* TEXn.x */
2474 i % 8 == 2 || /* TEXn.y */
2475 slot->producer.value->type != nir_instr_type_load_const)
2476 continue;
2477
2478 float value =
2479 nir_instr_as_load_const(slot->producer.value)->value[0].f32;
2480
2481 /* This ignores signed zeros, but those are destroyed by
2482 * interpolation, so it doesn't matter.
2483 */
2484 if ((i % 8 == 4 && value != 0) ||
2485 (i % 8 == 6 && value != 1))
2486 continue;
2487 }
2488
2489 /* Clear pass_flags, which is used by clone_ssa. */
2490 nir_shader_clear_pass_flags(linkage->producer_builder.shader);
2491
2492 /* Replace all loads. Do that for both input and output loads. */
2493 for (unsigned list_index = 0; list_index < 2; list_index++) {
2494 struct list_head *load = list_index ? &slot->producer.loads :
2495 &slot->consumer.loads;
2496 nir_builder *b = list_index ? &linkage->producer_builder :
2497 &linkage->consumer_builder;
2498
2499 list_for_each_entry(struct list_node, node, load, head) {
2500 nir_intrinsic_instr *loadi = node->instr;
2501 b->cursor = nir_before_instr(&loadi->instr);
2502
2503 /* Copy the uniform expression before the load. */
2504 nir_def *clone = clone_ssa(linkage, b,
2505 nir_instr_def(slot->producer.value));
2506
2507 /* Interpolation converts Infs to NaNs. If we skip it, we need to
2508 * convert Infs to NaNs manually.
2509 */
2510 if (loadi->intrinsic == nir_intrinsic_load_interpolated_input &&
2511 preserve_nans(b->shader, clone->bit_size))
2512 clone = build_convert_inf_to_nan(b, clone);
2513
2514 /* Replace the original load. */
2515 nir_def_replace(&loadi->def, clone);
2516 *progress |= list_index ? nir_progress_producer :
2517 nir_progress_consumer;
2518 }
2519 }
2520
2521 /* Clear the lists. */
2522 list_inithead(&slot->producer.loads);
2523 list_inithead(&slot->consumer.loads);
2524
2525 /* Remove all stores now that loads have been replaced. */
2526 remove_all_stores_and_clear_slot(linkage, i, progress);
2527 }
2528 }
2529
2530 /******************************************************************
2531 * OUTPUT DEDUPLICATION
2532 ******************************************************************/
2533
2534 /* We can only deduplicate outputs that have the same qualifier, and color
2535 * components must be deduplicated separately because they are affected by GL
2536 * states.
2537 *
2538 * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for
2539 * deduplication as long as it's not flat.
2540 *
2541 * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat
2542 * by the flatshade state, so it can't be deduplicated with
2543 * QUAL_COLOR_INTERP_ANY, which is never flat.
2544 */
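/* Deduplication example (sketch): if the producer stores the same SSA value
 * to both VAR0.x and VAR3.y and both are read with the same qualifier, the
 * VAR3.y loads are redirected to VAR0.x's semantics/component/base and the
 * VAR3.y stores are removed, leaving that slot free for compaction.
 */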
2545 enum var_qualifier {
2546 QUAL_PATCH,
2547 QUAL_VAR_FLAT,
2548 QUAL_COLOR_FLAT,
2549 QUAL_EXPLICIT,
2550 QUAL_EXPLICIT_STRICT,
2551 QUAL_PER_PRIMITIVE,
2552 /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
2553 QUAL_VAR_INTERP_ANY,
2554 QUAL_COLOR_INTERP_ANY,
2555 QUAL_COLOR_SHADEMODEL_ANY,
2556 /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */
2557 QUAL_VAR_PERSP_PIXEL,
2558 QUAL_VAR_PERSP_CENTROID,
2559 QUAL_VAR_PERSP_SAMPLE,
2560 QUAL_VAR_LINEAR_PIXEL,
2561 QUAL_VAR_LINEAR_CENTROID,
2562 QUAL_VAR_LINEAR_SAMPLE,
2563 QUAL_COLOR_PERSP_PIXEL,
2564 QUAL_COLOR_PERSP_CENTROID,
2565 QUAL_COLOR_PERSP_SAMPLE,
2566 QUAL_COLOR_LINEAR_PIXEL,
2567 QUAL_COLOR_LINEAR_CENTROID,
2568 QUAL_COLOR_LINEAR_SAMPLE,
2569 QUAL_COLOR_SHADEMODEL_PIXEL,
2570 QUAL_COLOR_SHADEMODEL_CENTROID,
2571 QUAL_COLOR_SHADEMODEL_SAMPLE,
2572 NUM_DEDUP_QUALIFIERS,
2573
2574 QUAL_SKIP,
2575 QUAL_UNKNOWN,
2576 };
2577
2578 /* Return the input qualifier if all loads use the same one, else skip.
2579 * This is only used by output deduplication to determine input compatibility.
2580 */
2581 static enum var_qualifier
2582 get_input_qualifier(struct linkage_info *linkage, unsigned i)
2583 {
2584 assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
2585 struct scalar_slot *slot = &linkage->slot[i];
2586 bool is_color = is_interpolated_color(linkage, i);
2587 nir_intrinsic_instr *load =
2588 list_first_entry(&slot->consumer.loads, struct list_node, head)->instr;
2589
2590 if (load->intrinsic == nir_intrinsic_load_input)
2591 return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT;
2592
2593 if (load->intrinsic == nir_intrinsic_load_per_primitive_input)
2594 return QUAL_PER_PRIMITIVE;
2595
2596 if (load->intrinsic == nir_intrinsic_load_input_vertex) {
2597 return nir_intrinsic_io_semantics(load).interp_explicit_strict ?
2598 QUAL_EXPLICIT_STRICT : QUAL_EXPLICIT;
2599 }
2600
2601 assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
2602
2603 nir_instr *baryc_instr = load->src[0].ssa->parent_instr;
2604 nir_intrinsic_instr *baryc = baryc_instr->type == nir_instr_type_intrinsic ?
2605 nir_instr_as_intrinsic(baryc_instr) : NULL;
2606
2607 if (linkage->has_flexible_interp) {
2608 if (is_color) {
2609 return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ?
2610 QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY;
2611 } else {
2612 return QUAL_VAR_INTERP_ANY;
2613 }
2614 }
2615
2616 /* This is either lowered barycentric_at_offset/at_sample or user
2617 * barycentrics. Treat it like barycentric_at_offset.
2618 */
2619 if (!baryc)
2620 return QUAL_SKIP;
2621
2622 /* If interpolateAt{Centroid,Offset,Sample} is used, see if there is
2623 * another load that doesn't use those, so that we get the real qualifier.
2624 */
2625 if (baryc->intrinsic == nir_intrinsic_load_barycentric_centroid ||
2626 baryc->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
2627 baryc->intrinsic == nir_intrinsic_load_barycentric_at_sample) {
2628 list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2629 nir_intrinsic_instr *bar =
2630 nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
2631
2632 if (bar->intrinsic != nir_intrinsic_load_barycentric_centroid &&
2633 bar->intrinsic != nir_intrinsic_load_barycentric_at_offset &&
2634 bar->intrinsic != nir_intrinsic_load_barycentric_at_sample) {
2635 baryc = bar;
2636 break;
2637 }
2638 }
2639 }
2640
2641 /* Get the exact interpolation qualifier. */
2642 unsigned pixel_location;
2643 enum var_qualifier qual;
2644
2645 switch (baryc->intrinsic) {
2646 case nir_intrinsic_load_barycentric_pixel:
2647 pixel_location = 0;
2648 break;
2649 case nir_intrinsic_load_barycentric_centroid:
2650 pixel_location = 1;
2651 break;
2652 case nir_intrinsic_load_barycentric_sample:
2653 pixel_location = 2;
2654 break;
2655 case nir_intrinsic_load_barycentric_at_offset:
2656 case nir_intrinsic_load_barycentric_at_sample:
2657 /* Don't deduplicate outputs that are interpolated at offset/sample. */
2658 return QUAL_SKIP;
2659 default:
2660 unreachable("unexpected barycentric src");
2661 }
2662
2663 switch (nir_intrinsic_interp_mode(baryc)) {
2664 case INTERP_MODE_NONE:
2665 qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL :
2666 QUAL_VAR_PERSP_PIXEL;
2667 break;
2668 case INTERP_MODE_SMOOTH:
2669 qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL;
2670 break;
2671 case INTERP_MODE_NOPERSPECTIVE:
2672 qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL;
2673 break;
2674 default:
2675 unreachable("unexpected interp mode");
2676 }
2677
2678 /* The ordering of the "qual" enum was carefully chosen to make this
2679 * addition correct.
2680 */
2681 STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID);
2682 STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE);
2683 STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID);
2684 STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE);
2685 STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID);
2686 STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE);
2687 STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID);
2688 STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE);
2689 STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 ==
2690 QUAL_COLOR_SHADEMODEL_CENTROID);
2691 STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 ==
2692 QUAL_COLOR_SHADEMODEL_SAMPLE);
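   /* E.g., a smooth non-color input loaded via load_barycentric_centroid
    * yields QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID here.
    */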
2693 return qual + pixel_location;
2694 }
2695
2696 static void
2697 deduplicate_outputs(struct linkage_info *linkage,
2698 nir_opt_varyings_progress *progress)
2699 {
2700 struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL};
2701 unsigned i;
2702
2703 /* Find duplicated outputs. If there are multiple stores, they should all
2704 * store the same value as all stores of some other output. That's
2705 * guaranteed by output_equal_mask.
2706 */
2707 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2708 if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
2709 continue;
2710
2711 struct scalar_slot *slot = &linkage->slot[i];
2712 enum var_qualifier qualifier;
2713 gl_varying_slot var_slot = vec4_slot(i);
2714
2715 /* Determine which qualifier this slot has. */
2716 if ((var_slot >= VARYING_SLOT_PATCH0 &&
2717 var_slot <= VARYING_SLOT_PATCH31) ||
2718 var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
2719 var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
2720 qualifier = QUAL_PATCH;
2721 else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
2722 qualifier = QUAL_VAR_FLAT;
2723 else
2724 qualifier = get_input_qualifier(linkage, i);
2725
2726 if (qualifier == QUAL_SKIP)
2727 continue;
2728
2729 struct hash_table **table = &tables[qualifier];
2730 if (!*table)
2731 *table = _mesa_pointer_hash_table_create(NULL);
2732
2733 nir_instr *value = slot->producer.value;
2734
2735 struct hash_entry *entry = _mesa_hash_table_search(*table, value);
2736 if (!entry) {
2737 _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i);
2738 continue;
2739 }
2740
2741 /* We've found a duplicate. Redirect loads and remove stores. */
2742 struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
2743 nir_intrinsic_instr *store =
2744 list_first_entry(&found_slot->producer.stores,
2745 struct list_node, head)->instr;
2746 nir_io_semantics sem = nir_intrinsic_io_semantics(store);
2747 unsigned component = nir_intrinsic_component(store);
2748
2749 /* Redirect loads. */
2750 for (unsigned list_index = 0; list_index < 2; list_index++) {
2751 struct list_head *src_loads = list_index ? &slot->producer.loads :
2752 &slot->consumer.loads;
2753 struct list_head *dst_loads = list_index ? &found_slot->producer.loads :
2754 &found_slot->consumer.loads;
2755 bool has_progress = !list_is_empty(src_loads);
2756
2757 list_for_each_entry(struct list_node, iter, src_loads, head) {
2758 nir_intrinsic_instr *loadi = iter->instr;
2759
2760 nir_intrinsic_set_io_semantics(loadi, sem);
2761 nir_intrinsic_set_component(loadi, component);
2762
2763 /* We also need to set the base to match the duplicate load, so
2764 * that CSE can eliminate it.
2765 */
2766 if (!list_is_empty(dst_loads)) {
2767 struct list_node *first =
2768 list_first_entry(dst_loads, struct list_node, head);
2769 nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr));
2770 } else {
2771 /* Use the base of the found store if there are no loads (it can
2772 * only happen with TCS).
2773 */
2774 assert(list_index == 0);
2775 nir_intrinsic_set_base(loadi, nir_intrinsic_base(store));
2776 }
2777 }
2778
2779 if (has_progress) {
2780 /* Move the redirected loads to the found slot, so that compaction
2781 * can find them.
2782 */
2783 list_splicetail(src_loads, dst_loads);
2784 list_inithead(src_loads);
2785
2786 *progress |= list_index ? nir_progress_producer :
2787 nir_progress_consumer;
2788 }
2789 }
2790
2791 /* Remove all duplicated stores now that loads have been redirected. */
2792 remove_all_stores_and_clear_slot(linkage, i, progress);
2793 }
2794
2795 for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
2796 _mesa_hash_table_destroy(tables[i], NULL);
2797 }
2798
2799 /******************************************************************
2800 * FIND OPEN-CODED TES INPUT INTERPOLATION
2801 ******************************************************************/
2802
2803 static nir_alu_instr *
2804 get_single_use_as_alu(nir_def *def)
2805 {
2806 /* Only 1 use allowed. */
2807 if (!list_is_singular(&def->uses))
2808 return NULL;
2809
2810 nir_instr *instr =
2811 nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link));
2812 if (instr->type != nir_instr_type_alu)
2813 return NULL;
2814
2815 return nir_instr_as_alu(instr);
2816 }
2817
2818 static nir_alu_instr *
2819 check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load,
2820 unsigned *vertex_index,
2821 unsigned *vertices_used,
2822 unsigned max_vertices)
2823 {
2824 if (load->intrinsic != nir_intrinsic_load_per_vertex_input)
2825 return NULL;
2826
2827 /* Check the vertex index. Each vertex can be loaded only once. */
2828 if (!nir_src_is_const(load->src[0]))
2829          return NULL;
2830
2831 *vertex_index = nir_src_as_uint(load->src[0]);
2832 if (*vertex_index >= max_vertices ||
2833 *vertices_used & BITFIELD_BIT(*vertex_index))
2834          return NULL;
2835
2836 *vertices_used |= BITFIELD_BIT(*vertex_index);
2837
2838 return get_single_use_as_alu(&load->def);
2839 }
2840
2841 static bool
2842 gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul,
2843 unsigned vertex_index, unsigned *tess_coord_swizzle,
2844 unsigned *tess_coord_used, nir_def **load_tess_coord)
2845 {
2846 unsigned other_src = fmul->src[0].src.ssa == &load->def;
2847 nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr;
2848
2849 assert(fmul->src[!other_src].swizzle[0] == 0);
2850
2851 if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD))
2852 return false;
2853
2854 unsigned tess_coord_component = fmul->src[other_src].swizzle[0];
2855 /* Each tesscoord component can be used only once. */
2856 if (*tess_coord_used & BITFIELD_BIT(tess_coord_component))
2857 return false;
2858
2859 *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index);
2860 *tess_coord_used |= BITFIELD_BIT(tess_coord_component);
2861 *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def;
2862 return true;
2863 }
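
/* tess_coord_swizzle packs, in 4 bits per vertex index, which TessCoord
 * component multiplies that vertex's load. For example:
 *
 *    in[0]*TessCoord.x + in[1]*TessCoord.y + in[2]*TessCoord.z -> 0x210 (UVW)
 *    in[0]*TessCoord.z + in[1]*TessCoord.x + in[2]*TessCoord.y -> 0x102 (WUV)
 *
 * Only these two orderings are accepted by the matchers below.
 */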
2864
2865 /**
2866 * Find interpolation of the form:
2867 * input[0].slot * TessCoord.a +
2868 * input[1].slot * TessCoord.b +
2869 * input[2].slot * TessCoord.c;
2870 *
2871 * a,b,c can be any of x,y,z, but each can occur only once.
2872 */
2873 static bool
2874 find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i)
2875 {
2876 struct scalar_slot *slot = &linkage->slot[i];
2877 unsigned vertices_used = 0;
2878 unsigned tess_coord_used = 0;
2879 unsigned tess_coord_swizzle = 0;
2880 unsigned num_fmuls = 0, num_fadds = 0;
2881 nir_alu_instr *fadds[2];
2882 nir_def *load_tess_coord = NULL;
2883
2884 /* Find 3 multiplications by TessCoord and their uses, which must be
2885 * fadds.
2886 */
2887 list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2888 unsigned vertex_index;
2889 nir_alu_instr *fmul =
2890 check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2891 &vertices_used, 3);
2892       /* Only a maximum of 3 loads is expected. Also reject exact ops because
2893        * we are going to apply an inexact transformation to them.
2894 */
2895 if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 ||
2896 !gather_fmul_tess_coord(iter->instr, fmul, vertex_index,
2897 &tess_coord_swizzle, &tess_coord_used,
2898 &load_tess_coord))
2899 return false;
2900
2901 num_fmuls++;
2902
2903 /* The multiplication must only be used by fadd. Also reject exact ops.
2904 */
2905 nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def);
2906 if (!fadd || fadd->op != nir_op_fadd || fadd->exact)
2907 return false;
2908
2909 /* The 3 fmuls must only be used by 2 fadds. */
2910 unsigned i;
2911 for (i = 0; i < num_fadds; i++) {
2912 if (fadds[i] == fadd)
2913 break;
2914 }
2915 if (i == num_fadds) {
2916 if (num_fadds == 2)
2917 return false;
2918
2919 fadds[num_fadds++] = fadd;
2920 }
2921 }
2922
2923 if (num_fmuls != 3 || num_fadds != 2)
2924 return false;
2925
2926 assert(tess_coord_used == 0x7);
2927
2928 /* We have found that the only uses of the 3 fmuls are 2 fadds, which
2929 * implies that at least 2 fmuls are used by the same fadd.
2930 *
2931 * Check that 1 fadd is used by the other fadd, which can only be
2932 * the result of the TessCoord interpolation.
2933 */
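   /* For example, for (in[0]*u + in[1]*v) + in[2]*w, the inner fadd is
    * consumed only by the outer fadd, so the outer fadd is recorded below as
    * tes_interp_load, i.e. the value standing for the whole interpolation.
    */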
2934 for (unsigned i = 0; i < 2; i++) {
2935 if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) {
2936 switch (tess_coord_swizzle) {
2937 case 0x210:
2938 slot->consumer.tes_interp_load = fadds[!i];
2939 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2940 slot->consumer.tes_load_tess_coord = load_tess_coord;
2941 return true;
2942
2943 case 0x102:
2944 slot->consumer.tes_interp_load = fadds[!i];
2945 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2946 slot->consumer.tes_load_tess_coord = load_tess_coord;
2947 return true;
2948
2949 default:
2950 return false;
2951 }
2952 }
2953 }
2954
2955 return false;
2956 }
2957
2958 /**
2959 * Find interpolation of the form:
2960 * fma(input[0].slot, TessCoord.a,
2961 * fma(input[1].slot, TessCoord.b,
2962 * input[2].slot * TessCoord.c))
2963 *
2964 * a,b,c can be any of x,y,z, but each can occur only once.
2965 */
2966 static bool
2967 find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i)
2968 {
2969 struct scalar_slot *slot = &linkage->slot[i];
2970 unsigned vertices_used = 0;
2971 unsigned tess_coord_used = 0;
2972 unsigned tess_coord_swizzle = 0;
2973 unsigned num_fmuls = 0, num_ffmas = 0;
2974 nir_alu_instr *ffmas[2], *fmul = NULL;
2975 nir_def *load_tess_coord = NULL;
2976
2977 list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2978 unsigned vertex_index;
2979 nir_alu_instr *alu =
2980 check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2981 &vertices_used, 3);
2982
2983 /* Reject exact ops because we are going to do an inexact transformation
2984 * with it.
2985 */
2986 if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) ||
2987 alu->exact ||
2988 !gather_fmul_tess_coord(iter->instr, alu, vertex_index,
2989 &tess_coord_swizzle, &tess_coord_used,
2990 &load_tess_coord))
2991 return false;
2992
2993 /* The multiplication must only be used by ffma. */
2994 if (alu->op == nir_op_fmul) {
2995 nir_alu_instr *ffma = get_single_use_as_alu(&alu->def);
2996 if (!ffma || ffma->op != nir_op_ffma)
2997 return false;
2998
2999 if (num_fmuls == 1)
3000 return false;
3001
3002 fmul = alu;
3003 num_fmuls++;
3004 } else {
3005 if (num_ffmas == 2)
3006 return false;
3007
3008 ffmas[num_ffmas++] = alu;
3009 }
3010 }
3011
3012 if (num_fmuls != 1 || num_ffmas != 2)
3013 return false;
3014
3015 assert(tess_coord_used == 0x7);
3016
3017 /* We have found that fmul has only 1 use and it's ffma, and there are 2
3018 * ffmas. Fail if neither ffma is using fmul.
3019 */
3020 if (ffmas[0]->src[2].src.ssa != &fmul->def &&
3021 ffmas[1]->src[2].src.ssa != &fmul->def)
3022 return false;
3023
3024 /* If one ffma is using the other ffma, it's guaranteed to be src[2]. */
3025 for (unsigned i = 0; i < 2; i++) {
3026 if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) {
3027 switch (tess_coord_swizzle) {
3028 case 0x210:
3029 slot->consumer.tes_interp_load = ffmas[!i];
3030 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
3031 slot->consumer.tes_load_tess_coord = load_tess_coord;
3032 return true;
3033
3034 case 0x102:
3035 slot->consumer.tes_interp_load = ffmas[!i];
3036 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
3037 slot->consumer.tes_load_tess_coord = load_tess_coord;
3038 return true;
3039
3040 default:
3041 return false;
3042 }
3043 }
3044 }
3045
3046 return false;
3047 }
3048
3049 static void
3050 find_open_coded_tes_input_interpolation(struct linkage_info *linkage)
3051 {
3052 if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL)
3053 return;
3054
3055 unsigned i;
3056 BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) {
3057 if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3058 vec4_slot(i) <= VARYING_SLOT_PATCH31)
3059 continue;
3060 if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
3061 continue;
3062 if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
3063 continue;
3064 }
3065
3066 BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) {
3067 if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3068 vec4_slot(i) <= VARYING_SLOT_PATCH31)
3069 continue;
3070 if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
3071 continue;
3072 if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
3073 continue;
3074 }
3075 }
3076
3077 /******************************************************************
3078 * BACKWARD INTER-SHADER CODE MOTION
3079 ******************************************************************/
3080
3081 #define NEED_UPDATE_MOVABLE_FLAGS(instr) \
3082 (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE)))
3083
3084 #define GET_SRC_INTERP(alu, i) \
3085 ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK)
3086
3087 static bool
3088 can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu)
3089 {
3090 /* Exact ALUs can't be moved across interpolation. */
3091 if (alu->exact)
3092 return false;
3093
3094    /* Interpolation converts Infs to NaNs. If we turn the result of an ALU
3095     * instruction into a new interpolated input, Infs become NaNs for that
3096     * result, while the Inf-to-NaN conversion is removed from the interpolated
3097     * values it sources. We can't do that if Infs and NaNs must be preserved.
3098 */
3099 if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size))
3100 return false;
3101
3102 switch (alu->op) {
3103 /* Always legal if the sources are interpolated identically because:
3104 * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
3105 * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
3106 */
3107 case nir_op_fadd:
3108 case nir_op_fsub:
3109 /* This is the same as multiplying by -1, which is always legal, see fmul.
3110 */
3111 case nir_op_fneg:
3112 case nir_op_mov:
3113 return true;
3114
3115 /* At least one side of the multiplication must be convergent because this
3116     * is the only multiplication identity that holds:
3117 * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
3118 */
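   /* Note that interp(x, i, j) * interp(y, i, j) != interp(x * y, i, j) in
    * general because interpolation is linear, not multiplicative, which is
    * why multiplying two non-convergent interpolated values is rejected.
    */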
3119 case nir_op_fmul:
3120 case nir_op_fmulz:
3121 case nir_op_ffma:
3122 case nir_op_ffmaz:
3123 return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT ||
3124 GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
3125
3126 case nir_op_fdiv:
3127 /* The right side must be convergent, which then follows the fmul rule.
3128 */
3129 return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
3130
3131 case nir_op_flrp:
3132 /* Using the same rule as fmul. */
3133 return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT &&
3134 GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) ||
3135 GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT;
3136
3137 default:
3138 /* Moving other ALU instructions across interpolation is illegal. */
3139 return false;
3140 }
3141 }
3142
3143 /* Determine whether an instruction is movable from the consumer to
3144 * the producer. Also determine which interpolation modes each ALU instruction
3145 * should use if its value was promoted to a new input.
3146 */
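/* For example, fadd(load_interpolated_input, load_const) is movable and
 * inherits the load's interpolation flag because constants are treated as
 * convergent, while an ALU instruction mixing two differently interpolated
 * inputs (e.g. perspective-pixel and linear-centroid) is marked unmovable.
 */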
3147 static void
3148 update_movable_flags(struct linkage_info *linkage, nir_instr *instr)
3149 {
3150 /* This function shouldn't be called more than once for each instruction
3151 * to minimize recursive calling.
3152 */
3153 assert(NEED_UPDATE_MOVABLE_FLAGS(instr));
3154
3155 switch (instr->type) {
3156 case nir_instr_type_undef:
3157 case nir_instr_type_load_const:
3158 /* Treat constants as convergent, which means compatible with both flat
3159 * and non-flat inputs.
3160 */
3161 instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
3162 return;
3163
3164 case nir_instr_type_alu: {
3165 nir_alu_instr *alu = nir_instr_as_alu(instr);
3166 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
3167 unsigned alu_interp;
3168
3169 /* Make vector ops unmovable. They are technically movable but more
3170 * complicated, and NIR should be scalarized for this pass anyway.
3171 * The only remaining vector ops should be vecN for intrinsic sources.
3172 */
3173 if (alu->def.num_components > 1) {
3174 instr->pass_flags |= FLAG_UNMOVABLE;
3175 return;
3176 }
3177
3178 alu_interp = FLAG_INTERP_CONVERGENT;
3179
3180 for (unsigned i = 0; i < num_srcs; i++) {
3181 nir_instr *src_instr = alu->src[i].src.ssa->parent_instr;
3182
3183 if (NEED_UPDATE_MOVABLE_FLAGS(src_instr))
3184 update_movable_flags(linkage, src_instr);
3185
3186 if (src_instr->pass_flags & FLAG_UNMOVABLE) {
3187 instr->pass_flags |= FLAG_UNMOVABLE;
3188 return;
3189 }
3190
3191 /* Determine which interpolation mode this ALU instruction should
3192 * use if it was promoted to a new input.
3193 */
3194 unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK;
3195
3196 if (alu_interp == src_interp ||
3197 src_interp == FLAG_INTERP_CONVERGENT) {
3198 /* Nothing to do. */
3199 } else if (alu_interp == FLAG_INTERP_CONVERGENT) {
3200 alu_interp = src_interp;
3201 } else {
3202 assert(alu_interp != FLAG_INTERP_CONVERGENT &&
3203 src_interp != FLAG_INTERP_CONVERGENT &&
3204 alu_interp != src_interp);
3205 /* The ALU instruction sources conflicting interpolation flags.
3206 * It can never become a new input.
3207 */
3208 instr->pass_flags |= FLAG_UNMOVABLE;
3209 return;
3210 }
3211 }
3212
3213 /* Check if we can move the ALU instruction across an interpolated
3214 * load into the previous shader.
3215 */
3216 if (alu_interp > FLAG_INTERP_FLAT &&
3217 !can_move_alu_across_interp(linkage, alu)) {
3218 instr->pass_flags |= FLAG_UNMOVABLE;
3219 return;
3220 }
3221
3222 instr->pass_flags |= FLAG_MOVABLE | alu_interp;
3223 return;
3224 }
3225
3226 case nir_instr_type_intrinsic: {
3227 /* Movable input loads already have FLAG_MOVABLE on them.
3228 * Unmovable input loads skipped by initialization get UNMOVABLE here.
3229 * (e.g. colors, texcoords)
3230 *
3231 * The only other movable intrinsic is load_deref for uniforms and UBOs.
3232 * Other intrinsics are not movable.
3233 */
3234 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3235
3236 if (intr->intrinsic == nir_intrinsic_load_deref) {
3237 nir_instr *deref = intr->src[0].ssa->parent_instr;
3238
3239 if (NEED_UPDATE_MOVABLE_FLAGS(deref))
3240 update_movable_flags(linkage, deref);
3241
3242 instr->pass_flags |= deref->pass_flags;
3243 return;
3244 }
3245
3246 instr->pass_flags |= FLAG_UNMOVABLE;
3247 return;
3248 }
3249
3250 case nir_instr_type_deref: {
3251 if (!can_move_deref_between_shaders(linkage, instr)) {
3252 instr->pass_flags |= FLAG_UNMOVABLE;
3253 return;
3254 }
3255
3256 nir_deref_instr *deref = nir_instr_as_deref(instr);
3257 nir_deref_instr *parent = nir_deref_instr_parent(deref);
3258
3259 if (parent) {
3260 if (NEED_UPDATE_MOVABLE_FLAGS(&parent->instr))
3261 update_movable_flags(linkage, &parent->instr);
3262
3263 if (parent->instr.pass_flags & FLAG_UNMOVABLE) {
3264 instr->pass_flags |= FLAG_UNMOVABLE;
3265 return;
3266 }
3267 }
3268
3269 switch (deref->deref_type) {
3270 case nir_deref_type_var:
3271 instr->pass_flags |= FLAG_MOVABLE;
3272 return;
3273
3274 case nir_deref_type_struct:
3275 assert(parent->instr.pass_flags & FLAG_MOVABLE);
3276 instr->pass_flags |= parent->instr.pass_flags;
3277 return;
3278
3279 case nir_deref_type_array: {
3280 nir_instr *index = deref->arr.index.ssa->parent_instr;
3281
3282 if (NEED_UPDATE_MOVABLE_FLAGS(index))
3283 update_movable_flags(linkage, index);
3284
3285 /* Integer array indices should be movable only if they are
3286 * convergent or flat.
3287 */
3288 ASSERTED unsigned index_interp = index->pass_flags & FLAG_INTERP_MASK;
3289 assert(index->pass_flags & FLAG_UNMOVABLE ||
3290 (index_interp == FLAG_INTERP_CONVERGENT ||
3291 index_interp == FLAG_INTERP_FLAT));
3292
3293 if (parent) {
3294 unsigned parent_interp = parent->instr.pass_flags & FLAG_INTERP_MASK;
3295
3296 /* Check if the interpolation flags are compatible. */
3297 if (parent_interp != FLAG_INTERP_CONVERGENT &&
3298 index_interp != FLAG_INTERP_CONVERGENT &&
3299 parent_interp != index_interp) {
3300 instr->pass_flags |= FLAG_UNMOVABLE;
3301 return;
3302 }
3303
3304 /* Pick the one that isn't convergent because convergent inputs
3305 * can be in expressions with any other qualifier.
3306 */
3307 if (parent_interp == FLAG_INTERP_CONVERGENT)
3308 instr->pass_flags |= index->pass_flags;
3309 else
3310 instr->pass_flags |= parent->instr.pass_flags;
3311 } else {
3312 instr->pass_flags |= index->pass_flags;
3313 }
3314 return;
3315 }
3316
3317 default:
3318 instr->pass_flags |= FLAG_UNMOVABLE;
3319 return;
3320 }
3321 }
3322
3323 default:
3324 instr->pass_flags |= FLAG_UNMOVABLE;
3325 return;
3326 }
3327 }
3328
3329 /* Gather the input loads used by the post-dominator using DFS. */
3330 static void
3331 gather_used_input_loads(nir_instr *instr,
3332 nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS],
3333 unsigned *num_loads)
3334 {
3335 switch (instr->type) {
3336 case nir_instr_type_undef:
3337 case nir_instr_type_load_const:
3338 return;
3339
3340 case nir_instr_type_alu: {
3341 nir_alu_instr *alu = nir_instr_as_alu(instr);
3342 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
3343
3344 for (unsigned i = 0; i < num_srcs; i++) {
3345 gather_used_input_loads(alu->src[i].src.ssa->parent_instr,
3346 loads, num_loads);
3347 }
3348 return;
3349 }
3350
3351 case nir_instr_type_intrinsic: {
3352 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3353
3354 switch (intr->intrinsic) {
3355 case nir_intrinsic_load_tess_coord:
3356 return;
3357
3358 case nir_intrinsic_load_deref:
3359 gather_used_input_loads(intr->src[0].ssa->parent_instr,
3360 loads, num_loads);
3361 return;
3362
3363 case nir_intrinsic_load_input:
3364 case nir_intrinsic_load_per_vertex_input:
3365 case nir_intrinsic_load_interpolated_input:
3366 if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) {
3367 assert(*num_loads < NUM_SCALAR_SLOTS*8);
3368 loads[(*num_loads)++] = intr;
3369 intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED;
3370 }
3371 return;
3372
3373 default:
3374 printf("%u\n", intr->intrinsic);
3375 unreachable("unexpected intrinsic");
3376 }
3377 }
3378
3379 case nir_instr_type_deref: {
3380 nir_deref_instr *deref = nir_instr_as_deref(instr);
3381 nir_deref_instr *parent = nir_deref_instr_parent(deref);
3382
3383 if (parent)
3384 gather_used_input_loads(&parent->instr, loads, num_loads);
3385
3386 switch (deref->deref_type) {
3387 case nir_deref_type_var:
3388 case nir_deref_type_struct:
3389 return;
3390
3391 case nir_deref_type_array:
3392 gather_used_input_loads(deref->arr.index.ssa->parent_instr,
3393 loads, num_loads);
3394 return;
3395
3396 default:
3397 unreachable("unexpected deref type");
3398 }
3399 }
3400
3401 default:
3402 unreachable("unexpected instr type");
3403 }
3404 }
3405
3406 /* Move a post-dominator, which is an ALU opcode, into the previous shader,
3407 * and replace the post-dominator with a new input load.
3408 */
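/* For example, if the fragment shader computes fmul(input.x, 2.0) and that
 * fmul post-dominates the input load in the use graph (i.e. the load feeds
 * nothing else), the multiplication can be cloned in front of the output
 * store in the previous shader and the fragment shader then loads the
 * already-multiplied value instead.
 */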
3409 static bool
3410 try_move_postdominator(struct linkage_info *linkage,
3411 struct nir_use_dominance_state *postdom_state,
3412 nir_instr *postdom,
3413 nir_def *load_def,
3414 nir_intrinsic_instr *first_load,
3415 nir_opt_varyings_progress *progress)
3416 {
3417 #define PRINT 0
3418 #if PRINT
3419 printf("Trying to move post-dom: ");
3420 nir_print_instr(postdom, stdout);
3421 puts("");
3422 #endif
3423
3424 /* Gather the input loads used by the post-dominator using DFS. */
3425 nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8];
3426 unsigned num_loads = 0;
3427 gather_used_input_loads(postdom, loads, &num_loads);
3428 assert(num_loads && "no loads were gathered");
3429
3430 /* Clear the flag set by gather_used_input_loads. */
3431 for (unsigned i = 0; i < num_loads; i++)
3432 loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED;
3433
3434 /* For all the loads, the previous shader must have the corresponding
3435 * output stores in the same basic block because we are going to replace
3436 * them with 1 store. Only TCS and GS can have stores of different outputs
3437 * in different blocks.
3438 */
3439 nir_block *block = NULL;
3440
3441 for (unsigned i = 0; i < num_loads; i++) {
3442 unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3443 struct scalar_slot *slot = &linkage->slot[slot_index];
3444
3445 assert(list_is_singular(&slot->producer.stores));
3446 nir_intrinsic_instr *store =
3447 list_first_entry(&slot->producer.stores, struct list_node,
3448 head)->instr;
3449
3450 if (!block) {
3451 block = store->instr.block;
3452 continue;
3453 }
3454 if (block != store->instr.block)
3455 return false;
3456 }
3457
3458 assert(block);
3459
3460 #if PRINT
3461 printf("Post-dom accepted: ");
3462 nir_print_instr(postdom, stdout);
3463 puts("\n");
3464 #endif
3465
3466 /* Determine the scalar slot index of the new varying. It will reuse
3467 * the slot of the load we started from because the load will be
3468 * removed.
3469 */
3470 unsigned final_slot = intr_get_scalar_16bit_slot(first_load);
3471
3472 /* Replace the post-dominator in the consumer with a new input load.
3473 * Since we are reusing the same slot as the first load and it has
3474 * the right interpolation qualifiers, use it as the new load by using
3475 * it in place of the post-dominator.
3476 *
3477 * Boolean post-dominators are upcast in the producer and then downcast
3478 * in the consumer.
3479 */
3480 unsigned slot_index = final_slot;
3481 struct scalar_slot *slot = &linkage->slot[slot_index];
3482 nir_builder *b = &linkage->consumer_builder;
3483 b->cursor = nir_after_instr(load_def->parent_instr);
3484 nir_def *postdom_def = nir_instr_def(postdom);
3485 unsigned alu_interp = postdom->pass_flags & FLAG_INTERP_MASK;
3486 nir_def *new_input, *new_tes_loads[3];
3487 BITSET_WORD *mask;
3488
3489    /* Convergent instruction results that are not interpolatable (integer or
3490     * FP64) should not be moved: compaction can relocate convergent varyings
3491     * to interpolated vec4 slots, since the definition of convergent varyings
3492     * implies that they can be interpolated, which doesn't work with integer
3493     * and FP64 values.
3494 *
3495 * Check the result type and if it's not float and the driver doesn't
3496 * support convergent flat loads from interpolated vec4 slots, don't move
3497 * it.
3498 */
3499 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3500 alu_interp == FLAG_INTERP_CONVERGENT &&
3501 !linkage->can_mix_convergent_flat_with_interpolated &&
3502 (postdom->type != nir_instr_type_alu ||
3503 (postdom_def->bit_size != 16 && postdom_def->bit_size != 32) ||
3504 !(nir_op_infos[nir_instr_as_alu(postdom)->op].output_type & nir_type_float)))
3505 return false;
3506
3507 /* NIR can't do 1-bit inputs. Convert them to a bigger size. */
3508 assert(postdom_def->bit_size & (1 | 16 | 32));
3509 unsigned new_bit_size = postdom_def->bit_size;
3510
3511 if (new_bit_size == 1) {
3512 assert(alu_interp == FLAG_INTERP_CONVERGENT ||
3513 alu_interp == FLAG_INTERP_FLAT);
3514 /* TODO: We could use 16 bits instead, but that currently fails on AMD.
3515 */
3516 new_bit_size = 32;
3517 }
3518
3519 bool rewrite_convergent_to_flat =
3520 alu_interp == FLAG_INTERP_CONVERGENT &&
3521 linkage->can_mix_convergent_flat_with_interpolated;
3522
3523 /* Create the new input load. This creates a new load (or a series of
3524 * loads in case of open-coded TES interpolation) that's identical to
3525 * the original load(s).
3526 */
3527 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3528 alu_interp != FLAG_INTERP_FLAT && !rewrite_convergent_to_flat) {
3529 nir_def *baryc = NULL;
3530
3531 /* Determine the barycentric coordinates. */
3532 switch (alu_interp) {
3533 case FLAG_INTERP_PERSP_PIXEL:
3534 case FLAG_INTERP_LINEAR_PIXEL:
3535 baryc = nir_load_barycentric_pixel(b, 32);
3536 break;
3537 case FLAG_INTERP_PERSP_CENTROID:
3538 case FLAG_INTERP_LINEAR_CENTROID:
3539 baryc = nir_load_barycentric_centroid(b, 32);
3540 break;
3541 case FLAG_INTERP_PERSP_SAMPLE:
3542 case FLAG_INTERP_LINEAR_SAMPLE:
3543 baryc = nir_load_barycentric_sample(b, 32);
3544 break;
3545 default:
3546 baryc = first_load->src[0].ssa;
3547 break;
3548 }
3549
3550 if (baryc != first_load->src[0].ssa) {
3551 nir_intrinsic_instr *baryc_i =
3552 nir_instr_as_intrinsic(baryc->parent_instr);
3553
3554 if (alu_interp == FLAG_INTERP_LINEAR_PIXEL ||
3555 alu_interp == FLAG_INTERP_LINEAR_CENTROID ||
3556 alu_interp == FLAG_INTERP_LINEAR_SAMPLE)
3557 nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE);
3558 else
3559 nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH);
3560 }
3561
3562 new_input = nir_load_interpolated_input(
3563 b, 1, new_bit_size, baryc, nir_imm_int(b, 0),
3564 .base = nir_intrinsic_base(first_load),
3565 .component = nir_intrinsic_component(first_load),
3566 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3567 new_bit_size,
3568 .io_semantics = nir_intrinsic_io_semantics(first_load));
3569
3570 if (alu_interp == FLAG_INTERP_CONVERGENT) {
3571 mask = new_bit_size == 16 ? linkage->convergent16_mask
3572 : linkage->convergent32_mask;
3573 } else if (linkage->has_flexible_interp) {
3574 mask = new_bit_size == 16 ? linkage->interp_fp16_mask
3575 : linkage->interp_fp32_mask;
3576 } else {
3577 /* The index of the qualifier is encoded in alu_interp, so extract it. */
3578 unsigned i = (alu_interp - FLAG_INTERP_PERSP_PIXEL) >> 5;
3579 mask = new_bit_size == 16 ? linkage->interp_fp16_qual_masks[i]
3580 : linkage->interp_fp32_qual_masks[i];
3581 }
3582 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3583 alu_interp > FLAG_INTERP_FLAT) {
3584 nir_def *zero = nir_imm_int(b, 0);
3585
3586 for (unsigned i = 0; i < 3; i++) {
3587 new_tes_loads[i] =
3588 nir_load_per_vertex_input(b, 1, new_bit_size,
3589 i ? nir_imm_int(b, i) : zero, zero,
3590 .base = nir_intrinsic_base(first_load),
3591 .component = nir_intrinsic_component(first_load),
3592 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3593 new_bit_size,
3594 .io_semantics = nir_intrinsic_io_semantics(first_load));
3595 }
3596
3597 int remap_uvw[3] = {0, 1, 2};
3598 int remap_wuv[3] = {2, 0, 1};
3599 int *remap;
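      /* remap[i] selects which TessCoord channel multiplies the load of
       * vertex i, matching the UVW or WUV ordering detected by
       * find_open_coded_tes_input_interpolation.
       */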
3600
3601 switch (alu_interp) {
3602 case FLAG_INTERP_TES_TRIANGLE_UVW:
3603 remap = remap_uvw;
3604 break;
3605 case FLAG_INTERP_TES_TRIANGLE_WUV:
3606 remap = remap_wuv;
3607 break;
3608 default:
3609 unreachable("invalid TES interpolation mode");
3610 }
3611
3612 nir_def *tesscoord = slot->consumer.tes_load_tess_coord;
3613 nir_def *defs[3];
3614
3615 for (unsigned i = 0; i < 3; i++) {
3616 if (i == 0) {
3617 defs[i] = nir_fmul(b, new_tes_loads[i],
3618 nir_channel(b, tesscoord, remap[i]));
3619 } else {
3620 defs[i] = nir_ffma(b, new_tes_loads[i],
3621 nir_channel(b, tesscoord, remap[i]),
3622 defs[i - 1]);
3623 }
3624 }
3625 new_input = defs[2];
3626
3627 mask = new_bit_size == 16 ? linkage->flat16_mask
3628 : linkage->flat32_mask;
3629 } else {
3630 /* We have to rewrite convergent to flat here and not during compaction
3631 * because compaction adds code to convert Infs to NaNs for
3632 * "load_interpolated_input -> load_input" replacements, which corrupts
3633 * integer data.
3634 */
3635 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT ||
3636 alu_interp == FLAG_INTERP_FLAT || rewrite_convergent_to_flat);
3637
3638 new_input =
3639 nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0),
3640 .base = nir_intrinsic_base(first_load),
3641 .component = nir_intrinsic_component(first_load),
3642 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3643 new_bit_size,
3644 .io_semantics = nir_intrinsic_io_semantics(first_load));
3645
3646 mask = new_bit_size == 16 ? linkage->flat16_mask
3647 : linkage->flat32_mask;
3648
3649 if (rewrite_convergent_to_flat) {
3650 mask = new_bit_size == 16 ? linkage->convergent16_mask
3651 : linkage->convergent32_mask;
3652 }
3653 }
3654
3655 assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index));
3656 assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index));
3657
3658 /* Re-set the category of the new scalar input. This will cause
3659 * the compaction to treat it as a different type, so that it will be moved
3660 * into the vec4 that has compatible interpolation qualifiers.
3661 *
3662 * This shouldn't be done if any of the interp masks are not set, which
3663 * indicates that compaction is disallowed.
3664 */
3665 if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) ||
3666 BITSET_TEST(linkage->interp_fp16_mask, slot_index) ||
3667 BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, slot_index) ||
3668 BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, slot_index) ||
3669 BITSET_TEST(linkage->flat32_mask, slot_index) ||
3670 BITSET_TEST(linkage->flat16_mask, slot_index) ||
3671 BITSET_TEST(linkage->convergent32_mask, slot_index) ||
3672 BITSET_TEST(linkage->convergent16_mask, slot_index)) {
3673 BITSET_CLEAR(linkage->interp_fp32_mask, slot_index);
3674 for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++)
3675 BITSET_CLEAR(linkage->interp_fp32_qual_masks[i], slot_index);
3676 BITSET_CLEAR(linkage->interp_fp16_mask, slot_index);
3677 for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++)
3678 BITSET_CLEAR(linkage->interp_fp16_qual_masks[i], slot_index);
3679 BITSET_CLEAR(linkage->flat16_mask, slot_index);
3680 BITSET_CLEAR(linkage->flat32_mask, slot_index);
3681 BITSET_CLEAR(linkage->convergent16_mask, slot_index);
3682 BITSET_CLEAR(linkage->convergent32_mask, slot_index);
3683 BITSET_SET(mask, slot_index);
3684 }
3685
3686 /* Replace the existing load with the new load in the slot. */
3687 if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3688 alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) {
3689 /* For TES, replace all 3 loads. */
3690 unsigned i = 0;
3691 list_for_each_entry(struct list_node, iter, &slot->consumer.loads,
3692 head) {
3693 assert(i < 3);
3694 iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr);
3695 i++;
3696 }
3697
3698 assert(i == 3);
3699 assert(postdom_def->bit_size != 1);
3700
3701 slot->consumer.tes_interp_load =
3702 nir_instr_as_alu(new_input->parent_instr);
3703 } else {
3704 assert(list_is_singular(&slot->consumer.loads));
3705 list_first_entry(&slot->consumer.loads, struct list_node, head)->instr =
3706 nir_instr_as_intrinsic(new_input->parent_instr);
3707
3708 /* The input is a bigger type even if the post-dominator is boolean. */
3709 if (postdom_def->bit_size == 1)
3710 new_input = nir_ine_imm(b, new_input, 0);
3711 }
3712
3713 nir_def_rewrite_uses(postdom_def, new_input);
3714
3715 /* Clone the post-dominator at the end of the block in the producer
3716 * where the output stores are.
3717 */
3718 b = &linkage->producer_builder;
3719 b->cursor = nir_after_block_before_jump(block);
3720 nir_def *producer_clone = clone_ssa(linkage, b, postdom_def);
3721
3722 /* Boolean post-dominators are upcast in the producer because we can't
3723 * use 1-bit outputs.
3724 */
3725 if (producer_clone->bit_size == 1)
3726 producer_clone = nir_b2bN(b, producer_clone, new_bit_size);
3727
3728 /* Move the existing store to the end of the block and rewrite it to use
3729 * the post-dominator result.
3730 */
3731 nir_intrinsic_instr *store =
3732 list_first_entry(&linkage->slot[final_slot].producer.stores,
3733 struct list_node, head)->instr;
3734 nir_instr_move(b->cursor, &store->instr);
3735 if (nir_src_bit_size(store->src[0]) != producer_clone->bit_size)
3736 nir_intrinsic_set_src_type(store, nir_alu_type_get_base_type(nir_intrinsic_src_type(store)) |
3737 producer_clone->bit_size);
3738 nir_src_rewrite(&store->src[0], producer_clone);
3739
3740 /* Remove all loads and stores that we are replacing from the producer
3741 * and consumer.
3742 */
3743 for (unsigned i = 0; i < num_loads; i++) {
3744 unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3745
3746 if (slot_index == final_slot) {
3747 /* Keep the load and store that we reused. */
3748 continue;
3749 }
3750
3751 /* Remove loads and stores that are dead after the code motion. Only
3752 * those loads that are post-dominated by the post-dominator are dead.
3753 */
3754 struct scalar_slot *slot = &linkage->slot[slot_index];
3755 nir_instr *load;
3756
3757 if (slot->consumer.tes_interp_load) {
3758 load = &slot->consumer.tes_interp_load->instr;
3759
3760 /* With interpolated TES loads, we get here 3 times, once for each
3761 * per-vertex load. Skip this if we've been here before.
3762 */
3763 if (list_is_empty(&slot->producer.stores)) {
3764 assert(list_is_empty(&slot->consumer.loads));
3765 continue;
3766 }
3767 } else {
3768 assert(list_is_singular(&slot->consumer.loads));
3769 load = &list_first_entry(&slot->consumer.loads,
3770 struct list_node, head)->instr->instr;
3771 }
3772
3773 if (nir_instr_dominates_use(postdom_state, postdom, load)) {
3774 list_inithead(&slot->consumer.loads);
3775
3776 /* Remove stores. (transform feedback is allowed here, just not
3777 * in final_slot)
3778 */
3779 remove_all_stores_and_clear_slot(linkage, slot_index, progress);
3780 } else {
3781          /* If a load has 2 uses and one of them is moved into the previous
3782           * shader, making that use dead, the load and its associated store
3783           * can't be removed because one use remains. However, NIR still sees
3784           * 2 remaining uses because the dead use isn't removed from the IR;
3785           * it's left dangling there.
3786           *
3787           * When we run this optimization again and make the second use dead,
3788           * which makes the load dead, the output store in the producer still
3789           * isn't removed: the post-dominator of the second use doesn't
3790           * post-dominate the load because the first use was left dangling.
3791           *
3792           * To fix that, we could run DCE, but that would be costly because we
3793           * would need to re-gather all IO. Instead, replace dead uses with
3794           * undef here, so that when this code motion pass runs again, the load
3795           * has fewer uses and the corresponding output store is removed by
3796           * the code above.
3797 */
3798 nir_foreach_use_safe(src, nir_instr_def(load)) {
3799 if (nir_instr_dominates_use(postdom_state, postdom,
3800 nir_src_parent_instr(src))) {
3801 nir_src_rewrite(src, nir_undef(&linkage->consumer_builder,
3802 src->ssa->num_components,
3803 src->ssa->bit_size));
3804 }
3805 }
3806 }
3807 }
3808
3809 *progress |= nir_progress_producer | nir_progress_consumer;
3810 return true;
3811 }
3812
3813 static bool
3814 backward_inter_shader_code_motion(struct linkage_info *linkage,
3815 nir_opt_varyings_progress *progress)
3816 {
3817 /* These producers are not supported. The description at the beginning
3818 * suggests a possible workaround.
3819 */
3820 if (linkage->producer_stage == MESA_SHADER_GEOMETRY ||
3821 linkage->producer_stage == MESA_SHADER_MESH ||
3822 linkage->producer_stage == MESA_SHADER_TASK)
3823 return false;
3824
3825 /* Clear pass_flags. */
3826 nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
3827
3828 /* Gather inputs that can be moved into the previous shader. These are only
3829 * checked for the basic constraints for movability.
3830 */
3831 struct {
3832 nir_def *def;
3833 nir_intrinsic_instr *first_load;
3834 } movable_loads[NUM_SCALAR_SLOTS];
3835 unsigned num_movable_loads = 0;
3836 unsigned i;
3837
3838 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
3839 if (!can_optimize_varying(linkage,
3840 vec4_slot(i)).inter_shader_code_motion)
3841 continue;
3842
3843 struct scalar_slot *slot = &linkage->slot[i];
3844
3845 assert(!list_is_empty(&slot->producer.stores));
3846 assert(!is_interpolated_texcoord(linkage, i));
3847 assert(!is_interpolated_color(linkage, i));
3848
3849 /* Disallow producer loads. */
3850 if (!list_is_empty(&slot->producer.loads))
3851 continue;
3852
3853 /* There should be only 1 store per output. */
3854 if (!list_is_singular(&slot->producer.stores))
3855 continue;
3856
3857 nir_def *load_def = NULL;
3858 nir_intrinsic_instr *load =
3859 list_first_entry(&slot->consumer.loads, struct list_node,
3860 head)->instr;
3861
3862 nir_intrinsic_instr *store =
3863 list_first_entry(&slot->producer.stores, struct list_node,
3864 head)->instr;
3865
3866 /* Set interpolation flags.
3867 * Handle interpolated TES loads first because they are special.
3868 */
3869 if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3870 slot->consumer.tes_interp_load) {
3871 if (linkage->producer_stage == MESA_SHADER_VERTEX) {
3872 /* VS -> TES has no constraints on VS stores. */
3873 load_def = &slot->consumer.tes_interp_load->def;
3874 load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3875 slot->consumer.tes_interp_mode;
3876 } else {
3877 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3878 assert(store->intrinsic == nir_intrinsic_store_per_vertex_output);
3879
3880          /* The vertex index of the store must be InvocationID. */
3881 if (is_sysval(store->src[1].ssa->parent_instr,
3882 SYSTEM_VALUE_INVOCATION_ID)) {
3883 load_def = &slot->consumer.tes_interp_load->def;
3884 load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3885 slot->consumer.tes_interp_mode;
3886 } else {
3887 continue;
3888 }
3889 }
3890 } else {
3891 /* Allow only 1 load per input. CSE should be run before this. */
3892 if (!list_is_singular(&slot->consumer.loads))
3893 continue;
3894
3895 /* This can only be TCS -> TES, which is handled above and rejected
3896 * otherwise.
3897 */
3898 if (store->intrinsic == nir_intrinsic_store_per_vertex_output) {
3899 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3900 continue;
3901 }
3902
3903 /* TODO: handle load_per_vertex_input for TCS and GS.
3904 * TES can also occur here if tes_interp_load is NULL.
3905 */
3906 if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
3907 continue;
3908
3909 load_def = &load->def;
3910
3911 switch (load->intrinsic) {
3912 case nir_intrinsic_load_interpolated_input: {
3913 assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
3914 nir_instr *baryc_instr = load->src[0].ssa->parent_instr;
3915
3916 /* This is either lowered barycentric_at_offset/at_sample or user
3917 * barycentrics. Treat it like barycentric_at_offset.
3918 */
3919 if (baryc_instr->type != nir_instr_type_intrinsic)
3920 continue;
3921
3922 nir_intrinsic_instr *baryc = nir_instr_as_intrinsic(baryc_instr);
3923 nir_intrinsic_op op = baryc->intrinsic;
3924 enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
3925 bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
3926 bool convergent = BITSET_TEST(linkage->convergent32_mask, i) ||
3927 BITSET_TEST(linkage->convergent16_mask, i);
3928
3929 assert(interp == INTERP_MODE_NONE ||
3930 interp == INTERP_MODE_SMOOTH ||
3931 interp == INTERP_MODE_NOPERSPECTIVE);
3932
3933 if (convergent) {
3934 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3935 } else if (op == nir_intrinsic_load_barycentric_pixel) {
3936 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL
3937 : FLAG_INTERP_PERSP_PIXEL;
3938 } else if (op == nir_intrinsic_load_barycentric_centroid) {
3939 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID
3940 : FLAG_INTERP_PERSP_CENTROID;
3941 } else if (op == nir_intrinsic_load_barycentric_sample) {
3942 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE
3943 : FLAG_INTERP_PERSP_SAMPLE;
3944 } else {
3945 /* Optimizing at_offset and at_sample would be possible but
3946 * maybe not worth it if they are not convergent. Convergent
3947 * inputs can trivially switch the barycentric coordinates
3948 * to different ones or flat.
3949 */
3950 continue;
3951 }
3952 break;
3953 }
3954 case nir_intrinsic_load_input:
3955 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
3956 if (BITSET_TEST(linkage->convergent32_mask, i) ||
3957 BITSET_TEST(linkage->convergent16_mask, i))
3958 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3959 else
3960 load->instr.pass_flags |= FLAG_INTERP_FLAT;
3961 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
3962 assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3963 vec4_slot(i) <= VARYING_SLOT_PATCH31);
3964 /* Patch inputs are always convergent. */
3965 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3966 } else {
3967 /* It's not a fragment shader. We still need to set this. */
3968 load->instr.pass_flags |= FLAG_INTERP_FLAT;
3969 }
3970 break;
3971 case nir_intrinsic_load_per_primitive_input:
3972 case nir_intrinsic_load_input_vertex:
3973          /* Inter-shader code motion is not implemented for these. */
3974 continue;
3975 default:
3976 unreachable("unexpected load intrinsic");
3977 }
3978 }
3979
3980 load_def->parent_instr->pass_flags |= FLAG_MOVABLE;
3981
3982 /* Disallow transform feedback. The load is "movable" for the purpose of
3983        * finding a movable post-dominator; we just can't rewrite the store
3984 * because we need to keep it for xfb, so the post-dominator search
3985 * will have to start from a different load (only that varying will have
3986 * its value rewritten).
3987 */
3988 if (BITSET_TEST(linkage->xfb_mask, i))
3989 continue;
3990
3991 assert(num_movable_loads < ARRAY_SIZE(movable_loads));
3992 movable_loads[num_movable_loads].def = load_def;
3993 movable_loads[num_movable_loads].first_load = load;
3994 num_movable_loads++;
3995 }
3996
3997 if (!num_movable_loads)
3998 return false;
3999
4000 /* Inter-shader code motion turns ALU results into outputs, but not all
4001 * bit sizes are supported by outputs.
4002 *
4003 * The 1-bit type is allowed because the pass always promotes 1-bit
4004 * outputs to 16 or 32 bits, whichever is supported.
4005 *
4006 * TODO: We could support replacing 2 32-bit inputs with one 64-bit
4007 * post-dominator by supporting 64 bits here, but the likelihood of that
4008     * occurring seems low.
4009 */
4010 unsigned supported_io_types = 32 | 1;
4011
4012 if (linkage->producer_builder.shader->options->io_options &
4013 linkage->consumer_builder.shader->options->io_options &
4014 nir_io_16bit_input_output_support)
4015 supported_io_types |= 16;
4016
4017 struct nir_use_dominance_state *postdom_state =
4018 nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true);
4019
4020 for (unsigned i = 0; i < num_movable_loads; i++) {
4021 nir_def *load_def = movable_loads[i].def;
4022 nir_instr *iter = load_def->parent_instr;
4023 nir_instr *movable_postdom = NULL;
4024
4025 /* Find the farthest post-dominator that is movable. */
4026 while (iter) {
4027 iter = nir_get_immediate_use_dominator(postdom_state, iter);
4028 if (iter) {
4029 if (NEED_UPDATE_MOVABLE_FLAGS(iter))
4030 update_movable_flags(linkage, iter);
4031
4032 if (iter->pass_flags & FLAG_UNMOVABLE)
4033 break;
4034
4035 /* We can't move derefs into the previous shader, but we can move
4036 * instructions that use derefs.
4037 */
4038 if (iter->type == nir_instr_type_deref)
4039 continue;
4040
4041 unsigned bit_size;
4042
4043 if (iter->type == nir_instr_type_alu) {
4044 nir_alu_instr *alu = nir_instr_as_alu(iter);
4045
4046 /* Skip comparison opcodes that directly source the first load
4047 * and a constant because any 1-bit values would have to be
4048 * converted to 32 bits in the producer and then converted back
4049 * to 1 bit using nir_op_ine in the consumer, achieving nothing.
4050 */
4051 if (alu->def.bit_size == 1 &&
4052 ((nir_op_infos[alu->op].num_inputs == 1 &&
4053 alu->src[0].src.ssa == load_def) ||
4054 (nir_op_infos[alu->op].num_inputs == 2 &&
4055 ((alu->src[0].src.ssa == load_def &&
4056 alu->src[1].src.ssa->parent_instr->type ==
4057 nir_instr_type_load_const) ||
4058 (alu->src[0].src.ssa->parent_instr->type ==
4059 nir_instr_type_load_const &&
4060 alu->src[1].src.ssa == load_def)))))
4061 continue;
4062
4063 bit_size = alu->def.bit_size;
4064 } else if (iter->type == nir_instr_type_intrinsic) {
4065 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(iter);
4066
4067 /* This is a uniform load with a non-constant index because
4068 * only a non-constant index can be post-dominated by a load.
4069 */
4070 assert(intr->intrinsic == nir_intrinsic_load_deref);
4071
4072 /* Uniform loads must be scalar if their result is immediately
4073 * stored into an output because this pass only works with
4074 * scalar outputs.
4075 */
4076 if (intr->num_components > 1)
4077 continue;
4078
4079 bit_size = intr->def.bit_size;
4080 } else {
4081 unreachable("unexpected instr type");
4082 }
4083
4084 /* Skip unsupported bit sizes and keep searching. */
4085 if (!(bit_size & supported_io_types))
4086 continue;
4087
4088 movable_postdom = iter;
4089 }
4090 }
4091
4092 /* Add the post-dominator to the list unless it's been added already. */
4093 if (movable_postdom &&
4094 !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) {
4095 if (try_move_postdominator(linkage, postdom_state, movable_postdom,
4096 load_def, movable_loads[i].first_load,
4097 progress)) {
4098 /* Moving only one postdominator can change the IR enough that
4099 * we should start from scratch.
4100 */
4101 ralloc_free(postdom_state);
4102 return true;
4103 }
4104
4105 movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED;
4106 }
4107 }
4108
4109 ralloc_free(postdom_state);
4110 return false;
4111 }
4112
4113 /******************************************************************
4114 * COMPACTION
4115 ******************************************************************/
4116
4117 /* Relocate a slot to a new index. Used by compaction. new_index is
4118 * the component index at 16-bit granularity, so the size of vec4 is 8
4119 * in that representation.
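/* For example, new_index = 11 decodes to vec4 index 11 / 8 = 1, 32-bit
 * component (11 % 8) / 2 = 1 (i.e. .y), and high_16bits = 11 % 2 = 1,
 * i.e. the upper half of the second 32-bit component of the second vec4.
 */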
4120 */
4121 static void
4122 relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot,
4123 unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type,
4124 bool convergent, nir_opt_varyings_progress *progress)
4125 {
4126 assert(!list_is_empty(&slot->producer.stores));
4127
4128 list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) {
4129 assert(!nir_intrinsic_io_semantics(iter->instr).no_varying ||
4130 has_xfb(iter->instr) ||
4131 linkage->producer_stage == MESA_SHADER_TESS_CTRL);
4132 assert(!is_active_sysval_output(linkage, i, iter->instr));
4133 }
4134
4135 /* Relocate the slot in all loads and stores. */
4136 struct list_head *instruction_lists[3] = {
4137 &slot->producer.stores,
4138 &slot->producer.loads,
4139 &slot->consumer.loads,
4140 };
4141
4142 for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) {
4143 list_for_each_entry(struct list_node, iter, instruction_lists[i], head) {
4144 nir_intrinsic_instr *intr = iter->instr;
4145
4146 gl_varying_slot new_semantic = vec4_slot(new_index);
4147 unsigned new_component = (new_index % 8) / 2;
4148 bool new_high_16bits = new_index % 2;
4149
4150 /* We also need to relocate xfb info because it's always relative
4151 * to component 0. This just moves it into the correct xfb slot.
4152 */
4153 if (has_xfb(intr)) {
4154 unsigned old_component = nir_intrinsic_component(intr);
4155 static const nir_io_xfb clear_xfb;
4156 nir_io_xfb xfb;
4157 bool new_is_odd = new_component % 2 == 1;
4158
4159 memset(&xfb, 0, sizeof(xfb));
4160
4161 if (old_component >= 2) {
4162 xfb.out[new_is_odd] = nir_intrinsic_io_xfb2(intr).out[old_component - 2];
4163 nir_intrinsic_set_io_xfb2(intr, clear_xfb);
4164 } else {
4165 xfb.out[new_is_odd] = nir_intrinsic_io_xfb(intr).out[old_component];
4166 nir_intrinsic_set_io_xfb(intr, clear_xfb);
4167 }
4168
4169 if (new_component >= 2)
4170 nir_intrinsic_set_io_xfb2(intr, xfb);
4171 else
4172 nir_intrinsic_set_io_xfb(intr, xfb);
4173 }
4174
4175 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4176 unsigned bit_size = nir_intrinsic_infos[intr->intrinsic].has_dest ?
4177 intr->def.bit_size : intr->src[0].ssa->bit_size;
4178
4179 /* Set all types to float to facilitate full IO vectorization.
4180 * This is skipped only if mediump is not lowered to 16 bits.
4181 *
4182 * Set nir_io_mediump_is_32bit if you never lower mediump IO to 16
4183 * bits, which sets nir_io_semantics::mediump_precision = 0 during
4184 * nir_lower_io.
4185 *
4186 * Set nir_shader_compiler_options::lower_mediump_io if you want to
4187 * lower mediump to 16 bits in the GLSL linker before this pass.
4188 */
4189 if (bit_size != 32 || !sem.medium_precision) {
4190 nir_alu_type type = nir_intrinsic_has_src_type(intr) ?
4191 nir_intrinsic_src_type(intr) :
4192 nir_intrinsic_dest_type(intr);
4193 type = nir_alu_type_get_type_size(type) | nir_type_float;
4194
4195 if (nir_intrinsic_has_src_type(intr))
4196 nir_intrinsic_set_src_type(intr, type);
4197 else
4198 nir_intrinsic_set_dest_type(intr, type);
4199 }
4200
4201 /* When relocating a back color store, don't change it to a front
4202 * color as that would be incorrect. Keep it as back color and only
4203 * relocate it between BFC0 and BFC1.
4204 */
4205 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
4206 (sem.location == VARYING_SLOT_BFC0 ||
4207 sem.location == VARYING_SLOT_BFC1)) {
4208 assert(new_semantic == VARYING_SLOT_COL0 ||
4209 new_semantic == VARYING_SLOT_COL1);
4210 new_semantic = VARYING_SLOT_BFC0 +
4211 (new_semantic - VARYING_SLOT_COL0);
4212 }
4213
4214 #if PRINT_RELOCATE_SLOT
4215 assert(bit_size == 16 || bit_size == 32);
4216
4217 fprintf(stderr, "--- relocating: %s.%c%s%s -> %s.%c%s%s FS_VEC4_TYPE_%s\n",
4218 gl_varying_slot_name_for_stage(sem.location, linkage->producer_stage) + 13,
4219 "xyzw"[nir_intrinsic_component(intr) % 4],
4220 (bit_size == 16 && !sem.high_16bits) ? ".lo" : "",
4221 (bit_size == 16 && sem.high_16bits) ? ".hi" : "",
4222 gl_varying_slot_name_for_stage(new_semantic, linkage->producer_stage) + 13,
4223 "xyzw"[new_component % 4],
4224 (bit_size == 16 && !new_high_16bits) ? ".lo" : "",
4225 (bit_size == 16 && new_high_16bits) ? ".hi" : "",
4226 fs_vec4_type_strings[fs_vec4_type]);
4227 #endif /* PRINT_RELOCATE_SLOT */
4228
4229 sem.location = new_semantic;
4230 sem.high_16bits = new_high_16bits;
4231
4232 /* This is never indirectly indexed. Simplify num_slots. */
4233 sem.num_slots = 1;
4234
4235 nir_intrinsic_set_io_semantics(intr, sem);
4236 nir_intrinsic_set_component(intr, new_component);
4237
4238 if (fs_vec4_type == FS_VEC4_TYPE_PER_PRIMITIVE) {
4239 assert(intr->intrinsic == nir_intrinsic_store_per_primitive_output ||
4240 intr->intrinsic == nir_intrinsic_load_per_primitive_output ||
4241 intr->intrinsic == nir_intrinsic_load_per_primitive_input);
4242 } else {
4243 assert(intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
4244 intr->intrinsic != nir_intrinsic_load_per_primitive_output &&
4245 intr->intrinsic != nir_intrinsic_load_per_primitive_input);
4246 }
4247
4248 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4249 continue;
4250
4251 /* This path is used when promoting convergent interpolated
4252 * inputs to flat. Replace load_interpolated_input with load_input.
4253 */
4254 if (fs_vec4_type == FS_VEC4_TYPE_FLAT ||
4255 /* Promote all convergent loads to flat if the driver supports it. */
4256 (convergent &&
4257 linkage->can_mix_convergent_flat_with_interpolated)) {
4258 assert(instruction_lists[i] == &slot->consumer.loads);
4259 nir_builder *b = &linkage->consumer_builder;
4260
4261 b->cursor = nir_before_instr(&intr->instr);
4262 nir_def *load =
4263 nir_load_input(b, 1, intr->def.bit_size,
4264 nir_get_io_offset_src(intr)->ssa,
4265 .io_semantics = sem,
4266 .component = new_component,
4267 .dest_type = nir_intrinsic_dest_type(intr));
4268
4269 nir_def_rewrite_uses(&intr->def, load);
4270 iter->instr = nir_instr_as_intrinsic(load->parent_instr);
4271 nir_instr_remove(&intr->instr);
4272 *progress |= nir_progress_consumer;
4273
4274 /* Interpolation converts Infs to NaNs. If we change it to flat,
4275 * we need to convert Infs to NaNs manually in the producer to
4276 * preserve that.
4277 */
4278 if (preserve_nans(linkage->consumer_builder.shader,
4279 load->bit_size)) {
4280 list_for_each_entry(struct list_node, iter,
4281 &slot->producer.stores, head) {
4282 nir_intrinsic_instr *store = iter->instr;
4283
4284 nir_builder *b = &linkage->producer_builder;
4285 b->cursor = nir_before_instr(&store->instr);
4286 nir_def *repl =
4287 build_convert_inf_to_nan(b, store->src[0].ssa);
4288 nir_src_rewrite(&store->src[0], repl);
4289 }
4290 }
4291 continue;
4292 }
4293
4294 /* We are packing convergent inputs with any other interpolated
4295 * inputs in the same vec4, but the interpolation qualifier might not
4296 * be the same between the two. Set the qualifier of the convergent
4297 * input to match the input it's being packed with.
4298 */
4299 if (!linkage->has_flexible_interp && convergent) {
4300 enum fs_vec4_type current_vec4_type =
4301 get_interp_vec4_type(linkage, i, intr);
4302
4303 /* Make the interpolation qualifier match the slot where we are
4304 * moving this input.
4305 */
4306 if (current_vec4_type != fs_vec4_type) {
4307 nir_builder *b = &linkage->consumer_builder;
4308 nir_def *baryc;
4309
4310 b->cursor = nir_before_instr(&intr->instr);
4311
4312 switch (fs_vec4_type) {
4313 case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL:
4314 case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL:
4315 baryc = nir_load_barycentric_pixel(b, 32,
4316 .interp_mode = INTERP_MODE_SMOOTH);
4317 break;
4318 case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID:
4319 case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID:
4320 baryc = nir_load_barycentric_centroid(b, 32,
4321 .interp_mode = INTERP_MODE_SMOOTH);
4322 break;
4323 case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE:
4324 case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE:
4325 baryc = nir_load_barycentric_sample(b, 32,
4326 .interp_mode = INTERP_MODE_SMOOTH);
4327 break;
4328 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL:
4329 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL:
4330 baryc = nir_load_barycentric_pixel(b, 32,
4331 .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4332 break;
4333 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID:
4334 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID:
4335 baryc = nir_load_barycentric_centroid(b, 32,
4336 .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4337 break;
4338 case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE:
4339 case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE:
4340 baryc = nir_load_barycentric_sample(b, 32,
4341 .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4342 break;
4343 case FS_VEC4_TYPE_INTERP_COLOR_PIXEL:
4344 baryc = nir_load_barycentric_pixel(b, 32,
4345 .interp_mode = INTERP_MODE_NONE);
4346 break;
4347 case FS_VEC4_TYPE_INTERP_COLOR_CENTROID:
4348 baryc = nir_load_barycentric_centroid(b, 32,
4349 .interp_mode = INTERP_MODE_NONE);
4350 break;
4351 case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE:
4352 baryc = nir_load_barycentric_sample(b, 32,
4353 .interp_mode = INTERP_MODE_NONE);
4354 break;
4355 default:
4356 unreachable("invalid qualifier");
4357 }
4358
4359 nir_src_rewrite(&intr->src[0], baryc);
4360 }
4361 }
4362 }
4363 }
4364 }
4365
4366 /**
4367 * A helper function for compact_varyings(). Assign new slot indices for
4368 * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip already-
4369 * assigned scalar slots (determined by assigned_mask) and don't assign to
4370 * vec4 slots that have an incompatible vec4 type (determined by
4371 * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types.
4372 * slot_size is the component size in the units of 16 bits (2 means 32 bits).
4373 *
4374 * The number of slots to assign can optionally be limited by
4375 * max_assigned_slots.
4376 *
4377 * Return how many 16-bit slots are left unused in the last vec4 (up to 8
4378 * slots).
4379 */
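/* For example, a 32-bit scalar occupies slot_size = 2 consecutive 16-bit
 * slots in assigned_mask, so a fully packed vec4 holds four 32-bit scalars
 * or eight 16-bit scalars.
 */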
4380 static unsigned
4381 fs_assign_slots(struct linkage_info *linkage,
4382 BITSET_WORD *assigned_mask,
4383 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4384 BITSET_WORD *input_mask,
4385 enum fs_vec4_type fs_vec4_type,
4386 unsigned slot_size,
4387 unsigned max_assigned_slots,
4388 bool convergent,
4389 bool assign_colors,
4390 unsigned color_channel_rotate,
4391 nir_opt_varyings_progress *progress)
4392 {
4393 unsigned i, slot_index, max_slot;
4394 unsigned num_assigned_slots = 0;
4395
4396 if (assign_colors) {
4397 slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */
4398 max_slot = VARYING_SLOT_COL1 + 1;
4399 } else {
4400 slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */
4401 max_slot = VARYING_SLOT_MAX;
4402 }
4403
4404 /* Assign new slot indices for scalar slots. */
4405 BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4406 if (is_interpolated_color(linkage, i) != assign_colors)
4407 continue;
4408
4409 /* Skip indirectly-indexed scalar slots and slots incompatible
4410 * with the FS vec4 type.
4411 */
4412 while (1) {
4413 /* If the FS vec4 type is incompatible, move to the next vec4. */
4414 if (fs_vec4_type != FS_VEC4_TYPE_NONE &&
4415 assigned_fs_vec4_type[vec4_slot(slot_index)] !=
4416 FS_VEC4_TYPE_NONE &&
4417 assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) {
4418 slot_index = align(slot_index + slot_size, 8); /* move to next vec4 */
4419 continue;
4420 }
4421
4422 /* This slot is already assigned (assigned_mask is set). Move to
4423 * the next one.
4424 */
4425 if (BITSET_TEST(assigned_mask, slot_index)) {
4426 slot_index += slot_size;
4427 continue;
4428 }
4429 break;
4430 }
4431
4432 /* Assign color channels in this order, starting
4433 * at the color_channel_rotate component. Cases:
4434 * color_channel_rotate = 0: xyzw
4435 * color_channel_rotate = 1: yzwx
4436 * color_channel_rotate = 2: zwxy
4437 * color_channel_rotate = 3: wxyz
4438 *
4439 * This has no effect on behavior per se, but some drivers merge VARn
4440 * and COLn into one output if each defines different components.
4441 * For example, if we store VAR0.xy and COL0.z, a driver can merge them
4442 * by mapping the same output to 2 different inputs (VAR0 and COL0) if
4443 * color-specific behavior is per component, but it can't merge VAR0.xy
4444 * and COL0.x because they both define x.
4445 */
4446 unsigned new_slot_index = slot_index;
4447 if (assign_colors && color_channel_rotate) {
4448 new_slot_index = (vec4_slot(new_slot_index)) * 8 +
4449 (new_slot_index + color_channel_rotate * 2) % 8;
4450 }
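/* Worked example of the remap above (illustrative): a 16-bit scalar slot at
 * offset 2 within its vec4 (the start of .y) with color_channel_rotate = 2
 * maps to offset (2 + 2 * 2) % 8 = 6, i.e. .w, matching the zwxy order
 * (x->z, y->w, z->x, w->y).
 */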
4451
4452 /* Relocate the slot. */
4453 assert(slot_index < max_slot * 8);
4454 relocate_slot(linkage, &linkage->slot[i], i, new_slot_index,
4455 fs_vec4_type, convergent, progress);
4456
4457 for (unsigned j = 0; j < slot_size; ++j)
4458 BITSET_SET(assigned_mask, slot_index + j);
4459
4460 if (assigned_fs_vec4_type)
4461 assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type;
4462 slot_index += slot_size; /* move to the next slot */
4463 num_assigned_slots += slot_size;
4464
4465 /* Remove the slot from the input (unassigned) mask. */
4466 BITSET_CLEAR(input_mask, i);
4467
4468 /* The number of slots to assign can optionally be limited. */
4469 assert(num_assigned_slots <= max_assigned_slots);
4470 if (num_assigned_slots == max_assigned_slots)
4471 break;
4472 }
4473
4474 assert(slot_index <= max_slot * 8);
4475
4476 if (!convergent && fs_vec4_type != FS_VEC4_TYPE_NONE) {
4477 /* Count the number of unused 16-bit components. There can be holes
4478 * because indirect inputs are not moved from their original locations.
4479 * The result is used to determine which components should be filled
4480 * with convergent inputs.
4481 */
4482 unsigned unused_slots = 0;
4483
4484 for (unsigned i = assign_colors ? VARYING_SLOT_COL0 : VARYING_SLOT_VAR0;
4485 i < max_slot; i++) {
4486 if (assigned_fs_vec4_type[i] != fs_vec4_type)
4487 continue;
4488
4489 unsigned comp_mask =
4490 BITSET_GET_RANGE_INSIDE_WORD(assigned_mask, i * 8, i * 8 + 7);
4491 assert(comp_mask);
4492 assert(comp_mask <= 0xff);
4493
4494 if (comp_mask == 0xff)
4495 continue;
4496
4497 /* Only count full unused 32-bit slots, so that 2 disjoint unused
4498 * 16-bit slots don't give the misleading impression that there is
4499 * a full unused 32-bit slot.
4500 */
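/* Example (illustrative): comp_mask = 0x33 means the .x and .z 32-bit
 * components are (at least partially) used, so only .y and .w count as
 * unused and unused_slots increases by 4. A lone used 16-bit half, e.g.
 * comp_mask = 0x01, still disqualifies all of .x.
 */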
4501 for (unsigned j = 0; j < 4; j++) {
4502 if (!(comp_mask & BITFIELD_RANGE(j * 2, 2)))
4503 unused_slots += 2;
4504 }
4505 }
4506 return unused_slots;
4507 }
4508
4509 return 0;
4510 }
4511
4512 /**
4513 * This is called once for 32-bit inputs and once for 16-bit inputs.
4514 * It assigns new slot indices to all scalar slots specified in the masks.
4515 *
4516 * \param linkage Linkage info
4517 * \param assigned_mask Which scalar (16-bit) slots are already taken.
4518 * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier
4519 * and can only be filled with compatible slots.
4520 * \param interp_mask The list of interp slots to assign locations for.
4521 * \param flat_mask The list of flat slots to assign locations for.
4522 * \param convergent_mask The list of slots that have convergent output
4523 * stores.
4524 * \param sized_interp_type One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}*.
4525 * \param slot_size 1 for 16 bits, 2 for 32 bits
4526 * \param color_channel_rotate Assign color channels starting with this index,
4527 * e.g. 2 assigns channels in the zwxy order.
4528 * \param assign_colors Whether to assign only color varyings or only
4529 * non-color varyings.
4530 */
4531 static void
4532 fs_assign_slot_groups(struct linkage_info *linkage,
4533 BITSET_WORD *assigned_mask,
4534 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4535 BITSET_WORD *interp_mask,
4536 BITSET_WORD *flat_mask,
4537 BITSET_WORD *convergent_mask,
4538 BITSET_WORD *color_interp_mask,
4539 enum fs_vec4_type sized_interp_type,
4540 unsigned slot_size,
4541 bool assign_colors,
4542 unsigned color_channel_rotate,
4543 nir_opt_varyings_progress *progress)
4544 {
4545 /* Put interpolated slots first. */
4546 unsigned unused_interp_slots =
4547 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4548 interp_mask, sized_interp_type,
4549 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4550 color_channel_rotate, progress);
4551
4552 unsigned unused_color_interp_slots = 0;
4553 if (color_interp_mask) {
4554 unused_color_interp_slots =
4555 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4556 color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR,
4557 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4558 color_channel_rotate, progress);
4559 }
4560
4561 /* Put flat slots next.
4562 * Note that only flat vec4 slots can have both 32-bit and 16-bit types
4563 * packed in the same vec4. 32-bit flat inputs are packed first, followed
4564 * by 16-bit flat inputs.
4565 */
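/* Illustrative packing (hypothetical): a single FS_VEC4_TYPE_FLAT vec4 can
 * end up holding two 32-bit flat inputs in .x and .y plus four 16-bit flat
 * inputs in the halves of .z and .w, since the later 16-bit pass is allowed
 * to fill leftover halves of vec4s already typed as flat.
 */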
4566 unsigned unused_flat_slots =
4567 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4568 flat_mask, FS_VEC4_TYPE_FLAT,
4569 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4570 color_channel_rotate, progress);
4571
4572 /* Take the inputs with convergent values and assign them as follows.
4573 * Since they can be assigned as both interpolated and flat, we can
4574 * choose. We prefer them to be flat, but if interpolated vec4s have
4575 * unused components, try to fill those before starting a new flat vec4.
4576 *
4577 * First, fill the unused components of flat (if any), then fill
4578 * the unused components of interpolated (if any), and then make
4579 * the remaining convergent inputs flat.
4580 */
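/* Example of the fill order (illustrative, assuming
 * always_interpolate_convergent_fs_inputs is not set): with one unused
 * 32-bit component left in a flat vec4 and two left in an interpolated
 * vec4, the first convergent input completes the flat vec4, the next two
 * complete the interpolated vec4, and any remaining convergent inputs
 * start new flat vec4s.
 */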
4581 if (!linkage->always_interpolate_convergent_fs_inputs &&
4582 unused_flat_slots) {
4583 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4584 convergent_mask, FS_VEC4_TYPE_FLAT,
4585 slot_size, unused_flat_slots, true, assign_colors,
4586 color_channel_rotate, progress);
4587 }
4588 if (unused_interp_slots) {
4589 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4590 convergent_mask, sized_interp_type,
4591 slot_size, unused_interp_slots, true, assign_colors,
4592 color_channel_rotate, progress);
4593 }
4594 if (unused_color_interp_slots) {
4595 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4596 convergent_mask, FS_VEC4_TYPE_INTERP_COLOR,
4597 slot_size, unused_color_interp_slots, true, assign_colors,
4598 color_channel_rotate, progress);
4599 }
4600 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4601 convergent_mask,
4602 linkage->always_interpolate_convergent_fs_inputs ?
4603 (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32 :
4604 FS_VEC4_TYPE_INTERP_FP16) :
4605 FS_VEC4_TYPE_FLAT,
4606 slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4607 color_channel_rotate, progress);
4608 }
4609
4610 /**
4611 * Same as fs_assign_slot_groups, but don't mix different interpolation
4612 * qualifiers in the same vec4.
4613 */
4614 static void
4615 fs_assign_slot_groups_separate_qual(struct linkage_info *linkage,
4616 BITSET_WORD *assigned_mask,
4617 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4618 INTERP_QUAL_BITSET *interp_masks,
4619 BITSET_WORD *flat_mask,
4620 BITSET_WORD *convergent_mask,
4621 COLOR_QUAL_BITSET *color_interp_masks,
4622 enum fs_vec4_type sized_interp_type_base,
4623 unsigned slot_size,
4624 bool assign_colors,
4625 unsigned color_channel_rotate,
4626 nir_opt_varyings_progress *progress)
4627 {
4628 unsigned unused_interp_slots[NUM_INTERP_QUALIFIERS] = {0};
4629 unsigned unused_color_slots[NUM_COLOR_QUALIFIERS] = {0};
4630
4631 /* Put interpolated slots first. */
4632 for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4633 unused_interp_slots[i] =
4634 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4635 (*interp_masks)[i], sized_interp_type_base + i,
4636 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4637 color_channel_rotate, progress);
4638 }
4639
4640 if (color_interp_masks) {
4641 for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4642 unused_color_slots[i] =
4643 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4644 (*color_interp_masks)[i],
4645 FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i,
4646 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4647 color_channel_rotate, progress);
4648 }
4649 }
4650
4651 /* Put flat slots next.
4652 * Note that only flat vec4 slots can have both 32-bit and 16-bit types
4653 * packed in the same vec4. 32-bit flat inputs are packed first, followed
4654 * by 16-bit flat inputs.
4655 */
4656 unsigned unused_flat_slots =
4657 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4658 flat_mask, FS_VEC4_TYPE_FLAT,
4659 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4660 color_channel_rotate, progress);
4661
4662 /* Take the inputs with convergent values and assign them as follows.
4663 * Since they can be assigned as both interpolated and flat, we can
4664 * choose. We prefer them to be flat, but if interpolated vec4s have
4665 * unused components, try to fill those before starting a new flat vec4.
4666 *
4667 * First, fill the unused components of flat (if any) with convergent
4668 * inputs.
4669 */
4670 if (!linkage->always_interpolate_convergent_fs_inputs &&
4671 unused_flat_slots) {
4672 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4673 convergent_mask, FS_VEC4_TYPE_FLAT,
4674 slot_size, unused_flat_slots, true, assign_colors,
4675 color_channel_rotate, progress);
4676 }
4677
4678 /* Then fill the unused components of interpolated slots (if any) with
4679 * convergent inputs.
4680 */
4681 for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4682 if (unused_interp_slots[i]) {
4683 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4684 convergent_mask, sized_interp_type_base + i,
4685 slot_size, unused_interp_slots[i], true,
4686 assign_colors, color_channel_rotate, progress);
4687 }
4688 }
4689
4690 for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4691 if (unused_color_slots[i]) {
4692 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4693 convergent_mask, FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i,
4694 slot_size, unused_color_slots[i], true, assign_colors,
4695 color_channel_rotate, progress);
4696 }
4697 }
4698
4699 /* Then make the remaining convergent inputs flat. */
4700 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4701 convergent_mask,
4702 linkage->always_interpolate_convergent_fs_inputs ?
4703 (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL :
4704 FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL) :
4705 FS_VEC4_TYPE_FLAT,
4706 slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4707 color_channel_rotate, progress);
4708 }
4709
4710 static void
4711 vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage,
4712 BITSET_WORD *input_mask,
4713 unsigned *slot_index,
4714 unsigned *patch_slot_index,
4715 unsigned slot_size,
4716 nir_opt_varyings_progress *progress)
4717 {
4718 unsigned i;
4719
4720 BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4721 if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) {
4722 /* Skip indirectly-indexed scalar slots at 32-bit granularity.
4723 * We have to use this granularity because only the low 16-bit
4724 * slot is set for 32-bit inputs, not the high 16-bit slot.
4725 */
4726 while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index))
4727 *patch_slot_index = align(*patch_slot_index + 1, 2);
4728
4729 assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8);
4730 relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index,
4731 FS_VEC4_TYPE_NONE, false, progress);
4732 *patch_slot_index += slot_size; /* increment by 16 or 32 bits */
4733 } else {
4734 /* If the driver wants to use POS and we've already used it, move
4735 * to VARn.
4736 */
4737 if (*slot_index < VARYING_SLOT_VAR0 &&
4738 *slot_index >= VARYING_SLOT_POS + 8)
4739 *slot_index = VARYING_SLOT_VAR0 * 8;
4740
4741 /* Skip indirectly-indexed scalar slots at 32-bit granularity. */
4742 while (BITSET_TEST32(linkage->indirect_mask, *slot_index))
4743 *slot_index = align(*slot_index + 1, 2);
4744
4745 assert(*slot_index < VARYING_SLOT_MAX * 8);
4746 relocate_slot(linkage, &linkage->slot[i], i, *slot_index,
4747 FS_VEC4_TYPE_NONE, false, progress);
4748 *slot_index += slot_size; /* increment by 16 or 32 bits */
4749 }
4750 }
4751 }
4752
4753 static void
4754 vs_tcs_tes_gs_assign_slots_2sets(struct linkage_info *linkage,
4755 BITSET_WORD *input32_mask,
4756 BITSET_WORD *input16_mask,
4757 unsigned *slot_index,
4758 unsigned *patch_slot_index,
4759 nir_opt_varyings_progress *progress)
4760 {
4761 /* Compact 32-bit inputs, followed by 16-bit inputs allowing them to
4762 * share vec4 slots with 32-bit inputs.
4763 */
4764 vs_tcs_tes_gs_assign_slots(linkage, input32_mask, slot_index,
4765 patch_slot_index, 2, progress);
4766 vs_tcs_tes_gs_assign_slots(linkage, input16_mask, slot_index,
4767 patch_slot_index, 1, progress);
4768
4769 assert(*slot_index <= VARYING_SLOT_MAX * 8);
4770 assert(!patch_slot_index || *patch_slot_index <= VARYING_SLOT_TESS_MAX * 8);
4771 }
4772
4773 /**
4774 * Compaction means scalarizing and then packing scalar components into full
4775 * vec4s, so that we minimize the number of unused components in vec4 slots.
4776 *
4777 * Compaction is as simple as moving a scalar input from one scalar slot
4778 * to another. Indirectly-indexed slots are not touched, so the compaction
4779 * has to compact around them. Unused 32-bit components of indirectly-indexed
4780 * slots are still filled, so no space is wasted there, but if indirectly-
4781 * indexed 16-bit components have the other 16-bit half unused, that half is
4782 * wasted.
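*
* For example (illustrative): if an indirectly-indexed 16-bit array occupies
* only the low halves of .x in several consecutive slots, the high halves of
* those .x components stay wasted because compaction never relocates
* indirectly-indexed slots.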
4783 */
4784 static void
4785 compact_varyings(struct linkage_info *linkage,
4786 nir_opt_varyings_progress *progress)
4787 {
4788 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
4789 /* These arrays are used to track which scalar slots we've already
4790 * assigned. We can fill unused components of indirectly-indexed slots,
4791 * but only if the vec4 slot type (FLAT, FP16, or FP32) is the same.
4792 * Assign vec4 slot type separately, skipping over already assigned
4793 * scalar slots.
4794 */
4795 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0};
4796 BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS);
4797 BITSET_ZERO(assigned_mask);
4798
4799 /* Iterate over all indirectly accessed inputs and set the assigned vec4
4800 * type of each occupied slot to the vec4 type of indirect inputs, so
4801 * that compaction doesn't put inputs of a different vec4 type in
4802 * the same vec4.
4803 *
4804 * We don't try to compact indirect input arrays, though we could.
4805 */
4806 unsigned i;
4807 BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
4808 struct scalar_slot *slot = &linkage->slot[i];
4809
4810 /* The slot of the first array element contains all loads for all
4811 * elements, including all direct accesses, while all other array
4812 * elements are empty (on purpose).
4813 */
4814 if (list_is_empty(&linkage->slot[i].consumer.loads))
4815 continue;
4816
4817 assert(slot->num_slots >= 2);
4818
4819 for (unsigned array_index = 0; array_index < slot->num_slots;
4820 array_index++) {
4821 unsigned vec4_index = vec4_slot(i) + array_index;
4822 unsigned scalar_index = i + array_index * 8;
4823 assigned_fs_vec4_type[vec4_index] = linkage->fs_vec4_type[vec4_index];
4824 /* Indirectly-indexed slots are marked to always occupy 32 bits
4825 * (2 16-bit slots), though we waste the high 16 bits if they are unused.
4826 */
4827 BITSET_SET_RANGE_INSIDE_WORD(assigned_mask, scalar_index, scalar_index + 1);
4828 }
4829 }
4830
4831 if (linkage->has_flexible_interp) {
4832 /* This codepath packs convergent varyings with both interpolated and
4833 * flat, whichever has free space.
4834 */
4835 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4836 linkage->interp_fp32_mask, linkage->flat32_mask,
4837 linkage->convergent32_mask, NULL,
4838 FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress);
4839
4840 /* Now do the same thing, but for 16-bit inputs. */
4841 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4842 linkage->interp_fp16_mask, linkage->flat16_mask,
4843 linkage->convergent16_mask, NULL,
4844 FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress);
4845 } else {
4846 /* Basically the same as above. */
4847 fs_assign_slot_groups_separate_qual(
4848 linkage, assigned_mask, assigned_fs_vec4_type,
4849 &linkage->interp_fp32_qual_masks, linkage->flat32_mask,
4850 linkage->convergent32_mask, NULL,
4851 FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, false, 0, progress);
4852
4853 fs_assign_slot_groups_separate_qual(
4854 linkage, assigned_mask, assigned_fs_vec4_type,
4855 &linkage->interp_fp16_qual_masks, linkage->flat16_mask,
4856 linkage->convergent16_mask, NULL,
4857 FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, 1, false, 0, progress);
4858 }
4859
4860 /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same
4861 * slot because the vertex data is passed to FS as-is.
4862 */
4863 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4864 linkage->interp_explicit32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4865 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4866
4867 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4868 linkage->interp_explicit16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4869 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4870
4871 /* Same for strict vertex ordering. */
4872 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4873 linkage->interp_explicit_strict32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4874 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4875
4876 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4877 linkage->interp_explicit_strict16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4878 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4879
4880 /* Same for per-primitive. */
4881 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4882 linkage->per_primitive32_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4883 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4884
4885 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4886 linkage->per_primitive16_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4887 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4888
4889 /* Put transform-feedback-only outputs last. */
4890 fs_assign_slots(linkage, assigned_mask, NULL,
4891 linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4892 NUM_SCALAR_SLOTS, false, false, 0, progress);
4893
4894 fs_assign_slots(linkage, assigned_mask, NULL,
4895 linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1,
4896 NUM_SCALAR_SLOTS, false, false, 0, progress);
4897
4898 /* Color varyings are only compacted among themselves. */
4899 /* Set whether the shader contains any color varyings. */
4900 unsigned col0 = VARYING_SLOT_COL0 * 8;
4901 bool has_colors =
4902 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0,
4903 col0 + 15, 0) ||
4904 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0,
4905 col0 + 15, 0) ||
4906 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0,
4907 col0 + 15, 0) ||
4908 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0,
4909 col0 + 15, 0) ||
4910 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0,
4911 col0 + 15, 0);
4912
4913 for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4914 has_colors |=
4915 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_qual_masks[i],
4916 col0, col0 + 15, 0);
4917 }
4918 for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4919 has_colors |=
4920 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_qual_masks[i],
4921 col0, col0 + 15, 0);
4922 }
4923
4924 if (has_colors) {
4925 unsigned color_channel_rotate = 0;
4926
4927 if (linkage->consumer_builder.shader->options->io_options &
4928 nir_io_compaction_rotates_color_channels) {
4929 color_channel_rotate =
4930 DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4;
4931 }
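/* Illustrative example: if the highest 16-bit slot assigned so far is the
 * high half of some .y component, BITSET_LAST_BIT (last set bit index + 1)
 * yields a multiple of 8 plus 4, DIV_ROUND_UP(..., 2) ends in 2 (mod 4),
 * and colors start at .z (zwxy order), so that, as described in
 * fs_assign_slots(), a driver can merge the color output with a VARn
 * output that uses different components.
 */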
4932
4933 if (linkage->has_flexible_interp) {
4934 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4935 linkage->interp_fp32_mask, linkage->flat32_mask,
4936 linkage->convergent32_mask, linkage->color32_mask,
4937 FS_VEC4_TYPE_INTERP_FP32, 2, true,
4938 color_channel_rotate, progress);
4939 } else {
4940 fs_assign_slot_groups_separate_qual(
4941 linkage, assigned_mask, assigned_fs_vec4_type,
4942 &linkage->interp_fp32_qual_masks, linkage->flat32_mask,
4943 linkage->convergent32_mask, &linkage->color32_qual_masks,
4944 FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, true,
4945 color_channel_rotate, progress);
4946 }
4947
4948 /* Put transform-feedback-only outputs last. */
4949 fs_assign_slots(linkage, assigned_mask, NULL,
4950 linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4951 NUM_SCALAR_SLOTS, false, true, color_channel_rotate,
4952 progress);
4953 }
4954 return;
4955 }
4956
4957 /* If we get here, the consumer can only be TCS, TES, or GS.
4958 *
4959 * "use_pos" says whether the driver prefers that compaction with non-FS
4960 * consumers puts varyings into POS first before using any VARn.
4961 */
4962 bool use_pos = !(linkage->producer_builder.shader->options->io_options &
4963 nir_io_dont_use_pos_for_non_fs_varyings);
4964 unsigned slot_index = (use_pos ? VARYING_SLOT_POS
4965 : VARYING_SLOT_VAR0) * 8;
4966
4967 if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL) {
4968 /* Make tcs_cross_invoc*_mask bits disjoint with flat*_mask bits
4969 * because tcs_cross_invoc*_mask is initially a subset of flat*_mask,
4970 * but we must assign each scalar slot only once.
4971 */
4972 BITSET_ANDNOT(linkage->flat32_mask, linkage->flat32_mask,
4973 linkage->tcs_cross_invoc32_mask);
4974 BITSET_ANDNOT(linkage->flat16_mask, linkage->flat16_mask,
4975 linkage->tcs_cross_invoc16_mask);
4976
4977 /* Put cross-invocation-accessed TCS inputs first. */
4978 vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->tcs_cross_invoc32_mask,
4979 linkage->tcs_cross_invoc16_mask,
4980 &slot_index, NULL, progress);
4981 /* Remaining TCS inputs. */
4982 vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
4983 linkage->flat16_mask, &slot_index,
4984 NULL, progress);
4985 return;
4986 }
4987
4988 if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
4989 unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8;
4990
4991 vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
4992 linkage->flat16_mask, &slot_index,
4993 &patch_slot_index, progress);
4994
4995 /* Put no-varying slots last. These are TCS outputs read by TCS but
4996 * not TES.
4997 */
4998 vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->no_varying32_mask,
4999 linkage->no_varying16_mask, &slot_index,
5000 &patch_slot_index, progress);
5001 return;
5002 }
5003
5004 assert(linkage->consumer_stage == MESA_SHADER_GEOMETRY);
5005 vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
5006 linkage->flat16_mask, &slot_index,
5007 NULL, progress);
5008 }
5009
5010 /******************************************************************
5011 * PUTTING IT ALL TOGETHER
5012 ******************************************************************/
5013
5014 /* A costing function that estimates the cost of a uniform expression to
5015 * determine whether it's worth propagating from output stores to the next
5016 * shader stage. It tries to model the instruction cost of a scalar desktop GPU.
5017 *
5018 * It's used by uniform expression propagation when drivers provide a cost
5019 * limit for such an optimization but don't provide their own costing function,
5020 * which is the case for most drivers.
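*
* Illustrative example with the default costs below: propagating
* frcp(<32-bit uniform>) loaded through a simple variable deref costs
* 4 (frcp) + 2 (load_deref) + 0 (deref_var) = 6, which is then weighed
* against the driver's cost limit (max_varying_expression_cost).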
5021 */
5022 static unsigned
5023 default_varying_estimate_instr_cost(nir_instr *instr)
5024 {
5025 unsigned dst_bit_size, src_bit_size, num_dst_dwords;
5026 nir_op alu_op;
5027
5028 switch (instr->type) {
5029 case nir_instr_type_alu:
5030 dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
5031 src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
5032 alu_op = nir_instr_as_alu(instr)->op;
5033 num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
5034
5035 switch (alu_op) {
5036 /* Moves are free. */
5037 case nir_op_mov:
5038 case nir_op_vec2:
5039 case nir_op_vec3:
5040 case nir_op_vec4:
5041 case nir_op_vec5:
5042 case nir_op_vec8:
5043 case nir_op_vec16:
5044 /* These are usually folded into FP instructions as src or dst
5045 * modifiers.
5046 */
5047 case nir_op_fabs:
5048 case nir_op_fneg:
5049 case nir_op_fsat:
5050 return 0;
5051
5052 /* 16-bit multiplication should be cheap. Greater sizes not so much. */
5053 case nir_op_imul:
5054 case nir_op_umul_low:
5055 case nir_op_imul_2x32_64:
5056 case nir_op_umul_2x32_64:
5057 return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;
5058
5059 /* High bits of 64-bit multiplications. */
5060 case nir_op_imul_high:
5061 case nir_op_umul_high:
5062 /* Lowered into multiple instructions typically. */
5063 case nir_op_fsign:
5064 return 4;
5065
5066 /* Transcendental opcodes typically run at 1/4 the rate of FMA. */
5067 case nir_op_fexp2:
5068 case nir_op_flog2:
5069 case nir_op_frcp:
5070 case nir_op_frsq:
5071 case nir_op_fsqrt:
5072 case nir_op_fsin:
5073 case nir_op_fcos:
5074 case nir_op_fsin_amd:
5075 case nir_op_fcos_amd:
5076 /* FP64 is usually much slower. */
5077 return dst_bit_size == 64 ? 32 : 4;
5078
5079 case nir_op_fpow:
5080 return 4 + 1 + 4; /* log2 + mul + exp2 */
5081
5082 /* Integer division is slow. */
5083 case nir_op_idiv:
5084 case nir_op_udiv:
5085 case nir_op_imod:
5086 case nir_op_umod:
5087 case nir_op_irem:
5088 return dst_bit_size == 64 ? 80 : 40;
5089
5090 case nir_op_fdiv:
5091 return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */
5092
5093 case nir_op_fmod:
5094 case nir_op_frem:
5095 return dst_bit_size == 64 ? 80 : 8;
5096
5097 default:
5098 /* FP64 is usually much slower. */
5099 if ((dst_bit_size == 64 &&
5100 nir_op_infos[alu_op].output_type & nir_type_float) ||
5101 (src_bit_size == 64 &&
5102 nir_op_infos[alu_op].input_types[0] & nir_type_float))
5103 return 16;
5104
5105 /* 1 per 32-bit result. */
5106 return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
5107 }
5108
5109 case nir_instr_type_intrinsic:
5110 dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
5111 num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
5112
5113 /* This can only be a uniform load. Other intrinsics and variables are
5114 * rejected before this is called.
5115 */
5116 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
5117 case nir_intrinsic_load_deref:
5118 /* Uniform loads can appear fast if latency hiding is effective. */
5119 return 2 * num_dst_dwords;
5120
5121 default:
5122 unreachable("unexpected intrinsic");
5123 }
5124
5125 case nir_instr_type_deref: {
5126 nir_deref_instr *deref = nir_instr_as_deref(instr);
5127
5128 switch (deref->deref_type) {
5129 case nir_deref_type_var:
5130 case nir_deref_type_struct:
5131 return 0;
5132 case nir_deref_type_array:
5133 /* Indexing uniforms with a divergent index has a high cost. This cost
5134 * is likely only going to be accepted by the driver if the next
5135 * shader doesn't run after amplification (e.g. VS->TCS, TES->GS).
5136 */
5137 return nir_src_is_const(deref->arr.index) ? 0 : 128;
5138
5139 default:
5140 unreachable("unexpected deref type");
5141 }
5142 }
5143
5144 default:
5145 unreachable("unexpected instr type");
5146 }
5147 }
5148
5149 static void
5150 init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv,
5151 unsigned max_uniform_components, unsigned max_ubos_per_stage,
5152 struct linkage_info *linkage, nir_opt_varyings_progress *progress)
5153 {
5154 *linkage = (struct linkage_info){
5155 .spirv = spirv,
5156 .can_mix_convergent_flat_with_interpolated =
5157 consumer->info.stage == MESA_SHADER_FRAGMENT &&
5158 consumer->options->io_options &
5159 nir_io_mix_convergent_flat_with_interpolated,
5160 .has_flexible_interp =
5161 consumer->info.stage == MESA_SHADER_FRAGMENT &&
5162 consumer->options->io_options &
5163 nir_io_has_flexible_input_interpolation_except_flat,
5164 .always_interpolate_convergent_fs_inputs =
5165 consumer->info.stage == MESA_SHADER_FRAGMENT &&
5166 consumer->options->io_options &
5167 nir_io_always_interpolate_convergent_fs_inputs,
5168 .producer_stage = producer->info.stage,
5169 .consumer_stage = consumer->info.stage,
5170 .producer_builder =
5171 nir_builder_create(nir_shader_get_entrypoint(producer)),
5172 .consumer_builder =
5173 nir_builder_create(nir_shader_get_entrypoint(consumer)),
5174
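/* Prefer the driver-provided cost callbacks when present; otherwise fall
 * back to the static cost limit and the default estimator defined above.
 */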
5175 .max_varying_expression_cost =
5176 producer->options->varying_expression_max_cost ?
5177 producer->options->varying_expression_max_cost(producer, consumer) :
5178 producer->options->max_varying_expression_cost,
5179 .varying_estimate_instr_cost =
5180 producer->options->varying_estimate_instr_cost ?
5181 producer->options->varying_estimate_instr_cost :
5182 default_varying_estimate_instr_cost,
5183
5184 .linear_mem_ctx = linear_context(ralloc_context(NULL)),
5185 };
5186
5187 for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) {
5188 list_inithead(&linkage->slot[i].producer.loads);
5189 list_inithead(&linkage->slot[i].producer.stores);
5190 list_inithead(&linkage->slot[i].consumer.loads);
5191 }
5192
5193 /* Preparation. */
5194 nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage);
5195 nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage);
5196 tidy_up_indirect_varyings(linkage);
5197 determine_uniform_movability(linkage, max_uniform_components);
5198 determine_ubo_movability(linkage, max_ubos_per_stage);
5199 /* This must always be done because it also cleans up bitmasks. */
5200 remove_dead_varyings(linkage, progress);
5201 }
5202
5203 static void
5204 free_linkage(struct linkage_info *linkage)
5205 {
5206 ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx));
5207 }
5208
5209 static void
5210 print_shader_linkage(nir_shader *producer, nir_shader *consumer)
5211 {
5212 struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
5213 nir_opt_varyings_progress progress = 0;
5214
5215 init_linkage(producer, consumer, false, 0, 0, linkage, &progress);
5216 print_linkage(linkage);
5217 free_linkage(linkage);
5218 FREE(linkage);
5219 }
5220
5221 /**
5222 * Run lots of optimizations on varyings. See the description at the beginning
5223 * of this file.
5224 */
5225 nir_opt_varyings_progress
5226 nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
5227 unsigned max_uniform_components, unsigned max_ubos_per_stage)
5228 {
5229 /* Task -> Mesh I/O uses payload variables and not varying slots,
5230 * so this pass can't do anything about it.
5231 */
5232 if (producer->info.stage == MESA_SHADER_TASK)
5233 return 0;
5234
5235 nir_opt_varyings_progress progress = 0;
5236 struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
5237 if (linkage == NULL)
5238 return 0;
5239
5240 /* Producers before a fragment shader must have up-to-date vertex
5241 * divergence information.
5242 */
5243 if (consumer->info.stage == MESA_SHADER_FRAGMENT) {
5244 nir_vertex_divergence_analysis(producer);
5245 }
5246
5247 /* This also removes dead varyings. */
5248 init_linkage(producer, consumer, spirv, max_uniform_components,
5249 max_ubos_per_stage, linkage, &progress);
5250
5251 /* Part 1: Run optimizations that only remove varyings (they can also
5252 * move instructions between shaders).
5253 */
5254 propagate_uniform_expressions(linkage, &progress);
5255
5256 /* Part 2: Deduplicate outputs. */
5257 deduplicate_outputs(linkage, &progress);
5258
5259 /* Run CSE on the consumer after output deduplication because duplicated
5260 * loads can prevent finding the post-dominator for inter-shader code
5261 * motion.
5262 */
5263 NIR_PASS(_, consumer, nir_opt_cse);
5264
5265 /* Re-gather linkage info after CSE. */
5266 free_linkage(linkage);
5267 init_linkage(producer, consumer, spirv, max_uniform_components,
5268 max_ubos_per_stage, linkage, &progress);
5269
5270 /* This must be done after deduplication and before inter-shader code
5271 * motion.
5272 */
5273 tidy_up_convergent_varyings(linkage);
5274 find_open_coded_tes_input_interpolation(linkage);
5275
5276 /* Part 3: Run optimizations that completely change varyings. */
5277 #if PRINT
5278 int i = 0;
5279 puts("Before:");
5280 nir_print_shader(linkage->producer_builder.shader, stdout);
5281 nir_print_shader(linkage->consumer_builder.shader, stdout);
5282 print_linkage(linkage);
5283 puts("");
5284 #endif
5285
5286 while (backward_inter_shader_code_motion(linkage, &progress)) {
5287 #if PRINT
5288 i++;
5289 printf("Finished: %i\n", i);
5290 nir_print_shader(linkage->producer_builder.shader, stdout);
5291 nir_print_shader(linkage->consumer_builder.shader, stdout);
5292 print_linkage(linkage);
5293 puts("");
5294 #endif
5295 }
5296
5297 /* Part 4: Do compaction. */
5298 compact_varyings(linkage, &progress);
5299
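/* Only control-flow metadata (block indices and dominance) is preserved for
 * a modified shader because the pass rewrites and moves instructions without
 * touching the CFG; all other metadata is invalidated for it.
 */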
5300 nir_metadata_preserve(linkage->producer_builder.impl,
5301 progress & nir_progress_producer ?
5302 (nir_metadata_control_flow) :
5303 nir_metadata_all);
5304 nir_metadata_preserve(linkage->consumer_builder.impl,
5305 progress & nir_progress_consumer ?
5306 (nir_metadata_control_flow) :
5307 nir_metadata_all);
5308 free_linkage(linkage);
5309 FREE(linkage);
5310
5311 /* Compaction moves CLIP_DIST and CULL_DIST outputs to VARn if the next
5312 * shader is not FS. Clear those fields in shader_info.
5313 */
5314 if (consumer->info.stage <= MESA_SHADER_GEOMETRY) {
5315 producer->info.clip_distance_array_size = 0;
5316 producer->info.cull_distance_array_size = 0;
5317 }
5318
5319 if (progress & nir_progress_producer)
5320 nir_validate_shader(producer, "nir_opt_varyings");
5321 if (progress & nir_progress_consumer)
5322 nir_validate_shader(consumer, "nir_opt_varyings");
5323
5324 return progress;
5325 }
5326